Merge branch 'master' into master_fixes
Conflicts: Makefile bwape.c bwase.c bwtsw2_aux.c stdaln.c
This commit is contained in:
commit
aabd990e8f
32
Makefile
32
Makefile
|
|
@ -4,8 +4,8 @@ CXXFLAGS= $(CFLAGS)
|
|||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
||||
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
|
||||
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
is.o bwtindex.o bwape.o kopen.o \
|
||||
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
is.o bwtindex.o bwape.o kopen.o pemerge.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa bwamem-lite
|
||||
|
|
@ -23,10 +23,10 @@ SUBDIRS= .
|
|||
all:$(PROG)
|
||||
|
||||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
bwamem-lite:libbwa.a example.o
|
||||
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
|
||||
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
libbwa.a:$(LOBJS)
|
||||
$(AR) -csru $@ $(LOBJS)
|
||||
|
|
@ -41,33 +41,33 @@ depend:
|
|||
|
||||
QSufSort.o: QSufSort.h
|
||||
bamlite.o: utils.h bamlite.h
|
||||
bntseq.o: bntseq.h main.h utils.h kseq.h
|
||||
bntseq.o: bntseq.h utils.h kseq.h
|
||||
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h
|
||||
bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
|
||||
bwamem.o: ksort.h kbtree.h
|
||||
bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h
|
||||
bwape.o: bwtaln.h bwt.h stdaln.h kvec.h bntseq.h utils.h bwase.h bwa.h
|
||||
bwape.o: khash.h
|
||||
bwase.o: stdaln.h bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h
|
||||
bwaseqio.o: bwtaln.h bwt.h stdaln.h utils.h bamlite.h kseq.h
|
||||
bwape.o: bwtaln.h bwt.h kvec.h bntseq.h utils.h bwase.h bwa.h ksw.h khash.h
|
||||
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h ksw.h
|
||||
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h kseq.h
|
||||
bwt.o: utils.h bwt.h kvec.h
|
||||
bwt_gen.o: QSufSort.h utils.h
|
||||
bwt_lite.o: bwt_lite.h utils.h
|
||||
bwtaln.o: bwtaln.h bwt.h stdaln.h bwtgap.h utils.h bwa.h bntseq.h
|
||||
bwtgap.o: bwtgap.h bwt.h bwtaln.h stdaln.h utils.h
|
||||
bwtindex.o: bntseq.h bwt.h main.h utils.h
|
||||
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h stdaln.h kstring.h
|
||||
bwtsw2_aux.o: bwa.h kseq.h ksort.h
|
||||
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h
|
||||
bwtgap.o: bwtgap.h bwt.h bwtaln.h utils.h
|
||||
bwtindex.o: bntseq.h bwt.h utils.h
|
||||
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h bwa.h
|
||||
bwtsw2_aux.o: ksw.h kseq.h ksort.h
|
||||
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h utils.h ksort.h
|
||||
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h
|
||||
bwtsw2_core.o: ksort.h
|
||||
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
||||
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h
|
||||
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h utils.h
|
||||
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h
|
||||
is.o: utils.h
|
||||
kopen.o: utils.h
|
||||
kstring.o: kstring.h utils.h
|
||||
ksw.o: ksw.h utils.h
|
||||
main.o: main.h utils.h
|
||||
stdaln.o: stdaln.h utils.h
|
||||
main.o: utils.h
|
||||
pemerge.o: ksw.h kseq.h utils.h kstring.h bwa.h bntseq.h bwt.h
|
||||
utils.o: utils.h ksort.h kseq.h
|
||||
|
|
|
|||
1
bntseq.c
1
bntseq.c
|
|
@ -32,7 +32,6 @@
|
|||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include "bntseq.h"
|
||||
#include "main.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include "kseq.h"
|
||||
|
|
|
|||
24
bwa.1
24
bwa.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
|
||||
.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
bwa - Burrows-Wheeler Alignment Tool
|
||||
|
|
@ -81,6 +81,8 @@ genome.
|
|||
.IR minSeedLen ]
|
||||
.RB [ -w
|
||||
.IR bandWidth ]
|
||||
.RB [ -d
|
||||
.IR zDropoff ]
|
||||
.RB [ -r
|
||||
.IR seedSplitRatio ]
|
||||
.RB [ -c
|
||||
|
|
@ -163,6 +165,21 @@ Band width. Essentially, gaps longer than
|
|||
will not be found. Note that the maximum gap length is also affected by the
|
||||
scoring matrix and the hit length, not solely determined by this option. [100]
|
||||
.TP
|
||||
.BI -d \ INT
|
||||
Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between
|
||||
the best and the current extension score is above
|
||||
.RI | i - j |* A + INT ,
|
||||
where
|
||||
.I i
|
||||
and
|
||||
.I j
|
||||
are the current positions of the query and reference, respectively, and
|
||||
.I A
|
||||
is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it
|
||||
doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not
|
||||
only avoids unnecessary extension, but also reduces poor alignments inside a
|
||||
long good alignment. [100]
|
||||
.TP
|
||||
.BI -r \ FLOAT
|
||||
Trigger re-seeding for a MEM longer than
|
||||
.IR minSeedLen * FLOAT .
|
||||
|
|
@ -215,6 +232,11 @@ and will be converted to a TAB in the output SAM. The read group ID will be
|
|||
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
|
||||
[null]
|
||||
.TP
|
||||
.BI -T \ INT
|
||||
Don't output alignment with score lower than
|
||||
.IR INT .
|
||||
This option only affects output. [30]
|
||||
.TP
|
||||
.B -a
|
||||
Output all found alignments for single-end or unpaired paired-end reads. These
|
||||
alignments will be flagged as secondary alignments.
|
||||
|
|
|
|||
13
bwa.c
13
bwa.c
|
|
@ -55,7 +55,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|||
kseq2bseq1(ks2, &seqs[n]);
|
||||
size += seqs[n++].l_seq;
|
||||
}
|
||||
if (size >= chunk_size) break;
|
||||
if (size >= chunk_size && (n&1) == 0) break;
|
||||
}
|
||||
if (size == 0) { // test if the 2nd file is finished
|
||||
if (ks2 && kseq_read(ks2) >= 0)
|
||||
|
|
@ -69,6 +69,17 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|||
* CIGAR related *
|
||||
*****************/
|
||||
|
||||
void bwa_fill_scmat(int a, int b, int8_t mat[25])
|
||||
{
|
||||
int i, j, k;
|
||||
for (i = k = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
mat[k++] = i == j? a : -b;
|
||||
mat[k++] = 0; // ambiguous base
|
||||
}
|
||||
for (j = 0; j < 5; ++j) mat[k++] = 0;
|
||||
}
|
||||
|
||||
// Generate CIGAR when the alignment end points are known
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
|
||||
{
|
||||
|
|
|
|||
1
bwa.h
1
bwa.h
|
|
@ -30,6 +30,7 @@ extern "C" {
|
|||
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||
|
||||
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
|
||||
|
||||
|
|
|
|||
17
bwamem.c
17
bwamem.c
|
|
@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init()
|
|||
o = xcalloc(1, sizeof(mem_opt_t));
|
||||
o->flag = 0;
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->T = 30;
|
||||
o->zdrop = 100;
|
||||
o->pen_unpaired = 9;
|
||||
o->pen_clip = 5;
|
||||
|
|
@ -58,21 +59,10 @@ mem_opt_t *mem_opt_init()
|
|||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->max_matesw = 100;
|
||||
mem_fill_scmat(o->a, o->b, o->mat);
|
||||
bwa_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
||||
void mem_fill_scmat(int a, int b, int8_t mat[25])
|
||||
{
|
||||
int i, j, k;
|
||||
for (i = k = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
mat[k++] = i == j? a : -b;
|
||||
mat[k++] = 0; // ambiguous base
|
||||
}
|
||||
for (j = 0; j < 5; ++j) mat[k++] = 0;
|
||||
}
|
||||
|
||||
/***************************
|
||||
* SMEM iterator interface *
|
||||
***************************/
|
||||
|
|
@ -753,11 +743,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
|
|||
int k;
|
||||
kstring_t str;
|
||||
str.l = str.m = 0; str.s = 0;
|
||||
if (a->n > 0) {
|
||||
if (a->n > 0 && a->a[0].score >= opt->T) {
|
||||
int mapq0 = -1;
|
||||
for (k = 0; k < a->n; ++k) {
|
||||
bwahit_t h;
|
||||
mem_alnreg_t *p = &a->a[k];
|
||||
if (p->score < opt->T) continue;
|
||||
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
|
||||
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
|
||||
mem_alnreg2hit(p, &h);
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -24,6 +24,7 @@ typedef struct {
|
|||
int w; // band width
|
||||
int zdrop; // Z-dropoff
|
||||
|
||||
int T; // output score threshold; only affecting output
|
||||
int flag; // see MEM_F_* macros
|
||||
int min_seed_len; // minimum seed length
|
||||
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
|
||||
|
|
|
|||
|
|
@ -303,7 +303,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
|
||||
no_pairing:
|
||||
for (i = 0; i < 2; ++i) {
|
||||
if (a[i].n) {
|
||||
if (a[i].n && a[i].a[0].score >= opt->T) {
|
||||
mem_alnreg2hit(&a[i].a[0], &h[i]);
|
||||
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
|
||||
} else h[i].rb = h[i].re = -1;
|
||||
|
|
|
|||
41
bwape.c
41
bwape.c
|
|
@ -8,9 +8,9 @@
|
|||
#include "kvec.h"
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "stdaln.h"
|
||||
#include "bwase.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
||||
typedef struct {
|
||||
int n;
|
||||
|
|
@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
|
|||
#define SW_MIN_MAPQ 17
|
||||
|
||||
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
|
||||
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
|
||||
int *n_cigar, uint32_t *_cnt)
|
||||
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
|
||||
{
|
||||
kswr_t r;
|
||||
uint32_t *cigar32 = 0;
|
||||
bwa_cigar_t *cigar = 0;
|
||||
ubyte_t *ref_seq;
|
||||
bwtint_t k, x, y, l;
|
||||
int path_len, ret, subo;
|
||||
AlnParam ap = aln_param_bwa;
|
||||
path_t *path, *p;
|
||||
int xtra;
|
||||
int8_t mat[25];
|
||||
|
||||
bwa_fill_scmat(1, 3, mat);
|
||||
// check whether there are too many N's
|
||||
if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
|
||||
for (k = 0, x = 0; k < len; ++k)
|
||||
|
|
@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
|
|||
ref_seq = (ubyte_t*)xcalloc(reglen, 1);
|
||||
for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
|
||||
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
||||
path = (path_t*)xcalloc(l+len, sizeof(path_t));
|
||||
|
||||
// do alignment
|
||||
ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo);
|
||||
if (ret < 0 || subo == ret) { // no hit or tandem hits
|
||||
free(path); free(cigar); free(ref_seq); *n_cigar = 0;
|
||||
xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0);
|
||||
r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0);
|
||||
ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32);
|
||||
cigar = (bwa_cigar_t*)cigar32;
|
||||
for (k = 0; k < *n_cigar; ++k)
|
||||
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
|
||||
|
||||
if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits
|
||||
free(cigar); free(ref_seq); *n_cigar = 0;
|
||||
return 0;
|
||||
}
|
||||
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
|
||||
|
||||
// check whether the alignment is good enough
|
||||
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
|
||||
|
|
@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
|
|||
else y += __cigar_len(c);
|
||||
}
|
||||
if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
|
||||
free(path); free(cigar); free(ref_seq);
|
||||
free(cigar); free(ref_seq);
|
||||
*n_cigar = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
{ // update cigar and coordinate;
|
||||
int start, end;
|
||||
p = path + path_len - 1;
|
||||
*beg += (p->i? p->i : 1) - 1;
|
||||
start = (p->j? p->j : 1) - 1;
|
||||
end = path->j;
|
||||
int start = r.qb, end = r.qe + 1;
|
||||
*beg += r.tb;
|
||||
cigar = (bwa_cigar_t*)xrealloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
|
||||
if (start) {
|
||||
memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
|
||||
|
|
@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
|
|||
{ // set *cnt
|
||||
int n_mm, n_gapo, n_gape;
|
||||
n_mm = n_gapo = n_gape = 0;
|
||||
p = path + path_len - 1;
|
||||
x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
|
||||
x = r.tb; y = r.qb;
|
||||
for (k = 0; k < *n_cigar; ++k) {
|
||||
bwa_cigar_t c = cigar[k];
|
||||
if (__cigar_op(c) == FROM_M) {
|
||||
|
|
@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
|
|||
*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
|
||||
}
|
||||
|
||||
free(ref_seq); free(path);
|
||||
free(ref_seq);
|
||||
return cigar;
|
||||
}
|
||||
|
||||
|
|
|
|||
100
bwase.c
100
bwase.c
|
|
@ -4,13 +4,13 @@
|
|||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include "stdaln.h"
|
||||
#include "bwase.h"
|
||||
#include "bwtaln.h"
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
||||
int g_log_n[256];
|
||||
|
||||
|
|
@ -156,57 +156,58 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se
|
|||
bwt_destroy(bwt);
|
||||
}
|
||||
|
||||
/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
|
||||
* forward strand. This happens when p->pos is calculated by
|
||||
* bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
|
||||
* coordinate. This happens only for color-converted alignment. */
|
||||
bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
|
||||
int ext, int *n_cigar, int is_end_correct)
|
||||
#define SW_BW 50
|
||||
|
||||
bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev)
|
||||
{
|
||||
bwa_cigar_t *cigar = 0;
|
||||
ubyte_t *ref_seq;
|
||||
int l = 0, path_len, ref_len;
|
||||
AlnParam ap = aln_param_bwa;
|
||||
path_t *path;
|
||||
int64_t k, __pos = *_pos;
|
||||
uint32_t *cigar32 = 0;
|
||||
ubyte_t *rseq;
|
||||
int tle, qle, gtle, gscore;
|
||||
int64_t k, rb, re, rlen;
|
||||
int8_t mat[25];
|
||||
|
||||
ref_len = len + abs(ext);
|
||||
if (ext > 0) {
|
||||
ref_seq = (ubyte_t*)xcalloc(ref_len, 1);
|
||||
for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
|
||||
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
||||
} else {
|
||||
int64_t x = __pos + (is_end_correct? len : ref_len);
|
||||
ref_seq = (ubyte_t*)xcalloc(ref_len, 1);
|
||||
for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
|
||||
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
||||
}
|
||||
path = (path_t*)xcalloc(l+len, sizeof(path_t));
|
||||
|
||||
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
|
||||
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
|
||||
|
||||
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand
|
||||
for (l = k = 0; k < *n_cigar; ++k) {
|
||||
if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
|
||||
else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
|
||||
bwa_fill_scmat(1, 3, mat);
|
||||
if (!is_rev) { // forward strand, the end position is correct
|
||||
re = *_pos + len;
|
||||
if (re > l_pac) re = l_pac;
|
||||
rb = re - (len + SW_BW);
|
||||
if (rb < 0) rb = 0;
|
||||
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
|
||||
seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences
|
||||
seq_reverse(rlen, rseq, 0);
|
||||
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0);
|
||||
if (gscore > 0) tle = gtle, qle = len;
|
||||
rb = re - tle; rlen = tle;
|
||||
seq_reverse(len, seq, 0);
|
||||
seq_reverse(rlen, rseq, 0);
|
||||
ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32);
|
||||
if (qle < len) { // write soft clip
|
||||
cigar = xrealloc(cigar, (*n_cigar + 1) * 4);
|
||||
memmove(cigar + 1, cigar, *n_cigar * 4);
|
||||
cigar[0] = (len - qle)<<4 | FROM_S;
|
||||
++(*n_cigar);
|
||||
}
|
||||
} else { // reverse strand, the start position is correct
|
||||
rb = *_pos; re = rb + len + SW_BW;
|
||||
if (re > l_pac) re = l_pac;
|
||||
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
|
||||
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0);
|
||||
if (gscore > 0) tle = gtle, qle = len;
|
||||
re = rb + tle; rlen = tle;
|
||||
ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension
|
||||
if (qle < len) {
|
||||
cigar = xrealloc(cigar, (*n_cigar + 1) * 4);
|
||||
cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S;
|
||||
++(*n_cigar);
|
||||
}
|
||||
__pos += l;
|
||||
}
|
||||
*_pos = rb;
|
||||
|
||||
if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
|
||||
__pos += __cigar_len(cigar[0]);
|
||||
for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
|
||||
--(*n_cigar);
|
||||
}
|
||||
if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
|
||||
|
||||
// change "I" at either end of the read to S. just in case. This should rarely happen...
|
||||
if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
|
||||
if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
|
||||
|
||||
*_pos = (bwtint_t)__pos;
|
||||
free(ref_seq); free(path);
|
||||
cigar = (bwa_cigar_t*)cigar32;
|
||||
for (k = 0; k < *n_cigar; ++k)
|
||||
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
|
||||
free(rseq);
|
||||
return cigar;
|
||||
}
|
||||
|
||||
|
|
@ -314,13 +315,11 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
|
|||
bwt_multi1_t *q = s->multi + j;
|
||||
int n_cigar;
|
||||
if (q->gap == 0) continue;
|
||||
q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
|
||||
(q->strand? 1 : -1) * q->gap, &n_cigar, 1);
|
||||
q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand);
|
||||
q->n_cigar = n_cigar;
|
||||
}
|
||||
if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
|
||||
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
|
||||
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand);
|
||||
}
|
||||
// generate MD tag
|
||||
str = (kstring_t*)xcalloc(1, sizeof(kstring_t));
|
||||
|
|
@ -606,5 +605,6 @@ int bwa_sai2sam_se(int argc, char *argv[])
|
|||
return 0;
|
||||
}
|
||||
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
|
||||
free(prefix);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
15
bwtaln.c
15
bwtaln.c
|
|
@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[])
|
|||
free(opt); free(prefix);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
|
||||
__cigar_op and __cigar_len while keeping stdaln stand alone */
|
||||
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
|
||||
{
|
||||
uint32_t *cigar32;
|
||||
bwa_cigar_t *cigar;
|
||||
int i;
|
||||
cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
|
||||
cigar = (bwa_cigar_t*)cigar32;
|
||||
for (i = 0; i < *n_cigar; ++i)
|
||||
cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
|
||||
return cigar;
|
||||
}
|
||||
|
||||
|
|
|
|||
12
bwtaln.h
12
bwtaln.h
|
|
@ -28,6 +28,11 @@
|
|||
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
|
||||
#endif
|
||||
|
||||
#define FROM_M 0
|
||||
#define FROM_I 1
|
||||
#define FROM_D 2
|
||||
#define FROM_S 3
|
||||
|
||||
typedef struct {
|
||||
bwtint_t w;
|
||||
int bid;
|
||||
|
|
@ -138,13 +143,6 @@ extern "C" {
|
|||
|
||||
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
|
||||
|
||||
|
||||
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
|
||||
__cigar_op and __cigar_len while keeping stdaln stand alone */
|
||||
#include "stdaln.h"
|
||||
|
||||
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@
|
|||
#include <zlib.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
#include "main.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef _DIVBWT
|
||||
|
|
|
|||
94
bwtsw2_aux.c
94
bwtsw2_aux.c
|
|
@ -11,9 +11,9 @@
|
|||
#include "bwt_lite.h"
|
||||
#include "utils.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "stdaln.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
|
@ -94,13 +94,12 @@ bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
|
|||
|
||||
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
|
||||
{
|
||||
int i, matrix[25];
|
||||
int i;
|
||||
bwtint_t k;
|
||||
uint8_t *target = 0, *query;
|
||||
AlnParam par;
|
||||
int8_t mat[25];
|
||||
|
||||
par.matrix = matrix;
|
||||
__gen_ap(par, opt);
|
||||
bwa_fill_scmat(opt->a, opt->b, mat);
|
||||
query = xcalloc(lq, 1);
|
||||
// sort according to the descending order of query end
|
||||
ks_introsort(hit, b->n, b->hits);
|
||||
|
|
@ -111,8 +110,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
|
|||
for (i = 0; i < b->n; ++i) {
|
||||
bsw2hit_t *p = b->hits + i;
|
||||
int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
|
||||
int score, j;
|
||||
path_t path;
|
||||
int score, j, qle, tle;
|
||||
p->n_seeds = 1;
|
||||
if (p->l || p->k == 0) continue;
|
||||
for (j = score = 0; j < i; ++j) {
|
||||
|
|
@ -127,12 +125,12 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
|
|||
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
|
||||
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
||||
lt = j;
|
||||
score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem);
|
||||
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0);
|
||||
if (score > p->G) { // extensible
|
||||
p->G = score;
|
||||
p->len += path.i;
|
||||
p->beg -= path.j;
|
||||
p->k -= path.i;
|
||||
p->k -= tle;
|
||||
p->len += tle;
|
||||
p->beg -= qle;
|
||||
}
|
||||
}
|
||||
free(query); free(target);
|
||||
|
|
@ -140,62 +138,49 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
|
|||
|
||||
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
|
||||
{
|
||||
int i, matrix[25];
|
||||
int i;
|
||||
bwtint_t k;
|
||||
uint8_t *target;
|
||||
AlnParam par;
|
||||
|
||||
par.matrix = matrix;
|
||||
__gen_ap(par, opt);
|
||||
int8_t mat[25];
|
||||
|
||||
bwa_fill_scmat(opt->a, opt->b, mat);
|
||||
target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
|
||||
for (i = 0; i < b->n; ++i) {
|
||||
bsw2hit_t *p = b->hits + i;
|
||||
int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
|
||||
int j, score;
|
||||
path_t path;
|
||||
int j, score, qle, tle;
|
||||
if (p->l) continue;
|
||||
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
|
||||
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
||||
lt = j;
|
||||
score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem);
|
||||
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0);
|
||||
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
|
||||
if (score >= p->G) {
|
||||
p->G = score;
|
||||
p->len = path.i;
|
||||
p->end = path.j + p->beg;
|
||||
p->len = tle;
|
||||
p->end = p->beg + qle;
|
||||
}
|
||||
}
|
||||
free(target);
|
||||
}
|
||||
|
||||
/* generate CIGAR array(s) in b->cigar[] */
|
||||
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
|
||||
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name)
|
||||
{
|
||||
uint8_t *target;
|
||||
int i, matrix[25];
|
||||
AlnParam par;
|
||||
path_t *path;
|
||||
int i;
|
||||
int8_t mat[25];
|
||||
|
||||
par.matrix = matrix;
|
||||
__gen_ap(par, opt);
|
||||
i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length
|
||||
target = xcalloc(i, 1);
|
||||
path = xcalloc(i + lq, sizeof(path_t));
|
||||
// generate CIGAR
|
||||
bwa_fill_scmat(opt->a, opt->b, mat);
|
||||
for (i = 0; i < b->n; ++i) {
|
||||
bsw2hit_t *p = b->hits + i;
|
||||
bsw2aux_t *q = b->aux + i;
|
||||
uint8_t *query;
|
||||
bwtint_t k;
|
||||
int path_len, beg, end;
|
||||
int beg, end, score;
|
||||
if (p->l) continue;
|
||||
beg = (p->flag & 0x10)? lq - p->end : p->beg;
|
||||
end = (p->flag & 0x10)? lq - p->beg : p->end;
|
||||
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
|
||||
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
|
||||
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
||||
aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
|
||||
q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
|
||||
q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm);
|
||||
#if 0
|
||||
if (name && score != p->G) { // debugging only
|
||||
int j, glen = 0;
|
||||
|
|
@ -206,7 +191,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
|
|||
__func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw);
|
||||
}
|
||||
#endif
|
||||
if (beg != 0 || end < lq) { // write soft clipping
|
||||
if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping
|
||||
q->cigar = xrealloc(q->cigar, 4 * (q->n_cigar + 2));
|
||||
if (beg != 0) {
|
||||
memmove(q->cigar + 1, q->cigar, q->n_cigar * 4);
|
||||
|
|
@ -219,7 +204,6 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
|
|||
}
|
||||
}
|
||||
}
|
||||
free(target); free(path);
|
||||
}
|
||||
|
||||
/* this is for the debugging purpose only */
|
||||
|
|
@ -407,27 +391,6 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c
|
|||
return n_cigar;
|
||||
}
|
||||
|
||||
static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq)
|
||||
{
|
||||
int k, x, n_mm = 0, i, n_gap = 0;
|
||||
bwtint_t y;
|
||||
x = 0; y = p->k;
|
||||
for (k = 0; k < n_cigar; ++k) {
|
||||
int op = cigar[k]&0xf;
|
||||
int len = cigar[k]>>4;
|
||||
if (op == 0) { // match
|
||||
for (i = 0; i < len; ++i) {
|
||||
int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3;
|
||||
if (seq[x + i] != ref) ++n_mm;
|
||||
}
|
||||
x += len; y += len;
|
||||
} else if (op == 1) x += len, n_gap += len;
|
||||
else if (op == 2) y += len, n_gap += len;
|
||||
else if (op == 4) x += len;
|
||||
}
|
||||
return n_mm + n_gap;
|
||||
}
|
||||
|
||||
static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
|
||||
{
|
||||
int i;
|
||||
|
|
@ -439,7 +402,7 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8
|
|||
}
|
||||
b->aux = xcalloc(b->n, sizeof(bsw2aux_t));
|
||||
// generate CIGAR
|
||||
gen_cigar(opt, qlen, seq, pac, b, name);
|
||||
gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name);
|
||||
// fix CIGAR, generate mapQ, and write chromosomal position
|
||||
for (i = 0; i < b->n; ++i) {
|
||||
bsw2hit_t *p = &b->hits[i];
|
||||
|
|
@ -451,8 +414,6 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8
|
|||
int subo;
|
||||
// fix out-of-boundary CIGAR
|
||||
q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar);
|
||||
// compute the NM tag
|
||||
q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]);
|
||||
// compute mapQ
|
||||
subo = p->G2 > opt->t? p->G2 : opt->t;
|
||||
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
|
||||
|
|
@ -527,9 +488,10 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
|
|||
bsw2aux_t *q = b->aux + i;
|
||||
int j, beg, end, type = 0;
|
||||
// print mandatory fields before SEQ
|
||||
if (q->cigar == 0) q->flag |= 0x4;
|
||||
ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0));
|
||||
ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1);
|
||||
if (p->l == 0) { // not a repetitive hit
|
||||
if (p->l == 0 && q->cigar) { // not a repetitive hit
|
||||
ksprintf(&str, "\t%d\t", q->pqual);
|
||||
for (k = 0; k < q->n_cigar; ++k)
|
||||
ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]);
|
||||
|
|
@ -538,7 +500,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
|
|||
else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize);
|
||||
// get the sequence begin and end
|
||||
beg = 0; end = ks->l;
|
||||
if (opt->hard_clip) {
|
||||
if (opt->hard_clip && q->cigar) {
|
||||
if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4;
|
||||
if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4;
|
||||
}
|
||||
|
|
|
|||
13
fastmap.c
13
fastmap.c
|
|
@ -27,13 +27,14 @@ int main_mem(int argc, char *argv[])
|
|||
void *ko = 0, *ko2 = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 'w') opt->w = atoi(optarg);
|
||||
else if (c == 'A') opt->a = atoi(optarg);
|
||||
else if (c == 'B') opt->b = atoi(optarg);
|
||||
else if (c == 'O') opt->q = atoi(optarg);
|
||||
else if (c == 'E') opt->r = atoi(optarg);
|
||||
else if (c == 'T') opt->T = atoi(optarg);
|
||||
else if (c == 'L') opt->pen_clip = atoi(optarg);
|
||||
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
|
|
@ -59,7 +60,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
|
|
@ -75,6 +76,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
||||
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
|
||||
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
|
||||
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
||||
fprintf(stderr, " -H hard clipping\n");
|
||||
|
|
@ -85,7 +87,7 @@ int main_mem(int argc, char *argv[])
|
|||
return 1;
|
||||
}
|
||||
|
||||
mem_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
bwa_print_sam_hdr(idx->bns, rg_line);
|
||||
|
||||
|
|
@ -105,6 +107,11 @@ int main_mem(int argc, char *argv[])
|
|||
}
|
||||
while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
|
||||
int64_t size = 0;
|
||||
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
|
||||
n = n>>1<<1;
|
||||
}
|
||||
if (!copy_comment)
|
||||
for (i = 0; i < n; ++i) {
|
||||
free(seqs[i].comment); seqs[i].comment = 0;
|
||||
|
|
|
|||
2
ksw.c
2
ksw.c
|
|
@ -427,7 +427,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
max_ie = gscore > h1? max_ie : i;
|
||||
gscore = gscore > h1? gscore : h1;
|
||||
}
|
||||
if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff
|
||||
if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff
|
||||
if (m > max) {
|
||||
max = m, max_i = i, max_j = mj;
|
||||
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
|
||||
|
|
|
|||
27
main.c
27
main.c
|
|
@ -1,12 +1,29 @@
|
|||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "main.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifndef PACKAGE_VERSION
|
||||
#define PACKAGE_VERSION "0.7.0-r324-beta"
|
||||
#define PACKAGE_VERSION "0.7.0-r341-beta"
|
||||
#endif
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
int bwa_pac2bwt(int argc, char *argv[]);
|
||||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
int bwa_index(int argc, char *argv[]);
|
||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||
|
||||
int bwa_aln(int argc, char *argv[]);
|
||||
int bwa_sai2sam_se(int argc, char *argv[]);
|
||||
int bwa_sai2sam_pe(int argc, char *argv[]);
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[]);
|
||||
|
||||
int main_fastmap(int argc, char *argv[]);
|
||||
int main_mem(int argc, char *argv[]);
|
||||
|
||||
int main_pemerge(int argc, char *argv[]);
|
||||
|
||||
static int usage()
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
|
@ -15,12 +32,13 @@ static int usage()
|
|||
fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
|
||||
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
|
||||
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
|
||||
fprintf(stderr, " aln gapped/ungapped alignment\n");
|
||||
fprintf(stderr, " samse generate alignment (single ended)\n");
|
||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
|
|
@ -56,6 +74,7 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
||||
else {
|
||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||
return 1;
|
||||
|
|
|
|||
28
main.h
28
main.h
|
|
@ -1,28 +0,0 @@
|
|||
#ifndef BWA_MAIN_H
|
||||
#define BWA_MAIN_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
int bwa_pac2bwt(int argc, char *argv[]);
|
||||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
int bwa_index(int argc, char *argv[]);
|
||||
int bwa_aln(int argc, char *argv[]);
|
||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||
|
||||
int bwa_sai2sam_se(int argc, char *argv[]);
|
||||
int bwa_sai2sam_pe(int argc, char *argv[]);
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[]);
|
||||
|
||||
int main_fastmap(int argc, char *argv[]);
|
||||
int main_mem(int argc, char *argv[]);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,286 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <pthread.h>
|
||||
#include <errno.h>
|
||||
#include "ksw.h"
|
||||
#include "kseq.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
#include "utils.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
#define MAX_SCORE_RATIO 0.9f
|
||||
#define MAX_ERR 8
|
||||
|
||||
static const char *err_msg[MAX_ERR+1] = {
|
||||
"successful merges",
|
||||
"low-scoring pairs",
|
||||
"pairs where the best SW alignment is not an overlap (long left end)",
|
||||
"pairs where the best SW alignment is not an overlap (long right end)",
|
||||
"pairs with large 2nd best SW score",
|
||||
"pairs with gapped overlap",
|
||||
"pairs where the end-to-end alignment is inconsistent with SW",
|
||||
"pairs potentially with tandem overlaps",
|
||||
"pairs with high sum of errors"
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
int a, b, q, r, w;
|
||||
int q_def, q_thres;
|
||||
int T;
|
||||
int chunk_size;
|
||||
int n_threads;
|
||||
int flag; // bit 1: print merged; 2: print unmerged
|
||||
int8_t mat[25];
|
||||
} pem_opt_t;
|
||||
|
||||
pem_opt_t *pem_opt_init()
|
||||
{
|
||||
pem_opt_t *opt;
|
||||
opt = xcalloc(1, sizeof(pem_opt_t));
|
||||
opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20;
|
||||
opt->T = opt->a * 10;
|
||||
opt->q_def = 20;
|
||||
opt->q_thres = 70;
|
||||
opt->chunk_size = 10000000;
|
||||
opt->n_threads = 1;
|
||||
opt->flag = 3;
|
||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
return opt;
|
||||
}
|
||||
|
||||
int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2])
|
||||
{
|
||||
uint8_t *s[2], *q[2], *seq, *qual;
|
||||
int i, xtra, l, l_seq, sum_q, ret = 0;
|
||||
kswr_t r;
|
||||
|
||||
s[0] = xmalloc(x[0].l_seq); q[0] = xmalloc(x[0].l_seq);
|
||||
s[1] = xmalloc(x[1].l_seq); q[1] = xmalloc(x[1].l_seq);
|
||||
for (i = 0; i < x[0].l_seq; ++i) {
|
||||
int c = x[0].seq[i];
|
||||
s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c];
|
||||
q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def;
|
||||
}
|
||||
for (i = 0; i < x[1].l_seq; ++i) {
|
||||
int c = x[1].seq[x[1].l_seq - 1 - i];
|
||||
c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c];
|
||||
s[1][i] = c < 4? 3 - c : 4;
|
||||
q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def;
|
||||
}
|
||||
|
||||
xtra = KSW_XSTART | KSW_XSUBO;
|
||||
r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0);
|
||||
++r.qe; ++r.te; // change to the half-close-half-open coordinates
|
||||
|
||||
if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment
|
||||
if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end
|
||||
if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end
|
||||
if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large
|
||||
if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps
|
||||
|
||||
{ // test tandem match; O(n^2)
|
||||
int max_m, max_m2, min_l, max_l, max_l2;
|
||||
max_m = max_m2 = 0; max_l = max_l2 = 0;
|
||||
min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq;
|
||||
for (l = 1; l < min_l; ++l) {
|
||||
int m = 0, o = x[0].l_seq - l;
|
||||
uint8_t *s0o = &s[0][o], *s1 = s[1];
|
||||
for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck!
|
||||
m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i]
|
||||
if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l;
|
||||
else if (m > max_m2) max_m2 = m, max_l2 = l;
|
||||
}
|
||||
if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; }
|
||||
if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) {
|
||||
ret = -7; goto pem_ret;
|
||||
}
|
||||
if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; }
|
||||
}
|
||||
|
||||
l = x[0].l_seq - (r.tb - r.qb); // length to merge
|
||||
l_seq = x[0].l_seq + x[1].l_seq - l;
|
||||
seq = xmalloc(l_seq + 1);
|
||||
qual = xmalloc(l_seq + 1);
|
||||
memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l);
|
||||
memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l);
|
||||
for (i = 0, sum_q = 0; i < l; ++i) {
|
||||
int k = x[0].l_seq - l + i;
|
||||
if (s[0][k] == 4) { // ambiguous
|
||||
seq[k] = s[1][i];
|
||||
qual[k] = q[1][i];
|
||||
} else if (s[1][i] == 4) { // do nothing
|
||||
} else if (s[0][k] == s[1][i]) {
|
||||
qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i];
|
||||
} else { // s[0][k] != s[1][i] and neither is N
|
||||
int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i];
|
||||
sum_q += qq >= 3? qq<<1 : 1;
|
||||
seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i];
|
||||
qual[k] = abs((int)q[0][k] - (int)q[1][i]);
|
||||
}
|
||||
}
|
||||
if (sum_q>>1 > opt->q_thres) { // too many mismatches
|
||||
free(seq); free(qual);
|
||||
ret = -8; goto pem_ret;
|
||||
}
|
||||
|
||||
for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33;
|
||||
seq[l_seq] = qual[l_seq] = 0;
|
||||
|
||||
free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment);
|
||||
memset(&x[1], 0, sizeof(bseq1_t));
|
||||
free(x[0].seq); free(x[0].qual);
|
||||
x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual;
|
||||
|
||||
pem_ret:
|
||||
free(s[0]); free(s[1]); free(q[0]); free(q[1]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void print_bseq(const bseq1_t *s, int rn)
|
||||
{
|
||||
err_putchar(s->qual? '@' : '>');
|
||||
err_fputs(s->name, stdout);
|
||||
if (rn == 1 || rn == 2) {
|
||||
err_putchar('/'); err_putchar('0' + rn); err_putchar('\n');
|
||||
} else err_puts(" merged");
|
||||
err_puts(s->seq);
|
||||
if (s->qual) {
|
||||
err_puts("+"); err_puts(s->qual);
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
int n, start;
|
||||
bseq1_t *seqs;
|
||||
int64_t cnt[MAX_ERR+1];
|
||||
const pem_opt_t *opt;
|
||||
} worker_t;
|
||||
|
||||
void *worker(void *data)
|
||||
{
|
||||
worker_t *w = (worker_t*)data;
|
||||
int i;
|
||||
for (i = w->start; i < w->n>>1; i += w->opt->n_threads)
|
||||
++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])];
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1])
|
||||
{
|
||||
int i, j, n = n_>>1<<1;
|
||||
worker_t *w;
|
||||
|
||||
w = xcalloc(opt->n_threads, sizeof(worker_t));
|
||||
for (i = 0; i < opt->n_threads; ++i) {
|
||||
worker_t *p = &w[i];
|
||||
p->start = i; p->n = n;
|
||||
p->opt = opt;
|
||||
p->seqs = seqs;
|
||||
}
|
||||
if (opt->n_threads == 1) {
|
||||
worker(w);
|
||||
} else {
|
||||
pthread_t *tid;
|
||||
tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t));
|
||||
for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]);
|
||||
for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
|
||||
free(tid);
|
||||
}
|
||||
for (i = 0; i < opt->n_threads; ++i) {
|
||||
worker_t *p = &w[i];
|
||||
for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j];
|
||||
}
|
||||
free(w);
|
||||
for (i = 0; i < n>>1; ++i) {
|
||||
if (seqs[i<<1|1].l_seq != 0) {
|
||||
if (opt->flag&2) {
|
||||
print_bseq(&seqs[i<<1|0], 1);
|
||||
print_bseq(&seqs[i<<1|1], 2);
|
||||
}
|
||||
} else if (opt->flag&1)
|
||||
print_bseq(&seqs[i<<1|0], 0);
|
||||
}
|
||||
for (i = 0; i < n; ++i) {
|
||||
bseq1_t *s = &seqs[i];
|
||||
free(s->name); free(s->seq); free(s->qual); free(s->comment);
|
||||
}
|
||||
}
|
||||
|
||||
int main_pemerge(int argc, char *argv[])
|
||||
{
|
||||
int c, flag = 0, i, n, min_ovlp = 10;
|
||||
int64_t cnt[MAX_ERR+1];
|
||||
bseq1_t *bseq;
|
||||
gzFile fp, fp2 = 0;
|
||||
kseq_t *ks, *ks2 = 0;
|
||||
pem_opt_t *opt;
|
||||
|
||||
opt = pem_opt_init();
|
||||
while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) {
|
||||
if (c == 'm') flag |= 1;
|
||||
else if (c == 'u') flag |= 2;
|
||||
else if (c == 'Q') opt->q_thres = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg);
|
||||
else if (c == 'T') min_ovlp = atoi(optarg);
|
||||
}
|
||||
if (flag == 0) flag = 3;
|
||||
opt->flag = flag;
|
||||
opt->T = opt->a * min_ovlp;
|
||||
|
||||
if (optind == argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
|
||||
fprintf(stderr, "Options: -m output merged reads only\n");
|
||||
fprintf(stderr, " -u output unmerged reads only\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp);
|
||||
fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres);
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||
strcmp(argv[optind], "-") ? argv[optind] : "stdin",
|
||||
errno ? strerror(errno) : "Out of memory");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ks = kseq_init(fp);
|
||||
if (optind + 1 < argc) {
|
||||
fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Couldn't open %s : %s\n",
|
||||
strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin",
|
||||
errno ? strerror(errno) : "Out of memory");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
ks2 = kseq_init(fp2);
|
||||
}
|
||||
|
||||
memset(cnt, 0, 8 * (MAX_ERR+1));
|
||||
while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
|
||||
process_seqs(opt, n, bseq, cnt);
|
||||
free(bseq);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
|
||||
for (i = 1; i <= MAX_ERR; ++i)
|
||||
fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]);
|
||||
kseq_destroy(ks);
|
||||
err_gzclose(fp);
|
||||
if (ks2) {
|
||||
kseq_destroy(ks2);
|
||||
err_gzclose(fp2);
|
||||
}
|
||||
free(opt);
|
||||
|
||||
err_fflush(stdout);
|
||||
|
||||
return 0;
|
||||
}
|
||||
162
stdaln.h
162
stdaln.h
|
|
@ -1,162 +0,0 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2003-2006, 2008, by Heng Li <lh3lh3@gmail.com>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
2009-07-23, 0.10.0
|
||||
|
||||
- Use 32-bit to store CIGAR
|
||||
|
||||
- Report suboptimal aligments
|
||||
|
||||
- Implemented half-fixed-half-open DP
|
||||
|
||||
2009-04-26, 0.9.10
|
||||
|
||||
- Allow to set a threshold for local alignment
|
||||
|
||||
2009-02-18, 0.9.9
|
||||
|
||||
- Fixed a bug when no residue matches
|
||||
|
||||
2008-08-04, 0.9.8
|
||||
|
||||
- Fixed the wrong declaration of aln_stdaln_aux()
|
||||
|
||||
- Avoid 0 coordinate for global alignment
|
||||
|
||||
2008-08-01, 0.9.7
|
||||
|
||||
- Change gap_end penalty to 5 in aln_param_bwa
|
||||
|
||||
- Add function to convert path_t to the CIGAR format
|
||||
|
||||
2008-08-01, 0.9.6
|
||||
|
||||
- The first gap now costs (gap_open+gap_ext), instead of
|
||||
gap_open. Scoring systems are modified accordingly.
|
||||
|
||||
- Gap end is now correctly handled. Previously it is not correct.
|
||||
|
||||
- Change license to MIT.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef LH3_STDALN_H_
|
||||
#define LH3_STDALN_H_
|
||||
|
||||
|
||||
#define STDALN_VERSION 0.11.0
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define FROM_M 0
|
||||
#define FROM_I 1
|
||||
#define FROM_D 2
|
||||
#define FROM_S 3
|
||||
|
||||
#define ALN_TYPE_LOCAL 0
|
||||
#define ALN_TYPE_GLOBAL 1
|
||||
#define ALN_TYPE_EXTEND 2
|
||||
|
||||
/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */
|
||||
#define MINOR_INF -1073741823
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int gap_open;
|
||||
int gap_ext;
|
||||
int gap_end;
|
||||
|
||||
int *matrix;
|
||||
int row;
|
||||
int band_width;
|
||||
} AlnParam;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int i, j;
|
||||
unsigned char ctype;
|
||||
} path_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
path_t *path; /* for advanced users... :-) */
|
||||
int path_len; /* for advanced users... :-) */
|
||||
int start1, end1; /* start and end of the first sequence, coordinations are 1-based */
|
||||
int start2, end2; /* start and end of the second sequence, coordinations are 1-based */
|
||||
int score, subo; /* score */
|
||||
|
||||
char *out1, *out2; /* print them, and then you will know */
|
||||
char *outm;
|
||||
|
||||
int n_cigar;
|
||||
uint32_t *cigar32;
|
||||
} AlnAln;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap,
|
||||
int type, int do_align, int len1, int len2);
|
||||
AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align);
|
||||
void aln_free_AlnAln(AlnAln *aa);
|
||||
|
||||
int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
|
||||
path_t *path, int *path_len);
|
||||
int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
|
||||
path_t *path, int *path_len, int _thres, int *_subo);
|
||||
int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
|
||||
path_t *path, int *path_len, int G0, uint8_t *_mem);
|
||||
uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
|
||||
uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/********************
|
||||
* global variables *
|
||||
********************/
|
||||
|
||||
extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */
|
||||
extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */
|
||||
extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */
|
||||
extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */
|
||||
extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */
|
||||
|
||||
/* common nucleotide score matrix for 16 bases */
|
||||
extern int aln_sm_nt[], aln_sm_bwa[];
|
||||
|
||||
/* BLOSUM62 and BLOSUM45 */
|
||||
extern int aln_sm_blosum62[], aln_sm_blosum45[];
|
||||
|
||||
/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */
|
||||
extern int aln_sm_read[];
|
||||
|
||||
/* human-mouse score matrix for 4 bases */
|
||||
extern int aln_sm_hs[];
|
||||
|
||||
#endif
|
||||
Loading…
Reference in New Issue