r329: ditch stdaln.{c,h}; no changes to bwa-mem

stdaln.{c,h} was written ten years ago. Its local and SW extension code are
actually buggy (though that rarely happens and usually does not affect the
results too much). ksw.{c,h} is more concise, potentially faster, less buggy,
and richer in features.
This commit is contained in:
Heng Li 2013-03-05 12:00:24 -05:00
parent bb37e14d02
commit 98f8966750
8 changed files with 33 additions and 1281 deletions

View File

@ -4,7 +4,7 @@ CXXFLAGS= $(CFLAGS)
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
@ -48,8 +48,8 @@ fastmap.o:bwt.h bwamem.h
bwtaln.o:bwt.h bwtaln.h kseq.h
bwtgap.o:bwtgap.h bwtaln.h bwt.h
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h
bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h
bwtsw2_main.o:bwtsw2.h
clean:

41
bwape.c
View File

@ -8,9 +8,9 @@
#include "kvec.h"
#include "bntseq.h"
#include "utils.h"
#include "stdaln.h"
#include "bwase.h"
#include "bwa.h"
#include "ksw.h"
typedef struct {
int n;
@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
#define SW_MIN_MAPQ 17
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
int *n_cigar, uint32_t *_cnt)
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
{
kswr_t r;
uint32_t *cigar32 = 0;
bwa_cigar_t *cigar = 0;
ubyte_t *ref_seq;
bwtint_t k, x, y, l;
int path_len, ret, subo;
AlnParam ap = aln_param_bwa;
path_t *path, *p;
int xtra;
int8_t mat[25];
bwa_fill_scmat(1, 3, mat);
// check whether there are too many N's
if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
for (k = 0, x = 0; k < len; ++k)
@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
ref_seq = (ubyte_t*)calloc(reglen, 1);
for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
path = (path_t*)calloc(l+len, sizeof(path_t));
// do alignment
ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo);
if (ret < 0 || subo == ret) { // no hit or tandem hits
free(path); free(cigar); free(ref_seq); *n_cigar = 0;
xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0);
r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0);
ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32);
cigar = (bwa_cigar_t*)cigar32;
for (k = 0; k < *n_cigar; ++k)
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits
free(cigar); free(ref_seq); *n_cigar = 0;
return 0;
}
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
// check whether the alignment is good enough
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
else y += __cigar_len(c);
}
if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
free(path); free(cigar); free(ref_seq);
free(cigar); free(ref_seq);
*n_cigar = 0;
return 0;
}
{ // update cigar and coordinate;
int start, end;
p = path + path_len - 1;
*beg += (p->i? p->i : 1) - 1;
start = (p->j? p->j : 1) - 1;
end = path->j;
int start = r.qb, end = r.qe + 1;
*beg += r.tb;
cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
if (start) {
memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
{ // set *cnt
int n_mm, n_gapo, n_gape;
n_mm = n_gapo = n_gape = 0;
p = path + path_len - 1;
x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
x = r.tb; y = r.qb;
for (k = 0; k < *n_cigar; ++k) {
bwa_cigar_t c = cigar[k];
if (__cigar_op(c) == FROM_M) {
@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
}
free(ref_seq); free(path);
free(ref_seq);
return cigar;
}

View File

@ -4,7 +4,6 @@
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "stdaln.h"
#include "bwase.h"
#include "bwtaln.h"
#include "bntseq.h"
@ -205,8 +204,8 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l
if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
// change "I" at either end of the read to S. just in case. This should rarely happen...
if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1])));
if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0])));
*_pos = (bwtint_t)__pos;
free(ref_seq);
@ -589,5 +588,6 @@ int bwa_sai2sam_se(int argc, char *argv[])
return 0;
}
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
free(prefix);
return 0;
}

View File

@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[])
free(opt); free(prefix);
return 0;
}
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
__cigar_op and __cigar_len while keeping stdaln stand alone */
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
{
uint32_t *cigar32;
bwa_cigar_t *cigar;
int i;
cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
cigar = (bwa_cigar_t*)cigar32;
for (i = 0; i < *n_cigar; ++i)
cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
return cigar;
}

View File

@ -28,6 +28,11 @@
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
#endif
#define FROM_M 0
#define FROM_I 1
#define FROM_D 2
#define FROM_S 3
typedef struct {
bwtint_t w;
int bid;
@ -138,13 +143,6 @@ extern "C" {
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
__cigar_op and __cigar_len while keeping stdaln stand alone */
#include "stdaln.h"
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
#ifdef __cplusplus
}
#endif

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.0-r324-beta"
#define PACKAGE_VERSION "0.7.0-r329-beta"
#endif
static int usage()

1070
stdaln.c

File diff suppressed because it is too large Load Diff

162
stdaln.h
View File

@ -1,162 +0,0 @@
/* The MIT License
Copyright (c) 2003-2006, 2008, by Heng Li <lh3lh3@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
2009-07-23, 0.10.0
- Use 32-bit to store CIGAR
- Report suboptimal aligments
- Implemented half-fixed-half-open DP
2009-04-26, 0.9.10
- Allow to set a threshold for local alignment
2009-02-18, 0.9.9
- Fixed a bug when no residue matches
2008-08-04, 0.9.8
- Fixed the wrong declaration of aln_stdaln_aux()
- Avoid 0 coordinate for global alignment
2008-08-01, 0.9.7
- Change gap_end penalty to 5 in aln_param_bwa
- Add function to convert path_t to the CIGAR format
2008-08-01, 0.9.6
- The first gap now costs (gap_open+gap_ext), instead of
gap_open. Scoring systems are modified accordingly.
- Gap end is now correctly handled. Previously it is not correct.
- Change license to MIT.
*/
#ifndef LH3_STDALN_H_
#define LH3_STDALN_H_
#define STDALN_VERSION 0.11.0
#include <stdint.h>
#define FROM_M 0
#define FROM_I 1
#define FROM_D 2
#define FROM_S 3
#define ALN_TYPE_LOCAL 0
#define ALN_TYPE_GLOBAL 1
#define ALN_TYPE_EXTEND 2
/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */
#define MINOR_INF -1073741823
typedef struct
{
int gap_open;
int gap_ext;
int gap_end;
int *matrix;
int row;
int band_width;
} AlnParam;
typedef struct
{
int i, j;
unsigned char ctype;
} path_t;
typedef struct
{
path_t *path; /* for advanced users... :-) */
int path_len; /* for advanced users... :-) */
int start1, end1; /* start and end of the first sequence, coordinations are 1-based */
int start2, end2; /* start and end of the second sequence, coordinations are 1-based */
int score, subo; /* score */
char *out1, *out2; /* print them, and then you will know */
char *outm;
int n_cigar;
uint32_t *cigar32;
} AlnAln;
#ifdef __cplusplus
extern "C" {
#endif
AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap,
int type, int do_align, int len1, int len2);
AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align);
void aln_free_AlnAln(AlnAln *aa);
int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len);
int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len, int _thres, int *_subo);
int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len, int G0, uint8_t *_mem);
uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar);
#ifdef __cplusplus
}
#endif
/********************
* global variables *
********************/
extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */
extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */
extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */
extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */
extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */
/* common nucleotide score matrix for 16 bases */
extern int aln_sm_nt[], aln_sm_bwa[];
/* BLOSUM62 and BLOSUM45 */
extern int aln_sm_blosum62[], aln_sm_blosum45[];
/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */
extern int aln_sm_read[];
/* human-mouse score matrix for 4 bases */
extern int aln_sm_hs[];
#endif