removed color-space support
which has been broken since 0.6.x
This commit is contained in:
parent
e5ab59db53
commit
6ad5a3c086
3
Makefile
3
Makefile
|
|
@ -6,8 +6,7 @@ DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
|||
LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \
|
||||
bseq.o bwaseqio.o bwase.o kstring.o
|
||||
AOBJS= QSufSort.o bwt_gen.o \
|
||||
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
|
||||
bwape.o cs2nt.o \
|
||||
is.o bwtmisc.o bwtindex.o ksw.o bwape.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa
|
||||
|
|
|
|||
38
bwape.c
38
bwape.c
|
|
@ -212,19 +212,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
|
|||
last_pos[x.y&1][1] = x;
|
||||
}
|
||||
}
|
||||
} else if (opt->type == BWA_PET_SOLID) {
|
||||
for (i = 0; i < d->arr.n; ++i) {
|
||||
pair64_t x = d->arr.a[i];
|
||||
int strand = x.y>>1&1;
|
||||
if ((strand^x.y)&1) { // push
|
||||
int y = 1 - (x.y&1);
|
||||
__pairing_aux(last_pos[y][1], x);
|
||||
__pairing_aux(last_pos[y][0], x);
|
||||
} else { // check
|
||||
last_pos[x.y&1][0] = last_pos[x.y&1][1];
|
||||
last_pos[x.y&1][1] = x;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[paring] not implemented yet!\n");
|
||||
exit(1);
|
||||
|
|
@ -567,11 +554,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
|||
++n_tot[is_singleton];
|
||||
cigar[0] = cigar[1] = 0;
|
||||
n_cigar[0] = n_cigar[1] = 0;
|
||||
if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
|
||||
if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
|
||||
for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
|
||||
ubyte_t *seq;
|
||||
if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
|
||||
if (popt->type == BWA_PET_STD) {
|
||||
{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
|
||||
if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
|
||||
__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
|
||||
seq = p[k]->rseq;
|
||||
|
|
@ -580,17 +567,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
|||
seq = p[k]->seq;
|
||||
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
|
||||
}
|
||||
} else { // BWA_PET_SOLID
|
||||
if (p[1-k]->strand == 0) { // R3-F3 pairing
|
||||
if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
|
||||
else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
|
||||
seq = p[k]->rseq;
|
||||
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
|
||||
} else { // F3-R3 pairing
|
||||
if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
|
||||
else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
|
||||
seq = p[k]->seq;
|
||||
}
|
||||
}
|
||||
// perform SW alignment
|
||||
cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
|
||||
|
|
@ -654,7 +630,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
bwa_seq_t *seqs[2];
|
||||
bwa_seqio_t *ks[2];
|
||||
clock_t t;
|
||||
bntseq_t *bns, *ntbns = 0;
|
||||
bntseq_t *bns;
|
||||
FILE *fp_sa[2];
|
||||
gap_opt_t opt, opt0;
|
||||
khint_t iter;
|
||||
|
|
@ -679,10 +655,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
opt0 = opt;
|
||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
|
||||
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
|
||||
if (!(opt.mode & BWA_MODE_COMPREAD)) {
|
||||
popt->type = BWA_PET_SOLID;
|
||||
ntbns = bwa_open_nt(prefix);
|
||||
} else { // for Illumina alignment only
|
||||
{ // for Illumina alignment only
|
||||
if (popt->is_preload) {
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
|
|
@ -715,7 +688,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
|
||||
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
|
||||
for (j = 0; j < 2; ++j)
|
||||
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
|
||||
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
if (pac == 0) free(pacseq);
|
||||
|
||||
|
|
@ -740,7 +713,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
|
||||
// destroy
|
||||
bns_destroy(bns);
|
||||
if (ntbns) bns_destroy(ntbns);
|
||||
for (i = 0; i < 2; ++i) {
|
||||
bwa_seq_close(ks[i]);
|
||||
fclose(fp_sa[i]);
|
||||
|
|
|
|||
43
bwase.c
43
bwase.c
|
|
@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s)
|
|||
s->len = s->full_len;
|
||||
}
|
||||
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
|
||||
{
|
||||
ubyte_t *pacseq, *ntpac = 0;
|
||||
ubyte_t *pacseq;
|
||||
int i, j;
|
||||
kstring_t *str;
|
||||
|
||||
if (ntbns) { // in color space
|
||||
ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
|
||||
rewind(ntbns->fp_pac);
|
||||
fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
|
||||
}
|
||||
|
||||
if (!_pacseq) {
|
||||
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
|
||||
rewind(bns->fp_pac);
|
||||
|
|
@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
|
|||
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
|
||||
}
|
||||
#if 0
|
||||
if (ntbns) { // in color space
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *s = seqs + i;
|
||||
bwa_cs2nt_core(s, bns->l_pac, ntpac);
|
||||
for (j = 0; j < s->n_multi; ++j) {
|
||||
bwt_multi1_t *q = s->multi + j;
|
||||
int n_cigar;
|
||||
if (q->gap == 0) continue;
|
||||
free(q->cigar);
|
||||
q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
|
||||
(q->strand? 1 : -1) * q->gap, &n_cigar, 0);
|
||||
q->n_cigar = n_cigar;
|
||||
}
|
||||
if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
|
||||
free(s->cigar);
|
||||
s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// generate MD tag
|
||||
str = (kstring_t*)calloc(1, sizeof(kstring_t));
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
|
|
@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
|
|||
if (s->type != BWA_TYPE_NO_MATCH) {
|
||||
int nm;
|
||||
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
|
||||
bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
|
||||
bns->l_pac, pacseq, str, &nm);
|
||||
s->nm = nm;
|
||||
}
|
||||
}
|
||||
free(str->s); free(str);
|
||||
|
||||
// correct for trimmed reads
|
||||
if (!ntbns) // trimming is only enabled for Illumina reads
|
||||
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
|
||||
|
||||
if (!_pacseq) free(pacseq);
|
||||
free(ntpac);
|
||||
}
|
||||
|
||||
int64_t pos_end(const bwa_seq_t *p)
|
||||
|
|
@ -587,7 +557,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bntseq_t *bns, *ntbns = 0;
|
||||
bntseq_t *bns;
|
||||
FILE *fp_sa;
|
||||
gap_opt_t opt;
|
||||
|
||||
|
|
@ -599,8 +569,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
|
||||
m_aln = 0;
|
||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
||||
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
||||
ntbns = bwa_open_nt(prefix);
|
||||
bwa_print_sam_SQ(bns);
|
||||
//bwa_print_sam_PG();
|
||||
// set ks
|
||||
|
|
@ -628,7 +596,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
||||
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
|
||||
bwa_refine_gapped(bns, n_seqs, seqs, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
||||
|
|
@ -642,7 +610,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
|
||||
// destroy
|
||||
bwa_seq_close(ks);
|
||||
if (ntbns) bns_destroy(ntbns);
|
||||
bns_destroy(bns);
|
||||
fclose(fp_sa);
|
||||
free(aln);
|
||||
|
|
|
|||
2
bwase.h
2
bwase.h
|
|
@ -14,7 +14,7 @@ extern "C" {
|
|||
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
|
||||
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
|
||||
// Refine the approximate position of the sequence to an actual placement for the sequence.
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
|
||||
// Backfill certain alignment properties mainly centering around number of matches.
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
// Calculate the end position of a read given a certain sequence.
|
||||
|
|
|
|||
4
bwtaln.c
4
bwtaln.c
|
|
@ -252,7 +252,7 @@ int bwa_aln(int argc, char *argv[])
|
|||
char *prefix;
|
||||
|
||||
opt = gap_init_opt();
|
||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
||||
|
|
@ -272,7 +272,6 @@ int bwa_aln(int argc, char *argv[])
|
|||
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
|
||||
case 'R': opt->max_top2 = atoi(optarg); break;
|
||||
case 'q': opt->trim_qual = atoi(optarg); break;
|
||||
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
|
||||
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
|
||||
case 'f': xreopen(optarg, "wb", stdout); break;
|
||||
case 'b': opt->mode |= BWA_MODE_BAM; break;
|
||||
|
|
@ -310,7 +309,6 @@ int bwa_aln(int argc, char *argv[])
|
|||
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
||||
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
||||
fprintf(stderr, " -B INT length of barcode\n");
|
||||
// fprintf(stderr, " -c input sequences are in the color space\n");
|
||||
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
||||
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
||||
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
|
||||
|
|
|
|||
1
bwtaln.h
1
bwtaln.h
|
|
@ -107,7 +107,6 @@ typedef struct {
|
|||
} gap_opt_t;
|
||||
|
||||
#define BWA_PET_STD 1
|
||||
#define BWA_PET_SOLID 2
|
||||
|
||||
typedef struct {
|
||||
int max_isize, force_isize;
|
||||
|
|
|
|||
24
bwtindex.c
24
bwtindex.c
|
|
@ -42,11 +42,11 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
|||
int bwa_index(int argc, char *argv[])
|
||||
{
|
||||
char *prefix = 0, *str, *str2, *str3;
|
||||
int c, algo_type = 0, is_color = 0, is_64 = 0;
|
||||
int c, algo_type = 0, is_64 = 0;
|
||||
clock_t t;
|
||||
int64_t l_pac;
|
||||
|
||||
while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'a': // if -a is not set, algo_type will be determined later
|
||||
if (strcmp(optarg, "div") == 0) algo_type = 1;
|
||||
|
|
@ -55,7 +55,6 @@ int bwa_index(int argc, char *argv[])
|
|||
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
||||
break;
|
||||
case 'p': prefix = strdup(optarg); break;
|
||||
case 'c': is_color = 1; break;
|
||||
case '6': is_64 = 1; break;
|
||||
default: return 1;
|
||||
}
|
||||
|
|
@ -67,7 +66,6 @@ int bwa_index(int argc, char *argv[])
|
|||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
|
||||
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
||||
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
||||
// fprintf(stderr, " -c build color-space index\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
|
||||
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
|
||||
|
|
@ -83,29 +81,13 @@ int bwa_index(int argc, char *argv[])
|
|||
str2 = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
str3 = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
|
||||
if (is_color == 0) { // nucleotide indexing
|
||||
{ // nucleotide indexing
|
||||
gzFile fp = xzopen(argv[optind], "r");
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
gzclose(fp);
|
||||
} else { // color indexing
|
||||
gzFile fp = xzopen(argv[optind], "r");
|
||||
strcat(strcpy(str, prefix), ".nt");
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, str, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
gzclose(fp);
|
||||
{
|
||||
char *tmp_argv[3];
|
||||
tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
|
||||
bwa_pac2cspac(3, tmp_argv);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
}
|
||||
}
|
||||
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
|
||||
{
|
||||
|
|
|
|||
191
cs2nt.c
191
cs2nt.c
|
|
@ -1,191 +0,0 @@
|
|||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include "bwtaln.h"
|
||||
#include "stdaln.h"
|
||||
|
||||
/*
|
||||
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
|
||||
decode as ATTGAC(RBGOG), there are one color change and one nt change;
|
||||
if we decode as ATTAAC(RBRBG), there are two color changes.
|
||||
|
||||
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
|
||||
as the penalty; otherwise, we will use color quality as the
|
||||
penalty. This means we always prefer two consistent color changes over
|
||||
a nt change, but if a color has high quality, we may prefer one nt
|
||||
change.
|
||||
|
||||
In the above example, the penalties of the two types of decoding are
|
||||
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
|
||||
otherwise the second. Note that no matter what we choose, the fourth
|
||||
base will get a low nt quality.
|
||||
*/
|
||||
|
||||
#define COLOR_MM 19
|
||||
#define NUCL_MM 25
|
||||
|
||||
static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
|
||||
|
||||
/*
|
||||
{A,C,G,T,N} -> {0,1,2,3,4}
|
||||
nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
|
||||
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
|
||||
nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
|
||||
btarray[0..4*size]: backtrack array (working space)
|
||||
*/
|
||||
void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
|
||||
{
|
||||
int h[8], curr, last;
|
||||
int x, y, xmin, hmin, k;
|
||||
|
||||
// h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
|
||||
|
||||
// recursion: initial value
|
||||
if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
|
||||
else {
|
||||
for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
|
||||
h[nt_ref[0]] = 0;
|
||||
}
|
||||
// recursion: main loop
|
||||
curr = 1; last = 0;
|
||||
for (k = 1; k <= size; ++k) {
|
||||
for (x = 0; x != 4; ++x) {
|
||||
int min = 0x7fffffff, ymin = 0;
|
||||
for (y = 0; y != 4; ++y) {
|
||||
int s = h[last<<2|y];
|
||||
if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
|
||||
s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
|
||||
if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
|
||||
if (s < min) {
|
||||
min = s; ymin = y;
|
||||
}
|
||||
}
|
||||
h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
|
||||
}
|
||||
last = curr; curr = 1 - curr; // swap
|
||||
}
|
||||
// back trace
|
||||
hmin = 0x7fffffff; xmin = 0;
|
||||
for (x = 0; x != 4; ++x) {
|
||||
if (h[last<<2|x] < hmin) {
|
||||
hmin = h[last<<2|x]; xmin = x;
|
||||
}
|
||||
}
|
||||
nt_read[size] = xmin;
|
||||
for (k = size - 1; k >= 0; --k)
|
||||
nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
|
||||
}
|
||||
/*
|
||||
nt_read[0..size]: nucleotide read sequence: 0/1/2/3
|
||||
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
|
||||
tarray[0..size*2-1]: temporary array
|
||||
*/
|
||||
uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
|
||||
{
|
||||
int k, c1, c2;
|
||||
uint8_t *t2array = tarray + size;
|
||||
// get the color sequence of nt_read
|
||||
c1 = nt_read[0];
|
||||
for (k = 1; k <= size; ++k) {
|
||||
c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
|
||||
tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
|
||||
c1 = c2;
|
||||
}
|
||||
for (k = 1; k != size; ++k) {
|
||||
int q = 0;
|
||||
if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
|
||||
q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
|
||||
} else if (tarray[k-1] == cs_read[k-1]>>6) {
|
||||
q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
|
||||
} else if (tarray[k] == cs_read[k]>>6) {
|
||||
q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
|
||||
} // else, q = 0
|
||||
if (q < 0) q = 0;
|
||||
if (q > 60) q = 60;
|
||||
t2array[k] = nt_read[k]<<6 | q;
|
||||
if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
|
||||
}
|
||||
return t2array + 1; // of size-2
|
||||
}
|
||||
|
||||
// this function will be called when p->seq has been reversed by refine_gapped()
|
||||
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
|
||||
{
|
||||
uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
|
||||
int i, len;
|
||||
uint8_t *seq;
|
||||
|
||||
// set temporary arrays
|
||||
if (p->type == BWA_TYPE_NO_MATCH) return;
|
||||
len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
|
||||
ta = (uint8_t*)malloc(len * 7);
|
||||
nt_ref = ta;
|
||||
cs_read = nt_ref + len;
|
||||
nt_read = cs_read + len;
|
||||
btarray = nt_read + len;
|
||||
tarray = nt_read + len;
|
||||
|
||||
#define __gen_csbase(_cs, _i, _seq) do { \
|
||||
int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
|
||||
if (q > 60) q = 60; \
|
||||
if (_seq[_i] > 3) q = 63; \
|
||||
(_cs) = _seq[_i]<<6 | q; \
|
||||
} while (0)
|
||||
|
||||
// generate len, nt_ref[] and cs_read
|
||||
seq = p->strand? p->rseq : p->seq;
|
||||
nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
|
||||
if (p->cigar == 0) { // no gap or clipping
|
||||
len = p->len;
|
||||
for (i = 0; i < p->len; ++i) {
|
||||
__gen_csbase(cs_read[i], i, seq);
|
||||
nt_ref[i+1] = bns_pac(pac, p->pos + i);
|
||||
}
|
||||
} else {
|
||||
int k, z;
|
||||
bwtint_t x, y;
|
||||
x = p->pos; y = 0;
|
||||
for (k = z = 0; k < p->n_cigar; ++k) {
|
||||
int l = __cigar_len(p->cigar[k]);
|
||||
if (__cigar_op(p->cigar[k]) == FROM_M) {
|
||||
for (i = 0; i < l; ++i, ++x, ++y) {
|
||||
__gen_csbase(cs_read[z], y, seq);
|
||||
nt_ref[z+1] = bns_pac(pac, x);
|
||||
++z;
|
||||
}
|
||||
} else if (__cigar_op(p->cigar[k]) == FROM_I) {
|
||||
for (i = 0; i < l; ++i, ++y) {
|
||||
__gen_csbase(cs_read[z], y, seq);
|
||||
nt_ref[z+1] = 4;
|
||||
++z;
|
||||
}
|
||||
} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
|
||||
else x += l;
|
||||
}
|
||||
len = z;
|
||||
}
|
||||
|
||||
cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
|
||||
new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
|
||||
|
||||
// update p
|
||||
p->len = p->full_len = len - 1;
|
||||
for (i = 0; i < p->len; ++i) {
|
||||
if ((new_nt_read[i]&0x3f) == 63) {
|
||||
p->qual[i] = 33; seq[i] = 4;
|
||||
} else {
|
||||
p->qual[i] = (new_nt_read[i]&0x3f) + 33;
|
||||
seq[i] = new_nt_read[i]>>6;
|
||||
}
|
||||
}
|
||||
p->qual[p->len] = seq[p->len] = 0;
|
||||
if (p->strand) {
|
||||
memcpy(p->seq, seq, p->len);
|
||||
seq_reverse(p->len, p->seq, 1);
|
||||
seq_reverse(p->len, p->qual, 0);
|
||||
} else {
|
||||
memcpy(p->rseq, seq, p->len);
|
||||
seq_reverse(p->len, p->rseq, 1);
|
||||
}
|
||||
free(ta);
|
||||
}
|
||||
3
main.c
3
main.c
|
|
@ -28,7 +28,6 @@ static int usage()
|
|||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n");
|
||||
fprintf(stderr, " stdsw standard SW/NW alignment\n");
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -51,11 +50,9 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
|
|
|
|||
2
main.h
2
main.h
|
|
@ -17,8 +17,6 @@ extern "C" {
|
|||
int bwa_sai2sam_se(int argc, char *argv[]);
|
||||
int bwa_sai2sam_pe(int argc, char *argv[]);
|
||||
|
||||
int bwa_stdsw(int argc, char *argv[]);
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[]);
|
||||
|
||||
int main_fastmap(int argc, char *argv[]);
|
||||
|
|
|
|||
162
simple_dp.c
162
simple_dp.c
|
|
@ -1,162 +0,0 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <stdint.h>
|
||||
#include "stdaln.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
typedef struct {
|
||||
int l;
|
||||
unsigned char *s;
|
||||
char *n;
|
||||
} seq1_t;
|
||||
|
||||
typedef struct {
|
||||
int n_seqs, m_seqs;
|
||||
seq1_t *seqs;
|
||||
} seqs_t;
|
||||
|
||||
unsigned char aln_rev_table[256] = {
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
|
||||
'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
|
||||
'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
|
||||
'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
|
||||
};
|
||||
|
||||
static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
|
||||
static AlnParam g_aln_param;
|
||||
|
||||
static void revseq(int len, uint8_t *seq)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < len>>1; ++i) {
|
||||
uint8_t tmp = aln_rev_table[seq[len-1-i]];
|
||||
seq[len-1-i] = aln_rev_table[seq[i]];
|
||||
seq[i] = tmp;
|
||||
}
|
||||
if (len&1) seq[i] = aln_rev_table[seq[i]];
|
||||
}
|
||||
|
||||
static seqs_t *load_seqs(const char *fn)
|
||||
{
|
||||
seqs_t *s;
|
||||
seq1_t *p;
|
||||
gzFile fp;
|
||||
int l;
|
||||
kseq_t *seq;
|
||||
|
||||
fp = xzopen(fn, "r");
|
||||
seq = kseq_init(fp);
|
||||
s = (seqs_t*)calloc(1, sizeof(seqs_t));
|
||||
s->m_seqs = 256;
|
||||
s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
|
||||
while ((l = kseq_read(seq)) >= 0) {
|
||||
if (s->n_seqs == s->m_seqs) {
|
||||
s->m_seqs <<= 1;
|
||||
s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
|
||||
}
|
||||
p = s->seqs + (s->n_seqs++);
|
||||
p->l = seq->seq.l;
|
||||
p->s = (unsigned char*)malloc(p->l + 1);
|
||||
memcpy(p->s, seq->seq.s, p->l);
|
||||
p->s[p->l] = 0;
|
||||
p->n = strdup((const char*)seq->name.s);
|
||||
}
|
||||
kseq_destroy(seq);
|
||||
gzclose(fp);
|
||||
fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
|
||||
return s;
|
||||
}
|
||||
|
||||
static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < ss->n_seqs; ++i) {
|
||||
AlnAln *aa;
|
||||
seq1_t *p = ss->seqs + i;
|
||||
g_aln_param.band_width = l + p->l;
|
||||
aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
|
||||
if (aa->score >= g_thres || g_is_global) {
|
||||
printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
|
||||
aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
|
||||
// NB: I put the short sequence as the first sequence in SW, an insertion to
|
||||
// the reference becomes a deletion from the short sequence. Therefore, I use
|
||||
// "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
|
||||
for (i = 0; i != aa->n_cigar; ++i)
|
||||
printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]);
|
||||
printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
|
||||
}
|
||||
aln_free_AlnAln(aa);
|
||||
}
|
||||
}
|
||||
|
||||
static void aln_seqs(const seqs_t *ss, const char *fn)
|
||||
{
|
||||
gzFile fp;
|
||||
kseq_t *seq;
|
||||
int l;
|
||||
|
||||
fp = xzopen(fn, "r");
|
||||
seq = kseq_init(fp);
|
||||
while ((l = kseq_read(seq)) >= 0) {
|
||||
if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+');
|
||||
if (g_strand&2) {
|
||||
revseq(l, (uint8_t*)seq->seq.s);
|
||||
aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-');
|
||||
}
|
||||
}
|
||||
kseq_destroy(seq);
|
||||
gzclose(fp);
|
||||
}
|
||||
|
||||
int bwa_stdsw(int argc, char *argv[])
|
||||
{
|
||||
int c;
|
||||
seqs_t *ss;
|
||||
|
||||
while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
|
||||
switch (c) {
|
||||
case 'g': g_is_global = 1; break;
|
||||
case 'T': g_thres = atoi(optarg); break;
|
||||
case 'f': g_strand |= 1; break;
|
||||
case 'r': g_strand |= 2; break;
|
||||
case 'p': g_aa = 1; break;
|
||||
}
|
||||
}
|
||||
if (g_strand == 0) g_strand = 3;
|
||||
if (g_aa) g_strand = 1;
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "\nUsage: bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
|
||||
fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres);
|
||||
fprintf(stderr, " -p protein alignment (suppressing -r)\n");
|
||||
fprintf(stderr, " -f forward strand only\n");
|
||||
fprintf(stderr, " -r reverse strand only\n");
|
||||
fprintf(stderr, " -g global alignment\n\n");
|
||||
fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
|
||||
fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
|
||||
fprintf(stderr, " sequence.\n\n");
|
||||
return 1;
|
||||
}
|
||||
g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
|
||||
g_aln_param.gap_end = 0;
|
||||
ss = load_seqs(argv[optind]);
|
||||
aln_seqs(ss, argv[optind+1]);
|
||||
return 0;
|
||||
}
|
||||
Loading…
Reference in New Issue