removed color-space support

which has been broken since 0.6.x
This commit is contained in:
Heng Li 2013-02-12 10:21:17 -05:00
parent e5ab59db53
commit 6ad5a3c086
11 changed files with 17 additions and 458 deletions

View File

@ -6,8 +6,7 @@ DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \
bseq.o bwaseqio.o bwase.o kstring.o
AOBJS= QSufSort.o bwt_gen.o \
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
bwape.o cs2nt.o \
is.o bwtmisc.o bwtindex.o ksw.o bwape.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa

38
bwape.c
View File

@ -212,19 +212,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
last_pos[x.y&1][1] = x;
}
}
} else if (opt->type == BWA_PET_SOLID) {
for (i = 0; i < d->arr.n; ++i) {
pair64_t x = d->arr.a[i];
int strand = x.y>>1&1;
if ((strand^x.y)&1) { // push
int y = 1 - (x.y&1);
__pairing_aux(last_pos[y][1], x);
__pairing_aux(last_pos[y][0], x);
} else { // check
last_pos[x.y&1][0] = last_pos[x.y&1][1];
last_pos[x.y&1][1] = x;
}
}
} else {
fprintf(stderr, "[paring] not implemented yet!\n");
exit(1);
@ -567,11 +554,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
++n_tot[is_singleton];
cigar[0] = cigar[1] = 0;
n_cigar[0] = n_cigar[1] = 0;
if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
ubyte_t *seq;
if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
if (popt->type == BWA_PET_STD) {
{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
seq = p[k]->rseq;
@ -580,17 +567,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
seq = p[k]->seq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
}
} else { // BWA_PET_SOLID
if (p[1-k]->strand == 0) { // R3-F3 pairing
if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
seq = p[k]->rseq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
} else { // F3-R3 pairing
if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
seq = p[k]->seq;
}
}
// perform SW alignment
cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
@ -654,7 +630,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
bwa_seq_t *seqs[2];
bwa_seqio_t *ks[2];
clock_t t;
bntseq_t *bns, *ntbns = 0;
bntseq_t *bns;
FILE *fp_sa[2];
gap_opt_t opt, opt0;
khint_t iter;
@ -679,10 +655,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
opt0 = opt;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
if (!(opt.mode & BWA_MODE_COMPREAD)) {
popt->type = BWA_PET_SOLID;
ntbns = bwa_open_nt(prefix);
} else { // for Illumina alignment only
{ // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
@ -715,7 +688,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
for (j = 0; j < 2; ++j)
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
if (pac == 0) free(pacseq);
@ -740,7 +713,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
// destroy
bns_destroy(bns);
if (ntbns) bns_destroy(ntbns);
for (i = 0; i < 2; ++i) {
bwa_seq_close(ks[i]);
fclose(fp_sa[i]);

43
bwase.c
View File

@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s)
s->len = s->full_len;
}
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
{
ubyte_t *pacseq, *ntpac = 0;
ubyte_t *pacseq;
int i, j;
kstring_t *str;
if (ntbns) { // in color space
ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
rewind(ntbns->fp_pac);
fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
}
if (!_pacseq) {
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
}
#if 0
if (ntbns) { // in color space
for (i = 0; i < n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
bwa_cs2nt_core(s, bns->l_pac, ntpac);
for (j = 0; j < s->n_multi; ++j) {
bwt_multi1_t *q = s->multi + j;
int n_cigar;
if (q->gap == 0) continue;
free(q->cigar);
q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
(q->strand? 1 : -1) * q->gap, &n_cigar, 0);
q->n_cigar = n_cigar;
}
if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
free(s->cigar);
s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
}
}
}
#endif
// generate MD tag
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = 0; i != n_seqs; ++i) {
@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
if (s->type != BWA_TYPE_NO_MATCH) {
int nm;
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
bns->l_pac, pacseq, str, &nm);
s->nm = nm;
}
}
free(str->s); free(str);
// correct for trimmed reads
if (!ntbns) // trimming is only enabled for Illumina reads
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
if (!_pacseq) free(pacseq);
free(ntpac);
}
int64_t pos_end(const bwa_seq_t *p)
@ -587,7 +557,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
bwa_seq_t *seqs;
bwa_seqio_t *ks;
clock_t t;
bntseq_t *bns, *ntbns = 0;
bntseq_t *bns;
FILE *fp_sa;
gap_opt_t opt;
@ -599,8 +569,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
m_aln = 0;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
ntbns = bwa_open_nt(prefix);
bwa_print_sam_SQ(bns);
//bwa_print_sam_PG();
// set ks
@ -628,7 +596,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
bwa_refine_gapped(bns, n_seqs, seqs, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] print alignments... ");
@ -642,7 +610,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
// destroy
bwa_seq_close(ks);
if (ntbns) bns_destroy(ntbns);
bns_destroy(bns);
fclose(fp_sa);
free(aln);

View File

@ -14,7 +14,7 @@ extern "C" {
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate position of the sequence to an actual placement for the sequence.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Calculate the end position of a read given a certain sequence.

View File

@ -252,7 +252,7 @@ int bwa_aln(int argc, char *argv[])
char *prefix;
opt = gap_init_opt();
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
switch (c) {
case 'n':
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
@ -272,7 +272,6 @@ int bwa_aln(int argc, char *argv[])
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
case 'R': opt->max_top2 = atoi(optarg); break;
case 'q': opt->trim_qual = atoi(optarg); break;
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
case 'f': xreopen(optarg, "wb", stdout); break;
case 'b': opt->mode |= BWA_MODE_BAM; break;
@ -310,7 +309,6 @@ int bwa_aln(int argc, char *argv[])
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
fprintf(stderr, " -B INT length of barcode\n");
// fprintf(stderr, " -c input sequences are in the color space\n");
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");

View File

@ -107,7 +107,6 @@ typedef struct {
} gap_opt_t;
#define BWA_PET_STD 1
#define BWA_PET_SOLID 2
typedef struct {
int max_isize, force_isize;

View File

@ -42,11 +42,11 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev);
int bwa_index(int argc, char *argv[])
{
char *prefix = 0, *str, *str2, *str3;
int c, algo_type = 0, is_color = 0, is_64 = 0;
int c, algo_type = 0, is_64 = 0;
clock_t t;
int64_t l_pac;
while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1;
@ -55,7 +55,6 @@ int bwa_index(int argc, char *argv[])
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
break;
case 'p': prefix = strdup(optarg); break;
case 'c': is_color = 1; break;
case '6': is_64 = 1; break;
default: return 1;
}
@ -67,7 +66,6 @@ int bwa_index(int argc, char *argv[])
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
// fprintf(stderr, " -c build color-space index\n");
fprintf(stderr, "\n");
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
@ -83,29 +81,13 @@ int bwa_index(int argc, char *argv[])
str2 = (char*)calloc(strlen(prefix) + 10, 1);
str3 = (char*)calloc(strlen(prefix) + 10, 1);
if (is_color == 0) { // nucleotide indexing
{ // nucleotide indexing
gzFile fp = xzopen(argv[optind], "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
} else { // color indexing
gzFile fp = xzopen(argv[optind], "r");
strcat(strcpy(str, prefix), ".nt");
t = clock();
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
l_pac = bns_fasta2bntseq(fp, str, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
{
char *tmp_argv[3];
tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
t = clock();
fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
bwa_pac2cspac(3, tmp_argv);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
{

191
cs2nt.c
View File

@ -1,191 +0,0 @@
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include "bwtaln.h"
#include "stdaln.h"
/*
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
decode as ATTGAC(RBGOG), there are one color change and one nt change;
if we decode as ATTAAC(RBRBG), there are two color changes.
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
as the penalty; otherwise, we will use color quality as the
penalty. This means we always prefer two consistent color changes over
a nt change, but if a color has high quality, we may prefer one nt
change.
In the above example, the penalties of the two types of decoding are
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
otherwise the second. Note that no matter what we choose, the fourth
base will get a low nt quality.
*/
#define COLOR_MM 19
#define NUCL_MM 25
static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
/*
{A,C,G,T,N} -> {0,1,2,3,4}
nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
btarray[0..4*size]: backtrack array (working space)
*/
void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
{
int h[8], curr, last;
int x, y, xmin, hmin, k;
// h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
// recursion: initial value
if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
else {
for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
h[nt_ref[0]] = 0;
}
// recursion: main loop
curr = 1; last = 0;
for (k = 1; k <= size; ++k) {
for (x = 0; x != 4; ++x) {
int min = 0x7fffffff, ymin = 0;
for (y = 0; y != 4; ++y) {
int s = h[last<<2|y];
if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
if (s < min) {
min = s; ymin = y;
}
}
h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
}
last = curr; curr = 1 - curr; // swap
}
// back trace
hmin = 0x7fffffff; xmin = 0;
for (x = 0; x != 4; ++x) {
if (h[last<<2|x] < hmin) {
hmin = h[last<<2|x]; xmin = x;
}
}
nt_read[size] = xmin;
for (k = size - 1; k >= 0; --k)
nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
}
/*
nt_read[0..size]: nucleotide read sequence: 0/1/2/3
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
tarray[0..size*2-1]: temporary array
*/
uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
{
int k, c1, c2;
uint8_t *t2array = tarray + size;
// get the color sequence of nt_read
c1 = nt_read[0];
for (k = 1; k <= size; ++k) {
c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
c1 = c2;
}
for (k = 1; k != size; ++k) {
int q = 0;
if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
} else if (tarray[k-1] == cs_read[k-1]>>6) {
q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
} else if (tarray[k] == cs_read[k]>>6) {
q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
} // else, q = 0
if (q < 0) q = 0;
if (q > 60) q = 60;
t2array[k] = nt_read[k]<<6 | q;
if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
}
return t2array + 1; // of size-2
}
// this function will be called when p->seq has been reversed by refine_gapped()
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
{
uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
int i, len;
uint8_t *seq;
// set temporary arrays
if (p->type == BWA_TYPE_NO_MATCH) return;
len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
ta = (uint8_t*)malloc(len * 7);
nt_ref = ta;
cs_read = nt_ref + len;
nt_read = cs_read + len;
btarray = nt_read + len;
tarray = nt_read + len;
#define __gen_csbase(_cs, _i, _seq) do { \
int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
if (q > 60) q = 60; \
if (_seq[_i] > 3) q = 63; \
(_cs) = _seq[_i]<<6 | q; \
} while (0)
// generate len, nt_ref[] and cs_read
seq = p->strand? p->rseq : p->seq;
nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
if (p->cigar == 0) { // no gap or clipping
len = p->len;
for (i = 0; i < p->len; ++i) {
__gen_csbase(cs_read[i], i, seq);
nt_ref[i+1] = bns_pac(pac, p->pos + i);
}
} else {
int k, z;
bwtint_t x, y;
x = p->pos; y = 0;
for (k = z = 0; k < p->n_cigar; ++k) {
int l = __cigar_len(p->cigar[k]);
if (__cigar_op(p->cigar[k]) == FROM_M) {
for (i = 0; i < l; ++i, ++x, ++y) {
__gen_csbase(cs_read[z], y, seq);
nt_ref[z+1] = bns_pac(pac, x);
++z;
}
} else if (__cigar_op(p->cigar[k]) == FROM_I) {
for (i = 0; i < l; ++i, ++y) {
__gen_csbase(cs_read[z], y, seq);
nt_ref[z+1] = 4;
++z;
}
} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
else x += l;
}
len = z;
}
cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
// update p
p->len = p->full_len = len - 1;
for (i = 0; i < p->len; ++i) {
if ((new_nt_read[i]&0x3f) == 63) {
p->qual[i] = 33; seq[i] = 4;
} else {
p->qual[i] = (new_nt_read[i]&0x3f) + 33;
seq[i] = new_nt_read[i]>>6;
}
}
p->qual[p->len] = seq[p->len] = 0;
if (p->strand) {
memcpy(p->seq, seq, p->len);
seq_reverse(p->len, p->seq, 1);
seq_reverse(p->len, p->qual, 0);
} else {
memcpy(p->rseq, seq, p->len);
seq_reverse(p->len, p->rseq, 1);
}
free(ta);
}

3
main.c
View File

@ -28,7 +28,6 @@ static int usage()
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n");
fprintf(stderr, " stdsw standard SW/NW alignment\n");
fprintf(stderr, "\n");
return 1;
}
@ -51,11 +50,9 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1);
else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1);
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);

2
main.h
View File

@ -17,8 +17,6 @@ extern "C" {
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
int bwa_stdsw(int argc, char *argv[]);
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);

View File

@ -1,162 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <zlib.h>
#include <stdint.h>
#include "stdaln.h"
#include "utils.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
typedef struct {
int l;
unsigned char *s;
char *n;
} seq1_t;
typedef struct {
int n_seqs, m_seqs;
seq1_t *seqs;
} seqs_t;
unsigned char aln_rev_table[256] = {
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
};
static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
static AlnParam g_aln_param;
static void revseq(int len, uint8_t *seq)
{
int i;
for (i = 0; i < len>>1; ++i) {
uint8_t tmp = aln_rev_table[seq[len-1-i]];
seq[len-1-i] = aln_rev_table[seq[i]];
seq[i] = tmp;
}
if (len&1) seq[i] = aln_rev_table[seq[i]];
}
static seqs_t *load_seqs(const char *fn)
{
seqs_t *s;
seq1_t *p;
gzFile fp;
int l;
kseq_t *seq;
fp = xzopen(fn, "r");
seq = kseq_init(fp);
s = (seqs_t*)calloc(1, sizeof(seqs_t));
s->m_seqs = 256;
s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
while ((l = kseq_read(seq)) >= 0) {
if (s->n_seqs == s->m_seqs) {
s->m_seqs <<= 1;
s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
}
p = s->seqs + (s->n_seqs++);
p->l = seq->seq.l;
p->s = (unsigned char*)malloc(p->l + 1);
memcpy(p->s, seq->seq.s, p->l);
p->s[p->l] = 0;
p->n = strdup((const char*)seq->name.s);
}
kseq_destroy(seq);
gzclose(fp);
fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
return s;
}
static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
{
int i;
for (i = 0; i < ss->n_seqs; ++i) {
AlnAln *aa;
seq1_t *p = ss->seqs + i;
g_aln_param.band_width = l + p->l;
aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
if (aa->score >= g_thres || g_is_global) {
printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
// NB: I put the short sequence as the first sequence in SW, an insertion to
// the reference becomes a deletion from the short sequence. Therefore, I use
// "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
for (i = 0; i != aa->n_cigar; ++i)
printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]);
printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
}
aln_free_AlnAln(aa);
}
}
static void aln_seqs(const seqs_t *ss, const char *fn)
{
gzFile fp;
kseq_t *seq;
int l;
fp = xzopen(fn, "r");
seq = kseq_init(fp);
while ((l = kseq_read(seq)) >= 0) {
if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+');
if (g_strand&2) {
revseq(l, (uint8_t*)seq->seq.s);
aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-');
}
}
kseq_destroy(seq);
gzclose(fp);
}
int bwa_stdsw(int argc, char *argv[])
{
int c;
seqs_t *ss;
while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
switch (c) {
case 'g': g_is_global = 1; break;
case 'T': g_thres = atoi(optarg); break;
case 'f': g_strand |= 1; break;
case 'r': g_strand |= 2; break;
case 'p': g_aa = 1; break;
}
}
if (g_strand == 0) g_strand = 3;
if (g_aa) g_strand = 1;
if (optind + 1 >= argc) {
fprintf(stderr, "\nUsage: bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres);
fprintf(stderr, " -p protein alignment (suppressing -r)\n");
fprintf(stderr, " -f forward strand only\n");
fprintf(stderr, " -r reverse strand only\n");
fprintf(stderr, " -g global alignment\n\n");
fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
fprintf(stderr, " sequence.\n\n");
return 1;
}
g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
g_aln_param.gap_end = 0;
ss = load_seqs(argv[optind]);
aln_seqs(ss, argv[optind+1]);
return 0;
}