From 6ad5a3c086c82139f6f30f191fbfcc3cae2ec542 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:21:17 -0500 Subject: [PATCH] removed color-space support which has been broken since 0.6.x --- Makefile | 3 +- bwape.c | 38 ++--------- bwase.c | 45 ++----------- bwase.h | 2 +- bwtaln.c | 4 +- bwtaln.h | 1 - bwtindex.c | 24 +------ cs2nt.c | 191 ---------------------------------------------------- main.c | 3 - main.h | 2 - simple_dp.c | 162 -------------------------------------------- 11 files changed, 17 insertions(+), 458 deletions(-) delete mode 100644 cs2nt.c delete mode 100644 simple_dp.c diff --git a/Makefile b/Makefile index 6a3fc1e..8cf767a 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,7 @@ DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ - is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ - bwape.o cs2nt.o \ + is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwape.c b/bwape.c index 644b5bd..4201cf2 100644 --- a/bwape.c +++ b/bwape.c @@ -212,19 +212,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, last_pos[x.y&1][1] = x; } } - } else if (opt->type == BWA_PET_SOLID) { - for (i = 0; i < d->arr.n; ++i) { - pair64_t x = d->arr.a[i]; - int strand = x.y>>1&1; - if ((strand^x.y)&1) { // push - int y = 1 - (x.y&1); - __pairing_aux(last_pos[y][1], x); - __pairing_aux(last_pos[y][0], x); - } else { // check - last_pos[x.y&1][0] = last_pos[x.y&1][1]; - last_pos[x.y&1][1] = x; - } - } } else { fprintf(stderr, "[paring] not implemented yet!\n"); exit(1); @@ -567,11 +554,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, ++n_tot[is_singleton]; cigar[0] = cigar[1] = 0; n_cigar[0] = n_cigar[1] = 0; - if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified ubyte_t *seq; if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip - if (popt->type == BWA_PET_STD) { + { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate __set_rght_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->rseq; @@ -580,17 +567,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, seq = p[k]->seq; seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly } - } else { // BWA_PET_SOLID - if (p[1-k]->strand == 0) { // R3-F3 pairing - if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->rseq; - seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed - } else { // F3-R3 pairing - if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->seq; - } } // perform SW alignment cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); @@ -654,7 +630,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa[2]; gap_opt_t opt, opt0; khint_t iter; @@ -679,10 +655,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f opt0 = opt; fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); - if (!(opt.mode & BWA_MODE_COMPREAD)) { - popt->type = BWA_PET_SOLID; - ntbns = bwa_open_nt(prefix); - } else { // for Illumina alignment only + { // for Illumina alignment only if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); @@ -715,7 +688,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); for (j = 0; j < 2; ++j) - bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); if (pac == 0) free(pacseq); @@ -740,7 +713,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // destroy bns_destroy(bns); - if (ntbns) bns_destroy(ntbns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); fclose(fp_sa[i]); diff --git a/bwase.c b/bwase.c index 35744e7..8fa79ac 100644 --- a/bwase.c +++ b/bwase.c @@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s) s->len = s->full_len; } -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { - ubyte_t *pacseq, *ntpac = 0; + ubyte_t *pacseq; int i, j; kstring_t *str; - if (ntbns) { // in color space - ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); - rewind(ntbns->fp_pac); - fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); - } - if (!_pacseq) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); rewind(bns->fp_pac); @@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); } -#if 0 - if (ntbns) { // in color space - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - bwa_cs2nt_core(s, bns->l_pac, ntpac); - for (j = 0; j < s->n_multi; ++j) { - bwt_multi1_t *q = s->multi + j; - int n_cigar; - if (q->gap == 0) continue; - free(q->cigar); - q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 1 : -1) * q->gap, &n_cigar, 0); - q->n_cigar = n_cigar; - } - if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again - free(s->cigar); - s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); - } - } - } -#endif // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { @@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t if (s->type != BWA_TYPE_NO_MATCH) { int nm; s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, ntbns? ntpac : pacseq, str, &nm); + bns->l_pac, pacseq, str, &nm); s->nm = nm; } } free(str->s); free(str); // correct for trimmed reads - if (!ntbns) // trimming is only enabled for Illumina reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); if (!_pacseq) free(pacseq); - free(ntpac); } int64_t pos_end(const bwa_seq_t *p) @@ -587,7 +557,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; @@ -599,8 +569,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac - ntbns = bwa_open_nt(prefix); bwa_print_sam_SQ(bns); //bwa_print_sam_PG(); // set ks @@ -628,7 +596,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); - bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... "); @@ -642,7 +610,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // destroy bwa_seq_close(ks); - if (ntbns) bns_destroy(ntbns); bns_destroy(bns); fclose(fp_sa); free(aln); diff --git a/bwase.h b/bwase.h index f8e9b0a..26a9f68 100644 --- a/bwase.h +++ b/bwase.h @@ -14,7 +14,7 @@ extern "C" { // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. - void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. diff --git a/bwtaln.c b/bwtaln.c index efc7f66..84be510 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -252,7 +252,7 @@ int bwa_aln(int argc, char *argv[]) char *prefix; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -272,7 +272,6 @@ int bwa_aln(int argc, char *argv[]) case 'L': opt->mode |= BWA_MODE_LOGGAP; break; case 'R': opt->max_top2 = atoi(optarg); break; case 'q': opt->trim_qual = atoi(optarg); break; - case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; case 'b': opt->mode |= BWA_MODE_BAM; break; @@ -310,7 +309,6 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); fprintf(stderr, " -B INT length of barcode\n"); -// fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); diff --git a/bwtaln.h b/bwtaln.h index 39eaf4b..412cc04 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -107,7 +107,6 @@ typedef struct { } gap_opt_t; #define BWA_PET_STD 1 -#define BWA_PET_SOLID 2 typedef struct { int max_isize, force_isize; diff --git a/bwtindex.c b/bwtindex.c index 938e982..c01fa95 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -42,11 +42,11 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev); int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_color = 0, is_64 = 0; + int c, algo_type = 0, is_64 = 0; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { + while ((c = getopt(argc, argv, "6a:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -55,7 +55,6 @@ int bwa_index(int argc, char *argv[]) else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; - case 'c': is_color = 1; break; case '6': is_64 = 1; break; default: return 1; } @@ -67,7 +66,6 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); -// fprintf(stderr, " -c build color-space index\n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); @@ -83,29 +81,13 @@ int bwa_index(int argc, char *argv[]) str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); - if (is_color == 0) { // nucleotide indexing + { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); - } else { // color indexing - gzFile fp = xzopen(argv[optind], "r"); - strcat(strcpy(str, prefix), ".nt"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); - l_pac = bns_fasta2bntseq(fp, str, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); - { - char *tmp_argv[3]; - tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; - t = clock(); - fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); - bwa_pac2cspac(3, tmp_argv); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { diff --git a/cs2nt.c b/cs2nt.c deleted file mode 100644 index dfbce60..0000000 --- a/cs2nt.c +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include "bwtaln.h" -#include "stdaln.h" - -/* - Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we - decode as ATTGAC(RBGOG), there are one color change and one nt change; - if we decode as ATTAAC(RBRBG), there are two color changes. - - In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM - as the penalty; otherwise, we will use color quality as the - penalty. This means we always prefer two consistent color changes over - a nt change, but if a color has high quality, we may prefer one nt - change. - - In the above example, the penalties of the two types of decoding are - q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first; - otherwise the second. Note that no matter what we choose, the fourth - base will get a low nt quality. - */ - -#define COLOR_MM 19 -#define NUCL_MM 25 - -static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 }; - -/* - {A,C,G,T,N} -> {0,1,2,3,4} - nt_ref[0..size]: nucleotide reference: 0/1/2/3/4 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned) - btarray[0..4*size]: backtrack array (working space) - */ -void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray) -{ - int h[8], curr, last; - int x, y, xmin, hmin, k; - - // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last - - // recursion: initial value - if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2); - else { - for (x = 0; x != 4; ++x) h[x] = NUCL_MM; - h[nt_ref[0]] = 0; - } - // recursion: main loop - curr = 1; last = 0; - for (k = 1; k <= size; ++k) { - for (x = 0; x != 4; ++x) { - int min = 0x7fffffff, ymin = 0; - for (y = 0; y != 4; ++y) { - int s = h[last<<2|y]; - if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<= 0; --k) - nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]]; -} -/* - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - tarray[0..size*2-1]: temporary array - */ -uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray) -{ - int k, c1, c2; - uint8_t *t2array = tarray + size; - // get the color sequence of nt_read - c1 = nt_read[0]; - for (k = 1; k <= size; ++k) { - c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case - tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<>6 && tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10; - } else if (tarray[k-1] == cs_read[k-1]>>6) { - q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f); - } else if (tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f); - } // else, q = 0 - if (q < 0) q = 0; - if (q > 60) q = 60; - t2array[k] = nt_read[k]<<6 | q; - if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0; - } - return t2array + 1; // of size-2 -} - -// this function will be called when p->seq has been reversed by refine_gapped() -void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) -{ - uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read; - int i, len; - uint8_t *seq; - - // set temporary arrays - if (p->type == BWA_TYPE_NO_MATCH) return; - len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space - ta = (uint8_t*)malloc(len * 7); - nt_ref = ta; - cs_read = nt_ref + len; - nt_read = cs_read + len; - btarray = nt_read + len; - tarray = nt_read + len; - -#define __gen_csbase(_cs, _i, _seq) do { \ - int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \ - if (q > 60) q = 60; \ - if (_seq[_i] > 3) q = 63; \ - (_cs) = _seq[_i]<<6 | q; \ - } while (0) - - // generate len, nt_ref[] and cs_read - seq = p->strand? p->rseq : p->seq; - nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4; - if (p->cigar == 0) { // no gap or clipping - len = p->len; - for (i = 0; i < p->len; ++i) { - __gen_csbase(cs_read[i], i, seq); - nt_ref[i+1] = bns_pac(pac, p->pos + i); - } - } else { - int k, z; - bwtint_t x, y; - x = p->pos; y = 0; - for (k = z = 0; k < p->n_cigar; ++k) { - int l = __cigar_len(p->cigar[k]); - if (__cigar_op(p->cigar[k]) == FROM_M) { - for (i = 0; i < l; ++i, ++x, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = bns_pac(pac, x); - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_I) { - for (i = 0; i < l; ++i, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = 4; - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; - else x += l; - } - len = z; - } - - cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray); - new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray); - - // update p - p->len = p->full_len = len - 1; - for (i = 0; i < p->len; ++i) { - if ((new_nt_read[i]&0x3f) == 63) { - p->qual[i] = 33; seq[i] = 4; - } else { - p->qual[i] = (new_nt_read[i]&0x3f) + 33; - seq[i] = new_nt_read[i]>>6; - } - } - p->qual[p->len] = seq[p->len] = 0; - if (p->strand) { - memcpy(p->seq, seq, p->len); - seq_reverse(p->len, p->seq, 1); - seq_reverse(p->len, p->qual, 0); - } else { - memcpy(p->rseq, seq, p->len); - seq_reverse(p->len, p->rseq, 1); - } - free(ta); -} diff --git a/main.c b/main.c index 2718732..fc63c2e 100644 --- a/main.c +++ b/main.c @@ -28,7 +28,6 @@ static int usage() fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); - fprintf(stderr, " stdsw standard SW/NW alignment\n"); fprintf(stderr, "\n"); return 1; } @@ -51,11 +50,9 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); - else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); diff --git a/main.h b/main.h index 1a0292a..7b638ca 100644 --- a/main.h +++ b/main.h @@ -17,8 +17,6 @@ extern "C" { int bwa_sai2sam_se(int argc, char *argv[]); int bwa_sai2sam_pe(int argc, char *argv[]); - int bwa_stdsw(int argc, char *argv[]); - int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); diff --git a/simple_dp.c b/simple_dp.c deleted file mode 100644 index d2b4b71..0000000 --- a/simple_dp.c +++ /dev/null @@ -1,162 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "stdaln.h" -#include "utils.h" - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -typedef struct { - int l; - unsigned char *s; - char *n; -} seq1_t; - -typedef struct { - int n_seqs, m_seqs; - seq1_t *seqs; -} seqs_t; - -unsigned char aln_rev_table[256] = { - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', - 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', - 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N', - 'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' -}; - -static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0; -static AlnParam g_aln_param; - -static void revseq(int len, uint8_t *seq) -{ - int i; - for (i = 0; i < len>>1; ++i) { - uint8_t tmp = aln_rev_table[seq[len-1-i]]; - seq[len-1-i] = aln_rev_table[seq[i]]; - seq[i] = tmp; - } - if (len&1) seq[i] = aln_rev_table[seq[i]]; -} - -static seqs_t *load_seqs(const char *fn) -{ - seqs_t *s; - seq1_t *p; - gzFile fp; - int l; - kseq_t *seq; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - s = (seqs_t*)calloc(1, sizeof(seqs_t)); - s->m_seqs = 256; - s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); - while ((l = kseq_read(seq)) >= 0) { - if (s->n_seqs == s->m_seqs) { - s->m_seqs <<= 1; - s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); - } - p = s->seqs + (s->n_seqs++); - p->l = seq->seq.l; - p->s = (unsigned char*)malloc(p->l + 1); - memcpy(p->s, seq->seq.s, p->l); - p->s[p->l] = 0; - p->n = strdup((const char*)seq->name.s); - } - kseq_destroy(seq); - gzclose(fp); - fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); - return s; -} - -static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) -{ - int i; - for (i = 0; i < ss->n_seqs; ++i) { - AlnAln *aa; - seq1_t *p = ss->seqs + i; - g_aln_param.band_width = l + p->l; - aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l); - if (aa->score >= g_thres || g_is_global) { - printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, - aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); - // NB: I put the short sequence as the first sequence in SW, an insertion to - // the reference becomes a deletion from the short sequence. Therefore, I use - // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. - for (i = 0; i != aa->n_cigar; ++i) - printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); - } - aln_free_AlnAln(aa); - } -} - -static void aln_seqs(const seqs_t *ss, const char *fn) -{ - gzFile fp; - kseq_t *seq; - int l; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+'); - if (g_strand&2) { - revseq(l, (uint8_t*)seq->seq.s); - aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-'); - } - } - kseq_destroy(seq); - gzclose(fp); -} - -int bwa_stdsw(int argc, char *argv[]) -{ - int c; - seqs_t *ss; - - while ((c = getopt(argc, argv, "gT:frp")) >= 0) { - switch (c) { - case 'g': g_is_global = 1; break; - case 'T': g_thres = atoi(optarg); break; - case 'f': g_strand |= 1; break; - case 'r': g_strand |= 2; break; - case 'p': g_aa = 1; break; - } - } - if (g_strand == 0) g_strand = 3; - if (g_aa) g_strand = 1; - if (optind + 1 >= argc) { - fprintf(stderr, "\nUsage: bwa stdsw [options] \n\n"); - fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres); - fprintf(stderr, " -p protein alignment (suppressing -r)\n"); - fprintf(stderr, " -f forward strand only\n"); - fprintf(stderr, " -r reverse strand only\n"); - fprintf(stderr, " -g global alignment\n\n"); - fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n"); - fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n"); - fprintf(stderr, " sequence.\n\n"); - return 1; - } - g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast; - g_aln_param.gap_end = 0; - ss = load_seqs(argv[optind]); - aln_seqs(ss, argv[optind+1]); - return 0; -}