From 1bba5ef20e3704e83495995af32520e0c25d3f0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Aug 2014 10:31:54 -0400 Subject: [PATCH 01/19] r807: allow to change block size in bwt_gen For a very large reference genome, the default is too small. --- bwt.h | 1 + bwt_gen.c | 9 +++++++-- bwtindex.c | 17 +++++++++++------ main.c | 2 +- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/bwt.h b/bwt.h index d2ff0ac..c862352 100644 --- a/bwt.h +++ b/bwt.h @@ -92,6 +92,7 @@ extern "C" { void bwt_destroy(bwt_t *bwt); void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW + void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW void bwt_cal_sa(bwt_t *bwt, int intv); void bwt_bwtupdate_core(bwt_t *bwt); diff --git a/bwt_gen.c b/bwt_gen.c index 6139d80..76f28c9 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -1598,15 +1598,20 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o } } -void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) +void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size) { BWTInc *bwtInc; - bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000); + bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); } +void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) +{ + bwt_bwtgen2(fn_pac, fn_bwt, 10000000); +} + int bwt_bwtgen_main(int argc, char *argv[]) { if (argc < 3) { diff --git a/bwtindex.c b/bwtindex.c index 9e3ec15..18523da 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -189,11 +189,11 @@ int bwa_index(int argc, char *argv[]) // the "index" command extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_64 = 0; + int c, algo_type = 0, is_64 = 0, block_size = 10000000; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "6a:p:")) >= 0) { + while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -203,20 +203,25 @@ int bwa_index(int argc, char *argv[]) // the "index" command break; case 'p': prefix = strdup(optarg); break; case '6': is_64 = 1; break; + case 'b': + block_size = strtol(optarg, &str, 10); + if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024; + else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024; + else if (*str == 'K' || *str == 'k') block_size *= 1024; default: return 1; } } if (optind + 1 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] \n\n"); + fprintf(stderr, "Usage: bwa index [options] \n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); + fprintf(stderr, " -b INT block size for the bwtsw algorithm (force -a bwtsw) [%d]\n", block_size); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); - fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); - fprintf(stderr, " according to the length of the genome.\n\n"); + fprintf(stderr, " `-a div' do not work not for long genomes.\n\n"); return 1; } if (prefix == 0) { @@ -242,7 +247,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command strcpy(str2, prefix); strcat(str2, ".bwt"); t = clock(); fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); - if (algo_type == 2) bwt_bwtgen(str, str2); + if (algo_type == 2) bwt_bwtgen2(str, str2, block_size); else if (algo_type == 1 || algo_type == 3) { bwt_t *bwt; bwt = bwt_pac2bwt(str, algo_type == 3); diff --git a/main.c b/main.c index b873b75..d212e24 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r789" +#define PACKAGE_VERSION "0.7.10-r807-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From bf7d1d46caedf120ec16a2ae95bfe783fe5f6ba2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Aug 2014 10:36:24 -0400 Subject: [PATCH 02/19] r808: a minor bug with the new index -b --- bwtindex.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bwtindex.c b/bwtindex.c index 18523da..2bfd667 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -208,6 +208,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024; else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024; else if (*str == 'K' || *str == 'k') block_size *= 1024; + break; default: return 1; } } @@ -217,7 +218,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command fprintf(stderr, "Usage: bwa index [options] \n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); - fprintf(stderr, " -b INT block size for the bwtsw algorithm (force -a bwtsw) [%d]\n", block_size); + fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); diff --git a/main.c b/main.c index d212e24..8240411 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r807-dirty" +#define PACKAGE_VERSION "0.7.10-r808-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From b5cba257c1bb5b3f050c32b1747961d3429aa39d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Aug 2014 11:59:27 -0400 Subject: [PATCH 03/19] r809: new strategy for the -a mode --- bwamem.c | 8 +++++++- bwamem.h | 3 +++ bwamem_extra.c | 15 ++++++++++++++- bwt.c | 16 ++++++++++++++-- bwt.h | 1 + fastmap.c | 26 +++++++++++++++++++++----- main.c | 2 +- 7 files changed, 61 insertions(+), 10 deletions(-) diff --git a/bwamem.c b/bwamem.c index e9d9304..f8c5695 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #ifdef HAVE_PTHREAD #include @@ -57,6 +58,9 @@ mem_opt_t *mem_opt_init() o->zdrop = 100; o->pen_unpaired = 17; o->pen_clip5 = o->pen_clip3 = 5; + + o->max_mem_intv = 0; + o->min_seed_len = 19; o->split_width = 10; o->max_occ = 500; @@ -115,7 +119,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co // first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { - x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); + x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length @@ -125,6 +129,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co } else ++x; } // second pass: find MEMs inside a long SMEM + if (opt->max_mem_intv > 0) goto sort_intv; old_n = a->mem.n; for (k = 0; k < old_n; ++k) { bwtintv_t *p = &a->mem.a[k]; @@ -136,6 +141,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } // sort +sort_intv: ks_introsort(mem_intv, a->mem.n, a->mem.a); } diff --git a/bwamem.h b/bwamem.h index 7b14ca8..0628bfb 100644 --- a/bwamem.h +++ b/bwamem.h @@ -29,6 +29,8 @@ typedef struct { int w; // band width int zdrop; // Z-dropoff + uint64_t max_mem_intv; + int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length @@ -97,6 +99,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); + void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv); const bwtintv_v *smem_next(smem_i *itr); mem_opt_t *mem_opt_init(void); diff --git a/bwamem_extra.c b/bwamem_extra.c index 59bb68e..add9ff5 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -1,3 +1,4 @@ +#include #include "bwa.h" #include "bwamem.h" #include "bntseq.h" @@ -11,6 +12,8 @@ struct __smem_i { const bwt_t *bwt; const uint8_t *query; int start, len; + int min_intv, max_len; + uint64_t max_intv; bwtintv_v *matches; // matches; to be returned by smem_next() bwtintv_v *sub; // sub-matches inside the longest match; temporary bwtintv_v *tmpvec[2]; // temporary arrays @@ -25,6 +28,9 @@ smem_i *smem_itr_init(const bwt_t *bwt) itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); itr->matches = calloc(1, sizeof(bwtintv_v)); itr->sub = calloc(1, sizeof(bwtintv_v)); + itr->min_intv = 1; + itr->max_len = INT_MAX; + itr->max_intv = 0; return itr; } @@ -44,6 +50,13 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } +void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv) +{ + itr->min_intv = min_intv; + itr->max_len = max_len; + itr->max_intv = max_intv; +} + const bwtintv_v *smem_next(smem_i *itr) { int i, max, max_i, ori_start; @@ -52,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr) while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM + itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; diff --git a/bwt.c b/bwt.c index c9bf6a3..a719f53 100644 --- a/bwt.c +++ b/bwt.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "utils.h" #include "bwt.h" #include "kvec.h" @@ -285,7 +286,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -307,6 +308,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + if (i - x > max_len && ik.x[2] < max_intv) break; } ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base @@ -323,8 +325,13 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; + int max_stop = 0; bwt_extend(bwt, p, ok, 1); - if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (ok[c].x[2] != p->x[2]) { // change of interval size + if (p->info - i - 1 > max_len && p->x[2] < max_intv) + max_stop = 1; + } + if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; @@ -346,6 +353,11 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, return ret; } +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec); +} + /************************* * Read/write BWT and SA * *************************/ diff --git a/bwt.h b/bwt.h index c862352..7ab49bc 100644 --- a/bwt.h +++ b/bwt.h @@ -119,6 +119,7 @@ extern "C" { * Return the end of the longest exact match starting from _x_. */ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); // SMEM iterator interface diff --git a/fastmap.c b/fastmap.c index 479e878..4f5a07c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "bwa.h" @@ -53,7 +54,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -81,6 +82,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; + else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt0.mapQ_coef_len = 1; @@ -133,6 +135,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); @@ -141,12 +144,13 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -e discard full-length exact matches\n"); + fprintf(stderr, "\nScoring options:\n\n"); fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); - fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n"); fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n"); @@ -264,7 +268,8 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX; + uint64_t max_intv = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -272,16 +277,26 @@ int main_fastmap(int argc, char *argv[]) const bwtintv_v *a; bwaidx_t *idx; - while ((c = getopt(argc, argv, "w:l:p")) >= 0) { + while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) { switch (c) { case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + case 'i': min_intv = atoi(optarg); break; + case 'I': max_intv = atol(optarg); break; + case 'L': max_len = atoi(optarg); break; default: return 1; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa fastmap [options] \n\n"); + fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len); + fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth); + fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv); + fprintf(stderr, " -l INT max MEM length [%d]\n", max_len); + fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv); + fprintf(stderr, "\n"); return 1; } @@ -289,6 +304,7 @@ int main_fastmap(int argc, char *argv[]) seq = kseq_init(fp); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; itr = smem_itr_init(idx->bwt); + smem_config(itr, min_intv, max_len, max_intv); while (kseq_read(seq) >= 0) { err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { diff --git a/main.c b/main.c index 8240411..d82f50c 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r808-dirty" +#define PACKAGE_VERSION "0.7.10-r809-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 1e611b235c8e37fd4d94e383543a2d8634deb3c1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Aug 2014 11:07:24 -0400 Subject: [PATCH 04/19] r810: add err_puts() puts() adds '\n', but fputs() does not. --- main.c | 2 +- utils.c | 11 +++++++++++ utils.h | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/main.c b/main.c index d82f50c..4001a4f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r809-dirty" +#define PACKAGE_VERSION "0.7.10-r810-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/utils.c b/utils.c index 00be7f0..2983261 100644 --- a/utils.c +++ b/utils.c @@ -219,6 +219,17 @@ int err_fputs(const char *s, FILE *stream) return ret; } +int err_puts(const char *s) +{ + int ret = puts(s); + if (EOF == ret) + { + _err_fatal_simple("puts", strerror(errno)); + } + + return ret; +} + int err_fflush(FILE *stream) { int ret = fflush(stream); diff --git a/utils.h b/utils.h index 5ef6ac4..11966b8 100644 --- a/utils.h +++ b/utils.h @@ -80,7 +80,7 @@ extern "C" { int err_fputc(int c, FILE *stream); #define err_putchar(C) err_fputc((C), stdout) int err_fputs(const char *s, FILE *stream); -#define err_puts(S) err_fputs((S), stdout) + int err_puts(const char *s); int err_fflush(FILE *stream); int err_fclose(FILE *stream); int err_gzclose(gzFile file); From 35ac99b4f77fd49f261388b90c04f30d3f3fd26e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 29 Aug 2014 10:51:23 -0400 Subject: [PATCH 05/19] r815: optionally output ref fasta header Also fixed a bug in reading .ann files --- bntseq.c | 6 +++--- bwamem.c | 20 ++++++++++++++------ bwamem.h | 1 + bwamem_pair.c | 6 +++--- fastmap.c | 4 +++- main.c | 2 +- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/bntseq.c b/bntseq.c index eddae84..548a7fd 100644 --- a/bntseq.c +++ b/bntseq.c @@ -94,7 +94,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix) bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) { - char str[1024]; + char str[8192]; FILE *fp; const char *fname; bntseq_t *bns; @@ -117,14 +117,14 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c if (scanres != 2) goto badread; p->name = strdup(str); // read fasta comments - while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; while (c != '\n' && c != EOF) c = fgetc(fp); if (c == EOF) { scanres = EOF; goto badread; } *q = 0; - if (q - str > 1) p->anno = strdup(str + 1); // skip leading space + if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space else p->anno = strdup(""); // read the rest scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); diff --git a/bwamem.c b/bwamem.c index f8c5695..8377122 100644 --- a/bwamem.c +++ b/bwamem.c @@ -759,7 +759,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar) return l; } -void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all) +void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) { int i, l_name; mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert @@ -788,7 +788,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { int c = p->cigar[i]&0xf; - if (!softclip_all && (c == 3 || c == 4)) + if (!(opt->flag&MEM_F_SOFTCLIP) && (c == 3 || c == 4)) c = which? 4 : 3; // use hard clipping for supplementary alignments kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); } @@ -816,7 +816,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputsn("*\t*", 3, str); } else if (!p->is_rev) { // the forward strand int i, qb = 0, qe = s->l_seq; - if (p->n_cigar && which && !softclip_all) { // have cigar && not the primary alignment && not softclip all + if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) { // have cigar && not the primary alignment && not softclip all if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4; } @@ -830,7 +830,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } else kputc('*', str); } else { // the reverse strand int i, qb = 0, qe = s->l_seq; - if (p->n_cigar && which && !softclip_all) { + if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) { if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; } @@ -875,6 +875,14 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) { + int tmp; + kputsn("\tXR:Z:", 6, str); + tmp = str->l; + kputs(bns->anns[p->rid].anno, str); + for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE + if (str->s[i] == '\t') str->s[i] = ' '; + } kputc('\n', str); } @@ -941,10 +949,10 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; - mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP); + mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); } else { for (k = 0; k < aa.n; ++k) - mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP); + mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); free(aa.a); } diff --git a/bwamem.h b/bwamem.h index 0628bfb..61bb335 100644 --- a/bwamem.h +++ b/bwamem.h @@ -18,6 +18,7 @@ typedef struct __smem_i smem_i; #define MEM_F_NO_RESCUE 0x20 #define MEM_F_SELF_OVLP 0x40 #define MEM_F_ALN_REG 0x80 +#define MEM_F_REF_HDR 0x100 #define MEM_F_SOFTCLIP 0x200 typedef struct { diff --git a/bwamem_pair.c b/bwamem_pair.c index a3aeb80..8fe2dd0 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -247,7 +247,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); - extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all); + extern void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; @@ -327,8 +327,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // write SAM h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0; h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0; - mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0; - mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s; + mem_aln2sam(opt, bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(opt, bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); // h[0].XA will be freed in the following block free(h[1].cigar); diff --git a/fastmap.c b/fastmap.c index 4f5a07c..c141383 100644 --- a/fastmap.c +++ b/fastmap.c @@ -54,7 +54,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -71,6 +71,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; else if (c == 'F') opt->flag |= MEM_F_ALN_REG; else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP; + else if (c == 'V') opt->flag |= MEM_F_REF_HDR; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); @@ -163,6 +164,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -h INT if there are 80%% of the max score, output all in XA [%d]\n", opt->max_hits); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -V output the reference FASTA header in the XR tag\n"); fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); diff --git a/main.c b/main.c index 4001a4f..f5e65c1 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r810-dirty" +#define PACKAGE_VERSION "0.7.10-r815-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From e4ad616d43c890b3c78fde322187b0634cdb087d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 5 Sep 2014 12:49:50 -0400 Subject: [PATCH 06/19] r816: read .alt file (not tested) --- bntseq.c | 29 +++++++++++++++++++++++++++-- bntseq.h | 1 + main.c | 2 +- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/bntseq.c b/bntseq.c index 548a7fd..93cdba1 100644 --- a/bntseq.c +++ b/bntseq.c @@ -37,6 +37,9 @@ #include "kseq.h" KSEQ_DECLARE(gzFile) +#include "khash.h" +KHASH_MAP_INIT_STR(str, int) + #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif @@ -165,11 +168,33 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c bntseq_t *bns_restore(const char *prefix) { - char ann_filename[1024], amb_filename[1024], pac_filename[1024]; + char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024]; + FILE *fp; + bntseq_t *bns; strcat(strcpy(ann_filename, prefix), ".ann"); strcat(strcpy(amb_filename, prefix), ".amb"); strcat(strcpy(pac_filename, prefix), ".pac"); - return bns_restore_core(ann_filename, amb_filename, pac_filename); + bns = bns_restore_core(ann_filename, amb_filename, pac_filename); + if (bns == 0) return 0; + if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present + char str[1024]; + khash_t(str) *h; + int i, absent; + khint_t k; + h = kh_init(str); + for (i = 0; i < bns->n_seqs; ++i) { + k = kh_put(str, h, bns->anns[i].name, &absent); + kh_val(h, k) = i; + } + while (fscanf(fp, "%s", str) == 1) { + k = kh_get(str, h, str); + if (k != kh_end(h)) + bns->anns[kh_val(h, k)].is_alt = 1; + } + kh_destroy(str, h); + fclose(fp); + } + return bns; } void bns_destroy(bntseq_t *bns) diff --git a/bntseq.h b/bntseq.h index 6437cf6..03671d6 100644 --- a/bntseq.h +++ b/bntseq.h @@ -43,6 +43,7 @@ typedef struct { int32_t len; int32_t n_ambs; uint32_t gi; + int32_t is_alt; char *name, *anno; } bntann1_t; diff --git a/main.c b/main.c index f5e65c1..52e87e8 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r815-dirty" +#define PACKAGE_VERSION "0.7.10-r816-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 1934f0cf24263ffd3671a4f572114e48f082cba6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 5 Sep 2014 13:20:52 -0400 Subject: [PATCH 07/19] code backup --- bwamem.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index 8377122..0bbdd2b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1077,7 +1077,7 @@ typedef struct { const mem_pestat_t *pes; smem_aux_t **aux; bseq1_t *seqs; - mem_alnreg_v *regs; + mem_alnreg_v *regs, *regs_alt; int64_t n_processed; } worker_t; @@ -1120,16 +1120,17 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn { extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); worker_t w; - mem_alnreg_v *regs; mem_pestat_t pes[4]; double ctime, rtime; - int i; + int i, has_alt = 0; + for (i = 0; i < bns->n_seqs; ++i) + if (bns->anns[i].is_alt) has_alt = 1; ctime = cputime(); rtime = realtime(); global_bns = bns; - regs = malloc(n * sizeof(mem_alnreg_v)); + w.regs = malloc(n * sizeof(mem_alnreg_v)); w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; - w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; + w.seqs = seqs; w.n_processed = n_processed; w.pes = &pes[0]; w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); for (i = 0; i < opt->n_threads; ++i) @@ -1140,10 +1141,10 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn free(w.aux); if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 - else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data + else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data } kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment - free(regs); + free(w.regs); if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); } From ca61fe3ad5a9d4b7d2acf2a780e637b28f056fce Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 8 Sep 2014 08:52:02 -0400 Subject: [PATCH 08/19] code backup --- bwamem.c | 22 +++++++++++++++------- bwamem.h | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0bbdd2b..d6a7998 100644 --- a/bwamem.c +++ b/bwamem.c @@ -377,7 +377,7 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) -#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) +#define alnreg_hlt(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)))) KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) #define PATCH_MAX_R_BW 0.05f @@ -475,14 +475,17 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int return n - 1; } -void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) +void mem_mark_primary_se(const mem_opt_t *opt, int _n, mem_alnreg_t *a, int64_t id) { // similar to the loop in mem_chain_flt() - int i, k, tmp; + int i, k, tmp, n; kvec_t(int) z; - if (n == 0) return; + if (n_ == 0) return; kv_init(z); - for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); - ks_introsort(mem_ars_hash, n, a); + for (i = 0; i < n_; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + ks_introsort(mem_ars_hash, n_, a); + for (i = 0; i < n_; ++i) + if (a[i].is_alt) break; + if ((n = i) == 0) return; tmp = opt->a + opt->b; tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp; tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; @@ -995,6 +998,11 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); } } + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = ®s.a[i]; + if (p->rid >= 0 && bns->anns[p->rid].is_alt) + p->is_alt = 1; + } return regs; } @@ -1077,7 +1085,7 @@ typedef struct { const mem_pestat_t *pes; smem_aux_t **aux; bseq1_t *seqs; - mem_alnreg_v *regs, *regs_alt; + mem_alnreg_v *regs; int64_t n_processed; } worker_t; diff --git a/bwamem.h b/bwamem.h index 61bb335..8b8ec0d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -68,7 +68,7 @@ typedef struct { int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary int seedlen0; // length of the starting seed - int n_comp; // number of sub-alignments chained together + int n_comp:30, is_alt:2; // number of sub-alignments chained together float frac_rep; uint64_t hash; } mem_alnreg_t; From f4aedddee6d55dffad46060f5bfd8099f8d22024 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 8 Sep 2014 11:32:48 -0400 Subject: [PATCH 09/19] r819: bugfix - added too many sub-SMEMs --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 8377122..4fd6b28 100644 --- a/bwamem.c +++ b/bwamem.c @@ -137,7 +137,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co if (end - start < split_len || p->x[2] > opt->split_width) continue; bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) - if ((a->mem1.a[i].info>>32) - (uint32_t)a->mem1.a[i].info >= opt->min_seed_len) + if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } // sort diff --git a/main.c b/main.c index f5e65c1..93bd84e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r815-dirty" +#define PACKAGE_VERSION "0.7.10-r819-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 8d2b93156b34b05e97913a99f69980fa94c4b533 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 12 Sep 2014 10:35:49 -0400 Subject: [PATCH 10/19] r821: more relax on containing seeds --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4fd6b28..4df1acc 100644 --- a/bwamem.c +++ b/bwamem.c @@ -627,12 +627,12 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac // qd: distance ahead of the seed on query; rd: on reference qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed - w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width + w = max_gap < p->w? max_gap : p->w; // bounded by the band width if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit // similar to the previous four lines, but this time we look at the region behind qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); max_gap = cal_max_gap(opt, qd < rd? qd : rd); - w = max_gap < opt->w? max_gap : opt->w; + w = max_gap < p->w? max_gap : p->w; if (qd - rd < w && rd - qd < w) break; } if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln diff --git a/main.c b/main.c index 93bd84e..2915e78 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r819-dirty" +#define PACKAGE_VERSION "0.7.10-r821-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 015ab3f6c34484d5bed1610f5e49fb8316da9406 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 14 Sep 2014 16:41:14 -0400 Subject: [PATCH 11/19] r823: towards ALT support --- bwamem.c | 58 ++++++++++++++++++++++++++++++++++----------------- bwamem_pair.c | 20 +++++++++--------- main.c | 2 +- 3 files changed, 50 insertions(+), 30 deletions(-) diff --git a/bwamem.c b/bwamem.c index 66da875..784ec56 100644 --- a/bwamem.c +++ b/bwamem.c @@ -377,9 +377,12 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) -#define alnreg_hlt(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)))) +#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) +#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)))) +KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2) + #define PATCH_MAX_R_BW 0.05f #define PATCH_MIN_SC_RATIO 0.90f @@ -475,24 +478,19 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int return n - 1; } -void mem_mark_primary_se(const mem_opt_t *opt, int _n, mem_alnreg_t *a, int64_t id) +typedef kvec_t(int) int_v; + +static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z) { // similar to the loop in mem_chain_flt() - int i, k, tmp, n; - kvec_t(int) z; - if (n_ == 0) return; - kv_init(z); - for (i = 0; i < n_; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); - ks_introsort(mem_ars_hash, n_, a); - for (i = 0; i < n_; ++i) - if (a[i].is_alt) break; - if ((n = i) == 0) return; + int i, k, tmp; tmp = opt->a + opt->b; tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp; tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; - kv_push(int, z, 0); + z->n = 0; + kv_push(int, *z, 0); for (i = 1; i < n; ++i) { - for (k = 0; k < z.n; ++k) { - int j = z.a[k]; + for (k = 0; k < z->n; ++k) { + int j = z->a[k]; int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; if (e_min > b_max) { // have overlap @@ -504,10 +502,32 @@ void mem_mark_primary_se(const mem_opt_t *opt, int _n, mem_alnreg_t *a, int64_t } } } - if (k == z.n) kv_push(int, z, i); - else a[i].secondary = z.a[k]; + if (k == z->n) kv_push(int, *z, i); + else a[i].secondary = z->a[k]; + } +} + +int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) +{ + int i, n_pri; + int_v z = {0,0,0}; + if (n == 0) return 0; + for (i = n_pri = 0; i < n; ++i) { + a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + if (!a[i].is_alt) ++n_pri; + } + ks_introsort(mem_ars_hash, n, a); + mem_mark_primary_se_core(opt, n, a, &z); + for (i = 0; i < n; ++i) // this block is used to trigger potential bugs; if no bugs, it has no effects + if (a[i].is_alt && a[i].secondary >= 0) + a[i].secondary = INT_MAX; + if (n_pri > 0 || n_pri != n) { + ks_introsort(mem_ars_hash2, n, a); + for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; + mem_mark_primary_se_core(opt, n_pri, a, &z); } free(z.a); + return n_pri; } /********************************* @@ -920,7 +940,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) } // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible -void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); kstring_t str; @@ -936,7 +956,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; if (p->score < opt->T) continue; - if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); @@ -1114,7 +1134,7 @@ static void worker2(void *data, int i, int tid) mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); } else { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); - mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); } free(w->regs[i].a); } else { diff --git a/bwamem_pair.c b/bwamem_pair.c index 8fe2dd0..c79cc69 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -177,14 +177,14 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co return n; } -int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) +int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2]) { pair64_v v, u; int r, i, k, y[4], ret; // y[] keeps the last hit int64_t l_pac = bns->l_pac; kv_init(v); kv_init(u); for (r = 0; r < 2; ++r) { // loop through read number - for (i = 0; i < a[r].n; ++i) { + for (i = 0; i < n_pri[r]; ++i) { pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position @@ -244,13 +244,13 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { - extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); - extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); + extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); extern void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); - int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; + int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2]; kstring_t str; mem_aln_t h[2]; @@ -267,11 +267,11 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } - mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); - mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); + n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); + n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { + if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) { int is_multi[2], q_pe, score_un, q_se[2]; char **XA[2]; // check if an end has multiple hits even after mate-SW @@ -354,8 +354,8 @@ no_pairing: d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; } - mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); - mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); + mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); free(h[1].cigar); return n; diff --git a/main.c b/main.c index 2915e78..4c7a85e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r821-dirty" +#define PACKAGE_VERSION "0.7.10-r823-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From aee53f1334193943f84910bcde2fd89fa6d722ca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 00:29:05 -0400 Subject: [PATCH 12/19] r824: ALT mapping seems working --- bwamem.c | 9 +++++---- bwamem_extra.c | 4 ++-- main.c | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index 784ec56..6868dba 100644 --- a/bwamem.c +++ b/bwamem.c @@ -157,7 +157,7 @@ typedef struct { typedef struct { int n, m, first, rid; - uint32_t w:30, kept:2; + uint32_t w:29, kept:2, is_alt:1; float frac_rep; int64_t pos; mem_seed_t *seeds; @@ -278,6 +278,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; tmp.rid = rid; + tmp.is_alt = !!bns->anns[rid].is_alt; kb_putp(chn, tree, &tmp); } } @@ -331,7 +332,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) int j = chains.a[k]; int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); - if (e_min > b_max) { // have overlap + if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary int li = chn_end(a[i]) - chn_beg(a[i]); int lj = chn_end(a[j]) - chn_beg(a[j]); int min_l = li < lj? li : lj; @@ -518,7 +519,7 @@ int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); - for (i = 0; i < n; ++i) // this block is used to trigger potential bugs; if no bugs, it has no effects + for (i = 0; i < n; ++i) // don't track the "parent" hit of ALT secondary hits if (a[i].is_alt && a[i].secondary >= 0) a[i].secondary = INT_MAX; if (n_pri > 0 || n_pri != n) { @@ -957,7 +958,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_aln_t *q; if (p->score < opt->T) continue; if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; + if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); assert(q->rid >= 0); // this should not happen with the new code diff --git a/bwamem_extra.c b/bwamem_extra.c index add9ff5..45289ea 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -128,7 +128,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac cnt = calloc(a->n, sizeof(int)); for (i = 0, tot = 0; i < a->n; ++i) { int j = a->a[i].secondary; - if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) + if (j >= 0 && j < INT_MAX && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) ++cnt[j], ++tot; } if (tot == 0) goto end_gen_alt; @@ -136,7 +136,7 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac for (i = 0; i < a->n; ++i) { mem_aln_t t; int j = a->a[i].secondary; - if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later + if (j < 0 || j == INT_MAX || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later if (cnt[j] > opt->max_hits) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); kputs(bns->anns[t.rid].name, &aln[j]); diff --git a/main.c b/main.c index 4c7a85e..a2543e4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r823-dirty" +#define PACKAGE_VERSION "0.7.10-r824-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From bd85af08abb09a81d12f0c351b3a6500bfe95042 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 12:13:04 -0400 Subject: [PATCH 13/19] r826: improved alt mapping for PE --- bwamem_pair.c | 6 +++--- main.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index c79cc69..f212427 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -276,9 +276,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co char **XA[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { - for (j = 1; j < a[i].n; ++j) + for (j = 1; j < n_pri[i]; ++j) if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; - is_multi[i] = j < a[i].n? 1 : 0; + is_multi[i] = j < n_pri[i]? 1 : 0; } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score // compute mapQ for the best SE hit @@ -316,7 +316,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int k = a[i].a[z[i]].secondary; if (k >= 0) { // switch secondary and primary assert(a[i].a[k].secondary < 0); - for (j = 0; j < a[i].n; ++j) + for (j = 0; j < n_pri[i]; ++j) if (a[i].a[j].secondary == k || j == k) a[i].a[j].secondary = z[i]; a[i].a[z[i]].secondary = -1; diff --git a/main.c b/main.c index a2543e4..4a38fbd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r824-dirty" +#define PACKAGE_VERSION "0.7.10-r826-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From b07587f8065597d2a3cdf8963a45b1e464b27466 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 16:07:51 -0400 Subject: [PATCH 14/19] r827: an alt hit as good as a pri hit as supp --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6868dba..ae9731a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -378,7 +378,7 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) -#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) +#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash)))) KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) #define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)))) diff --git a/main.c b/main.c index 4a38fbd..8c34b1b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r826-dirty" +#define PACKAGE_VERSION "0.7.10-r827-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 5d26ab0ee3e4b4e19f5ccc7a3a9ecc455075f110 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 23:22:05 -0400 Subject: [PATCH 15/19] r828: changed the default scoring for pacbio --- fastmap.c | 12 ++++++------ main.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fastmap.c b/fastmap.c index c141383..05a5954 100644 --- a/fastmap.c +++ b/fastmap.c @@ -153,8 +153,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); - fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n"); - fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n"); + fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0\n"); +// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -180,13 +180,13 @@ int main_mem(int argc, char *argv[]) if (mode) { if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { - if (!opt0.a) opt->a = 2, opt0.a = 1; + if (!opt0.a) opt->a = 1, opt0.a = 1; update_a(opt, &opt0); - if (!opt0.o_del) opt->o_del = 2; + if (!opt0.o_del) opt->o_del = 1; if (!opt0.e_del) opt->e_del = 1; - if (!opt0.o_ins) opt->o_ins = 2; + if (!opt0.o_ins) opt->o_ins = 1; if (!opt0.e_ins) opt->e_ins = 1; - if (!opt0.b) opt->b = 5; + if (!opt0.b) opt->b = 1; if (opt0.split_factor == 0.) opt->split_factor = 10.; if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { diff --git a/main.c b/main.c index 8c34b1b..3edac05 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r827-dirty" +#define PACKAGE_VERSION "0.7.10-r828-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 624687b072d0ddd40a745d91de35c19b4b54b62a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 23:33:22 -0400 Subject: [PATCH 16/19] r829: killed a harmless gcc warning --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index ae9731a..dbe039f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1151,7 +1151,7 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn worker_t w; mem_pestat_t pes[4]; double ctime, rtime; - int i, has_alt = 0; + int i; for (i = 0; i < bns->n_seqs; ++i) if (bns->anns[i].is_alt) has_alt = 1; diff --git a/main.c b/main.c index 3edac05..9ff48fc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r828-dirty" +#define PACKAGE_VERSION "0.7.10-r829-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 4b6eeb34c83c2d85a7e4a5b4d4c2fb9718290012 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 23:42:24 -0400 Subject: [PATCH 17/19] r830: optionally fixed chunk size --- bwamem.c | 2 -- fastmap.c | 8 +++++--- main.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index dbe039f..64c26fa 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1153,8 +1153,6 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn double ctime, rtime; int i; - for (i = 0; i < bns->n_seqs; ++i) - if (bns->anns[i].is_alt) has_alt = 1; ctime = cputime(); rtime = realtime(); global_bns = bns; w.regs = malloc(n * sizeof(mem_alnreg_v)); diff --git a/fastmap.c b/fastmap.c index 05a5954..6cdad68 100644 --- a/fastmap.c +++ b/fastmap.c @@ -39,6 +39,7 @@ int main_mem(int argc, char *argv[]) { mem_opt_t *opt, opt0; int fd, fd2, i, c, n, copy_comment = 0; + int fixed_chunk_size = -1, actual_chunk_size; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; @@ -54,7 +55,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -85,6 +86,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; else if (c == 'C') copy_comment = 1; + else if (c == 'K') fixed_chunk_size = atoi(optarg); else if (c == 'Q') { opt0.mapQ_coef_len = 1; opt->mapQ_coef_len = atoi(optarg); @@ -205,7 +207,6 @@ int main_mem(int argc, char *argv[]) return 1; // FIXME memory leak } } else update_a(opt, &opt0); -// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak @@ -234,7 +235,8 @@ int main_mem(int argc, char *argv[]) } if (!(opt->flag & MEM_F_ALN_REG)) bwa_print_sam_hdr(idx->bns, rg_line); - while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; + while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) { int64_t size = 0; if ((opt->flag & MEM_F_PE) && (n&1) == 1) { if (bwa_verbose >= 2) diff --git a/main.c b/main.c index 9ff48fc..0b09051 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r829-dirty" +#define PACKAGE_VERSION "0.7.10-r830-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From d013ea94b0b59979ba8bcd410f7285b98bd791db Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 23:45:17 -0400 Subject: [PATCH 18/19] Removed instruction on ALT alignment Favoring the new method --- README.md | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/README.md b/README.md index dff245e..be4bc99 100644 --- a/README.md +++ b/README.md @@ -130,38 +130,6 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment as well. BWA-MEM does not have this problem. -####6. How to map sequences to GRCh38 with ALT contigs? - -BWA-backtrack and BWA-MEM partially support mapping to a reference containing -ALT contigs that represent alternative alleles highly divergent from the -reference genome. - - # download the K8 executable required by bwa-helper.js - wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download - tar -jxf k8-0.2.1.tar.bz2 - - # download the ALT-to-GRCh38 alignment in the SAM format - wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download - - # download the GRCh38 sequences with ALT contigs - wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz - - # index and mapping - bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz - bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam - -Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the -read has 50 or fewer hits. For each SAM line containing the XA tag, -`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted -to the same chromosomal region, adjusts mapping quality and outputs all the -hits overlapping the reported hit. A read may be mapped to both the primary -assembly and one or more ALT contigs all with high mapping quality. - -Note that this procedure assumes reads are single-end and may miss hits to -highly repetitive regions as these hits will not be reported with option -`-h50`. `bwa-helper.js` is a prototype implementation not recommended for -production uses. - [1]: http://en.wikipedia.org/wiki/GNU_General_Public_License From 43ce69104d4b31e38135307fac8b1f9e5fc9363a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 15 Sep 2014 23:59:35 -0400 Subject: [PATCH 19/19] Removed PacBio read-to-read instruction --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index be4bc99..58322d4 100644 --- a/README.md +++ b/README.md @@ -87,16 +87,12 @@ settings: * Illumina paired-end reads no longer than ~70bp: bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai - bwa samse ref.fa reads.sai reads.fq > aln-pe.sam + bwa samsp ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam * PacBio subreads to a reference genome: bwa mem -x pacbio ref.fa reads.fq > aln.sam -* PacBio subreads to themselves (the output is not SAM): - - bwa mem -x pbread reads.fq reads.fq > overlap.pas - BWA-MEM is recommended for query sequences longer than ~70bp for a variety of error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with errors given longer query sequences as the chance of missing all seeds is small.