From 2087dc162f5ff5eb6c8da5d4bead13199013864f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Apr 2013 16:50:20 -0400 Subject: [PATCH 1/6] r377: increased unpaired penalty from 9 to 17 This leads to more aggressive pairing - more properly paired reads. I have found a few cases where, for example, read1 is umambiguously mapped to chr20 while its 100bp mate has a perfect match to another chr but has 3 mismatches and 1 deletion when it is paired with read1 on chr20. With longer reads, it seems that the chr20 hit is correct, although it is not obvious how this happened in evolution. --- bwa.1 | 4 ++-- bwamem.c | 2 +- main.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bwa.1 b/bwa.1 index 1edbf12..9d45a3d 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "15 March 2013" "bwa-0.7.3a" "Bioinformatics tools" +.TH bwa 1 "15 March 2013" "bwa-0.7.4" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -220,7 +220,7 @@ deducted. [5] Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as .RI scoreRead1+scoreRead2- INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these -two scores to determine whether we should force pairing. [9] +two scores to determine whether we should force pairing. [17] .TP .B -p Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details. diff --git a/bwamem.c b/bwamem.c index 182b57a..921b512 100644 --- a/bwamem.c +++ b/bwamem.c @@ -46,7 +46,7 @@ mem_opt_t *mem_opt_init() o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->T = 30; o->zdrop = 100; - o->pen_unpaired = 9; + o->pen_unpaired = 17; o->pen_clip = 5; o->min_seed_len = 19; o->split_width = 10; diff --git a/main.c b/main.c index 8c1343c..ea92d81 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r376-beta" +#define PACKAGE_VERSION "0.7.3-r377-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From be11e27e126fd680835eb5287c5d66c9f0d4a32d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 12:00:37 -0400 Subject: [PATCH 2/6] r378: bugfix - wrong CIGAR This is actually caused by a bug in SSE2-SW, where the query begin may be smaller than the true one if there is an exact tandem repeat. --- bwape.c | 6 +++--- ksw.c | 4 +++- main.c | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bwape.c b/bwape.c index 9fd12b1..92161a5 100644 --- a/bwape.c +++ b/bwape.c @@ -404,7 +404,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int xtra; + int xtra, gscore; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); @@ -422,12 +422,12 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u // do alignment xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); - ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment free(cigar); free(ref_seq); *n_cigar = 0; return 0; } diff --git a/ksw.c b/ksw.c index 2daf809..a786a2b 100644 --- a/ksw.c +++ b/ksw.c @@ -201,10 +201,11 @@ end_loop16: r.score = gmax + q->shift < 255? gmax : 255; r.te = te; if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score - int max = -1, low, high, qlen = slen * 16; + int max = -1, tmp, low, high, qlen = slen * 16; uint8_t *t = (uint8_t*)Hmax; for (i = 0; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; + else ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; //printf("%d,%d\n", max, gmax); if (b) { i = (r.score + q->max - 1) / q->max; @@ -306,6 +307,7 @@ end_loop8: uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + else ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; diff --git a/main.c b/main.c index ea92d81..75e4e6b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r377-beta" +#define PACKAGE_VERSION "0.7.3-r378-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From f0c94d80d1bc1639549aa272c1d1950668f19c48 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 12:04:00 -0400 Subject: [PATCH 3/6] r379: fixed compiling error --- ksw.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ksw.c b/ksw.c index a786a2b..0a9b40e 100644 --- a/ksw.c +++ b/ksw.c @@ -205,7 +205,7 @@ end_loop16: uint8_t *t = (uint8_t*)Hmax; for (i = 0; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; - else ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; + else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; //printf("%d,%d\n", max, gmax); if (b) { i = (r.score + q->max - 1) / q->max; @@ -307,7 +307,7 @@ end_loop8: uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; - else ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; + else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; diff --git a/main.c b/main.c index 75e4e6b..1580451 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r378-beta" +#define PACKAGE_VERSION "0.7.3-r379-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From db7a98636f9907107c41a5c3e2f11a62a264758d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 12:04:44 -0400 Subject: [PATCH 4/6] r380: er... another compiling error --- ksw.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ksw.c b/ksw.c index 0a9b40e..7d9dfb3 100644 --- a/ksw.c +++ b/ksw.c @@ -303,7 +303,7 @@ end_loop8: } r.score = gmax; r.te = te; { - int max = -1, low, high, qlen = slen * 8; + int max = -1, tmp, low, high, qlen = slen * 8; uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; diff --git a/main.c b/main.c index 1580451..4fa2522 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r379-beta" +#define PACKAGE_VERSION "0.7.3-r380-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 3f8caef33c92d1dff61a547ad7bf0cd72ea776a2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 17:44:35 -0400 Subject: [PATCH 5/6] r381: fixed a bug when upper bound < max read len --- bwape.c | 5 +++++ main.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bwape.c b/bwape.c index 92161a5..6752a18 100644 --- a/bwape.c +++ b/bwape.c @@ -106,6 +106,11 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (ii->low > ii->high) { + fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n"); + free(isizes); + return -1; + } for (i = 0, x = n = 0; i < tot; ++i) if (isizes[i] >= ii->low && isizes[i] <= ii->high) ++n, x += isizes[i]; diff --git a/main.c b/main.c index 4fa2522..85e41b2 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r380-beta" +#define PACKAGE_VERSION "0.7.3-r381-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From f6ae0d4d0f2f16392d254f0621008299f53eac0f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 17:52:06 -0400 Subject: [PATCH 6/6] r382: similar treatment in bwa-sw (see r381) --- bwtsw2_pair.c | 8 +++++++- main.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index cad96e9..cdd822f 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -46,7 +46,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) p75 = isize[(int)(.75 * k + .499)]; ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); if (k < 8) { - ksprintf(msg, "[%s] fail to infer the insert size distribution.\n", __func__); + ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__); free(isize); r.failed = 1; return r; @@ -55,6 +55,12 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) r.low = tmp > max_len? tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (r.low > r.high) { + ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__); + free(isize); + r.failed = 1; + return r; + } ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); for (i = x = 0, r.avg = 0; i < k; ++i) diff --git a/main.c b/main.c index 85e41b2..aee4772 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r381-beta" +#define PACKAGE_VERSION "0.7.3-r382-beta" #endif int bwa_fa2pac(int argc, char *argv[]);