From cd818687ac4ba6ccb20fa12ed30136f5601b9cc4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 17 Apr 2012 20:43:43 -0400 Subject: [PATCH] r115: added -I and -S to bwasw --- bwtsw2.h | 5 +++-- bwtsw2_aux.c | 3 ++- bwtsw2_main.c | 8 ++++++-- bwtsw2_pair.c | 12 +++++++----- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 89615b5..0a1b860 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -12,8 +12,9 @@ #define BSW2_FLAG_RESCUED 0x800 typedef struct { - int a, b, q, r, t, qr, bw; - int z, is, t_seeds, hard_clip, multi_2nd; + int skip_sw:16, hard_clip:16; + int a, b, q, r, t, qr, bw, max_ins; + int z, is, t_seeds, multi_2nd; float mask_level, coef; int n_threads, chunk_size; } bsw2opt_t; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 2cce142..710051d 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -50,7 +50,8 @@ bsw2opt_t *bsw2_init_opt() bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; - o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; + o->max_ins = 20000; + o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; return o; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 041e8ae..50355fe 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -18,7 +18,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:M")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -35,6 +35,8 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'M': opt->multi_2nd = 1; break; case 'H': opt->hard_clip = 1; break; case 'f': xreopen(optarg, "w", stdout); break; + case 'I': opt->max_ins = atoi(optarg); break; + case 'S': opt->skip_sw = 1; break; } } opt->qr = opt->q + opt->r; @@ -50,9 +52,11 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); fprintf(stderr, "\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); fprintf(stderr, " -M mark multi-part alignments as secondary\n"); + fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); + fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); fprintf(stderr, "\n"); fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 5e1ec7c..a6f4d80 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -12,7 +12,6 @@ #include "stdaln.h" #endif -#define MAX_INS 20000 #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 #define MAX_STDDEV 4.0 @@ -23,7 +22,7 @@ typedef struct { double avg, std; } bsw2pestat_t; -bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg) +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; @@ -40,6 +39,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg) if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; + if (l >= max_ins) continue; // skip pairs with excessively large insert max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; isize[k++] = l; @@ -186,7 +186,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b int8_t g_mat[25]; kstring_t msg; memset(&msg, 0, sizeof(kstring_t)); - pes = bsw2_stat(n, hits, &msg); + pes = bsw2_stat(n, hits, &msg, opt->max_ins); for (i = k = 0; i < 5; ++i) { for (j = 0; j < 4; ++j) g_mat[k++] = i == j? opt->a : -opt->b; @@ -207,8 +207,10 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit - if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); - if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); + if (!opt->skip_sw) { + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); + } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0 // the following enumerate all possibilities. It is tedious but necessary... if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; bwtsw2_t *p[2];