diff --git a/Makefile b/Makefile index e4b073f..63acf1c 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o fastmap.o + bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwtsw2.h b/bwtsw2.h index bd6d219..3272929 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -33,6 +33,11 @@ typedef struct { uint8_t *aln_mem; } bsw2global_t; +typedef struct { + int l, tid; + char *name, *seq, *qual, *sam; +} bsw2seq1_t; + #ifdef __cplusplus extern "C" { #endif @@ -45,6 +50,8 @@ extern "C" { bsw2global_t *bsw2_global_init(); void bsw2_global_destroy(bsw2global_t *_pool); + void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + #ifdef __cplusplus } #endif diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 2d91b2d..e992aaf 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -328,11 +328,6 @@ static void flag_fr(bwtsw2_t *b[2]) } } -typedef struct { - int l, tid; - char *name, *seq, *qual, *sam; -} bsw2seq1_t; - typedef struct { int n, max; bsw2seq1_t *seq; @@ -559,6 +554,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const free(seq[0]); bsw2_destroy(b[0]); } + bwtsw2_pair(pac, _seq->n, _seq->seq, buf); for (x = 0; x < _seq->n; ++x) { print_hits(bns, &opt, &_seq->seq[x], buf[x]); bsw2_destroy(buf[x]); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c new file mode 100644 index 0000000..cf88f8e --- /dev/null +++ b/bwtsw2_pair.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include "bwt.h" +#include "bntseq.h" +#include "bwtsw2.h" +#include "ksw.h" + +#define MAX_INS 20000 +#define MIN_RATIO 0.8 +#define OUTLIER_BOUND 2.0 +#define MAX_STDDEV 4.0 + +typedef struct { + int low, high; + double avg, std; +} bsw2pestat_t; + +bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) +{ + extern void ks_introsort_uint64_t(size_t n, uint64_t *a); + int i, k, x, p25, p50, p75, tmp, max_len = 0; + uint64_t *isize; + bsw2pestat_t r; + + isize = calloc(n, 8); + for (i = k = 0; i < n; i += 2) { + bsw2hit_t *t[2]; + int l; + if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; + if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough + if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough + l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + (t[1]->end - t[1]->beg) : t[1]->k - t[0]->k + (t[0]->end - t[0]->beg); + max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; + max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; + isize[k++] = l; + } + ks_introsort_uint64_t(k, isize); + p25 = isize[(int)(.25 * k + .499)]; + p50 = isize[(int)(.50 * k + .499)]; + p75 = isize[(int)(.75 * k + .499)]; + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + fprintf(stderr, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + fprintf(stderr, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); + for (i = x = 0, r.avg = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.avg += isize[i], ++x; + r.avg /= x; + for (i = 0, r.std = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.std += (isize[i] - r.avg) * (isize[i] - r.avg); + r.std = sqrt(r.std / x); + fprintf(stderr, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); + tmp = (int)(p25 - 3. * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + r.high = (int)(p75 + 3. * (p75 - p25) + .499); + if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499); + fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + return r; +} + +void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +{ + bsw2pestat_t pes; + pes = bwtsw2_stat(n, hits); +}