compute insert size distribution
This commit is contained in:
parent
a29b179045
commit
17eaac5a21
2
Makefile
2
Makefile
|
|
@ -7,7 +7,7 @@ OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \
|
|||
is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \
|
||||
bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o bamlite.o fastmap.o
|
||||
bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread
|
||||
|
|
|
|||
7
bwtsw2.h
7
bwtsw2.h
|
|
@ -33,6 +33,11 @@ typedef struct {
|
|||
uint8_t *aln_mem;
|
||||
} bsw2global_t;
|
||||
|
||||
typedef struct {
|
||||
int l, tid;
|
||||
char *name, *seq, *qual, *sam;
|
||||
} bsw2seq1_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -45,6 +50,8 @@ extern "C" {
|
|||
bsw2global_t *bsw2_global_init();
|
||||
void bsw2_global_destroy(bsw2global_t *_pool);
|
||||
|
||||
void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -328,11 +328,6 @@ static void flag_fr(bwtsw2_t *b[2])
|
|||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
int l, tid;
|
||||
char *name, *seq, *qual, *sam;
|
||||
} bsw2seq1_t;
|
||||
|
||||
typedef struct {
|
||||
int n, max;
|
||||
bsw2seq1_t *seq;
|
||||
|
|
@ -559,6 +554,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const
|
|||
free(seq[0]);
|
||||
bsw2_destroy(b[0]);
|
||||
}
|
||||
bwtsw2_pair(pac, _seq->n, _seq->seq, buf);
|
||||
for (x = 0; x < _seq->n; ++x) {
|
||||
print_hits(bns, &opt, &_seq->seq[x], buf[x]);
|
||||
bsw2_destroy(buf[x]);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,71 @@
|
|||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "bwt.h"
|
||||
#include "bntseq.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#define MAX_INS 20000
|
||||
#define MIN_RATIO 0.8
|
||||
#define OUTLIER_BOUND 2.0
|
||||
#define MAX_STDDEV 4.0
|
||||
|
||||
typedef struct {
|
||||
int low, high;
|
||||
double avg, std;
|
||||
} bsw2pestat_t;
|
||||
|
||||
bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf)
|
||||
{
|
||||
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
|
||||
int i, k, x, p25, p50, p75, tmp, max_len = 0;
|
||||
uint64_t *isize;
|
||||
bsw2pestat_t r;
|
||||
|
||||
isize = calloc(n, 8);
|
||||
for (i = k = 0; i < n; i += 2) {
|
||||
bsw2hit_t *t[2];
|
||||
int l;
|
||||
if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
|
||||
t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0];
|
||||
if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough
|
||||
if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough
|
||||
l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + (t[1]->end - t[1]->beg) : t[1]->k - t[0]->k + (t[0]->end - t[0]->beg);
|
||||
max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg;
|
||||
max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
|
||||
isize[k++] = l;
|
||||
}
|
||||
ks_introsort_uint64_t(k, isize);
|
||||
p25 = isize[(int)(.25 * k + .499)];
|
||||
p50 = isize[(int)(.50 * k + .499)];
|
||||
p75 = isize[(int)(.75 * k + .499)];
|
||||
tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
|
||||
r.low = tmp > max_len? tmp : max_len;
|
||||
r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
|
||||
fprintf(stderr, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
|
||||
fprintf(stderr, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high);
|
||||
for (i = x = 0, r.avg = 0; i < k; ++i)
|
||||
if (isize[i] >= r.low && isize[i] <= r.high)
|
||||
r.avg += isize[i], ++x;
|
||||
r.avg /= x;
|
||||
for (i = 0, r.std = 0; i < k; ++i)
|
||||
if (isize[i] >= r.low && isize[i] <= r.high)
|
||||
r.std += (isize[i] - r.avg) * (isize[i] - r.avg);
|
||||
r.std = sqrt(r.std / x);
|
||||
fprintf(stderr, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std);
|
||||
tmp = (int)(p25 - 3. * (p75 - p25) + .499);
|
||||
r.low = tmp > max_len? tmp : max_len;
|
||||
r.high = (int)(p75 + 3. * (p75 - p25) + .499);
|
||||
if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499);
|
||||
r.low = tmp > max_len? tmp : max_len;
|
||||
if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
|
||||
fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
|
||||
return r;
|
||||
}
|
||||
|
||||
void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
|
||||
{
|
||||
bsw2pestat_t pes;
|
||||
pes = bwtsw2_stat(n, hits);
|
||||
}
|
||||
Loading…
Reference in New Issue