合并更改(run.sh)
This commit is contained in:
commit
aa31eac682
|
|
@ -1,4 +1,7 @@
|
||||||
|
test_out/
|
||||||
*.[oa]
|
*.[oa]
|
||||||
|
*.fa
|
||||||
|
*.txt
|
||||||
bwa
|
bwa
|
||||||
test
|
test
|
||||||
test64
|
test64
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
// 使用 IntelliSense 了解相关属性。
|
||||||
|
// 悬停以查看现有属性的描述。
|
||||||
|
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "bwa-mem",
|
||||||
|
"preLaunchTask": "Build",
|
||||||
|
"type": "cppdbg",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceRoot}/bwa",
|
||||||
|
"args": [
|
||||||
|
"mem",
|
||||||
|
"-t",
|
||||||
|
"64",
|
||||||
|
"-M",
|
||||||
|
"-R",
|
||||||
|
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||||
|
"/share_nas3/zyseq-release-v1.1.3/zyseq/wes/resource/reference/human_g1k_v37_decoy.fasta",
|
||||||
|
"/share_nas3/zyseq-release-v1.1.3/zyseq/data/n1.fq",
|
||||||
|
"/share_nas3/zyseq-release-v1.1.3/zyseq/data/n2.fq",
|
||||||
|
//"/public/home/zzh/data/reference/human_g1k_v37_decoy.fasta",
|
||||||
|
//"/public/home/zzh/data/fastq/n1.fq",
|
||||||
|
//"/public/home/zzh/data/fastq/n2.fq",
|
||||||
|
"-o",
|
||||||
|
"/dev/null"
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "index",
|
||||||
|
"preLaunchTask": "Build",
|
||||||
|
"type": "cppdbg",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceRoot}/bwa",
|
||||||
|
"args": [
|
||||||
|
"index",
|
||||||
|
"reference/human_g1k_v37_decoy.fasta"
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
{
|
||||||
|
"files.associations": {
|
||||||
|
"random": "c"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
// See https://go.microsoft.com/fwlink/?LinkId=733558
|
||||||
|
// for the documentation about the tasks.json format
|
||||||
|
"version": "2.0.0",
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"label": "Build",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "make clean; make -j 16",
|
||||||
|
"problemMatcher": [],
|
||||||
|
"group": {
|
||||||
|
"kind": "build",
|
||||||
|
"isDefault": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
20
Makefile
20
Makefile
|
|
@ -1,18 +1,22 @@
|
||||||
CC= gcc
|
CC= gcc
|
||||||
#CC= clang --analyze
|
#CC= clang --analyze
|
||||||
CFLAGS= -g -Wall -Wno-unused-function -O2
|
#CFLAGS= -g -Wall -Wno-unused-function -mavx2 -I../jemalloc-5.3.0/include
|
||||||
|
CFLAGS= -g -Wall -Wno-unused-function -O2 -mavx2 -I../jemalloc-5.3.0/include
|
||||||
|
#CFLAGS= -g -Wall -Wno-unused-function -O2 -mavx2
|
||||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
|
SHOW_PERF=-DSHOW_PERF
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF)
|
||||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
|
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
|
||||||
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
|
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o ksw_avx2.o
|
||||||
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||||
bwape.o kopen.o pemerge.o maxk.o \
|
bwape.o kopen.o pemerge.o maxk.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||||
PROG= bwa
|
PROG= bwa
|
||||||
INCLUDES=
|
INCLUDES=
|
||||||
LIBS= -lm -lz -lpthread
|
LIBS= -lm -lz -lpthread -ldl
|
||||||
|
#LIBS= -lm -lz -lpthread
|
||||||
SUBDIRS= .
|
SUBDIRS= .
|
||||||
|
|
||||||
ifeq ($(shell uname -s),Linux)
|
ifeq ($(shell uname -s),Linux)
|
||||||
|
|
@ -26,8 +30,12 @@ endif
|
||||||
|
|
||||||
all:$(PROG)
|
all:$(PROG)
|
||||||
|
|
||||||
bwa:libbwa.a $(AOBJS) main.o
|
# 用jemalloc代替malloc
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
bwa:libbwa.a ../jemalloc-5.3.0/lib/libjemalloc.a $(AOBJS) main.o
|
||||||
|
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) ../jemalloc-5.3.0/lib/libjemalloc.a main.o -o $@ -L. -lbwa $(LIBS)
|
||||||
|
# 原始malloc
|
||||||
|
#bwa:libbwa.a $(AOBJS) main.o
|
||||||
|
# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||||
|
|
||||||
bwamem-lite:libbwa.a example.o
|
bwamem-lite:libbwa.a example.o
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
|
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
|
||||||
|
|
|
||||||
467
bwa.c
467
bwa.c
|
|
@ -1,8 +1,8 @@
|
||||||
/* The MIT License
|
/* The MIT License
|
||||||
|
|
||||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||||
2009-2018 Broad Institute, Inc.
|
2009-2018 Broad Institute, Inc.
|
||||||
2008-2009 Genome Research Ltd. (GRL)
|
2008-2009 Genome Research Ltd. (GRL)
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
Permission is hereby granted, free of charge, to any person obtaining
|
||||||
a copy of this software and associated documentation files (the
|
a copy of this software and associated documentation files (the
|
||||||
|
|
@ -36,7 +36,12 @@
|
||||||
#include "kvec.h"
|
#include "kvec.h"
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
#ifdef USE_MALLOC_WRAPPERS
|
||||||
# include "malloc_wrap.h"
|
#include "malloc_wrap.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
extern int64_t get_mseconds();
|
||||||
|
extern int64_t time_ksw_global2;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int bwa_verbose = 3;
|
int bwa_verbose = 3;
|
||||||
|
|
@ -53,14 +58,15 @@ KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
static inline void trim_readno(kstring_t *s)
|
static inline void trim_readno(kstring_t *s)
|
||||||
{
|
{
|
||||||
if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
|
if (s->l > 2 && s->s[s->l - 2] == '/' && isdigit(s->s[s->l - 1]))
|
||||||
s->l -= 2, s->s[s->l] = 0;
|
s->l -= 2, s->s[s->l] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline char *dupkstring(const kstring_t *str, int dupempty)
|
static inline char *dupkstring(const kstring_t *str, int dupempty)
|
||||||
{
|
{
|
||||||
char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL;
|
char *s = (str->l > 0 || dupempty) ? malloc(str->l + 1) : NULL;
|
||||||
if (!s) return NULL;
|
if (!s)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
memcpy(s, str->s, str->l);
|
memcpy(s, str->s, str->l);
|
||||||
s[str->l] = '\0';
|
s[str->l] = '\0';
|
||||||
|
|
@ -78,32 +84,39 @@ static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
|
||||||
|
|
||||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
||||||
{
|
{
|
||||||
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
|
kseq_t *ks = (kseq_t *)ks1_, *ks2 = (kseq_t *)ks2_;
|
||||||
int size = 0, m, n;
|
int size = 0, m, n;
|
||||||
bseq1_t *seqs;
|
bseq1_t *seqs;
|
||||||
m = n = 0; seqs = 0;
|
m = n = 0;
|
||||||
while (kseq_read(ks) >= 0) {
|
seqs = 0;
|
||||||
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
|
while (kseq_read(ks) >= 0)
|
||||||
|
{
|
||||||
|
if (ks2 && kseq_read(ks2) < 0)
|
||||||
|
{ // the 2nd file has fewer reads
|
||||||
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (n >= m) {
|
if (n >= m)
|
||||||
m = m? m<<1 : 256;
|
{
|
||||||
|
m = m ? m << 1 : 256;
|
||||||
seqs = realloc(seqs, m * sizeof(bseq1_t));
|
seqs = realloc(seqs, m * sizeof(bseq1_t));
|
||||||
}
|
}
|
||||||
trim_readno(&ks->name);
|
trim_readno(&ks->name);
|
||||||
kseq2bseq1(ks, &seqs[n]);
|
kseq2bseq1(ks, &seqs[n]);
|
||||||
seqs[n].id = n;
|
seqs[n].id = n;
|
||||||
size += seqs[n++].l_seq;
|
size += seqs[n++].l_seq;
|
||||||
if (ks2) {
|
if (ks2)
|
||||||
|
{
|
||||||
trim_readno(&ks2->name);
|
trim_readno(&ks2->name);
|
||||||
kseq2bseq1(ks2, &seqs[n]);
|
kseq2bseq1(ks2, &seqs[n]);
|
||||||
seqs[n].id = n;
|
seqs[n].id = n;
|
||||||
size += seqs[n++].l_seq;
|
size += seqs[n++].l_seq;
|
||||||
}
|
}
|
||||||
if (size >= chunk_size && (n&1) == 0) break;
|
if (size >= chunk_size && (n & 1) == 0)
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (size == 0) { // test if the 2nd file is finished
|
if (size == 0)
|
||||||
|
{ // test if the 2nd file is finished
|
||||||
if (ks2 && kseq_read(ks2) >= 0)
|
if (ks2 && kseq_read(ks2) >= 0)
|
||||||
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
@ -114,17 +127,25 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
||||||
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
|
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
|
||||||
{
|
{
|
||||||
int i, has_last;
|
int i, has_last;
|
||||||
kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
|
kvec_t(bseq1_t) a[2] = {{0, 0, 0}, {0, 0, 0}};
|
||||||
for (i = 1, has_last = 1; i < n; ++i) {
|
for (i = 1, has_last = 1; i < n; ++i)
|
||||||
if (has_last) {
|
{
|
||||||
if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
|
if (has_last)
|
||||||
kv_push(bseq1_t, a[1], seqs[i-1]);
|
{
|
||||||
|
if (strcmp(seqs[i].name, seqs[i - 1].name) == 0)
|
||||||
|
{
|
||||||
|
kv_push(bseq1_t, a[1], seqs[i - 1]);
|
||||||
kv_push(bseq1_t, a[1], seqs[i]);
|
kv_push(bseq1_t, a[1], seqs[i]);
|
||||||
has_last = 0;
|
has_last = 0;
|
||||||
} else kv_push(bseq1_t, a[0], seqs[i-1]);
|
}
|
||||||
} else has_last = 1;
|
else
|
||||||
|
kv_push(bseq1_t, a[0], seqs[i - 1]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
has_last = 1;
|
||||||
}
|
}
|
||||||
if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
|
if (has_last)
|
||||||
|
kv_push(bseq1_t, a[0], seqs[i - 1]);
|
||||||
sep[0] = a[0].a, m[0] = a[0].n;
|
sep[0] = a[0].a, m[0] = a[0].n;
|
||||||
sep[1] = a[1].a, m[1] = a[1].n;
|
sep[1] = a[1].a, m[1] = a[1].n;
|
||||||
}
|
}
|
||||||
|
|
@ -136,12 +157,14 @@ void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
|
||||||
void bwa_fill_scmat(int a, int b, int8_t mat[25])
|
void bwa_fill_scmat(int a, int b, int8_t mat[25])
|
||||||
{
|
{
|
||||||
int i, j, k;
|
int i, j, k;
|
||||||
for (i = k = 0; i < 4; ++i) {
|
for (i = k = 0; i < 4; ++i)
|
||||||
|
{
|
||||||
for (j = 0; j < 4; ++j)
|
for (j = 0; j < 4; ++j)
|
||||||
mat[k++] = i == j? a : -b;
|
mat[k++] = i == j ? a : -b;
|
||||||
mat[k++] = -1; // ambiguous base
|
mat[k++] = -1; // ambiguous base
|
||||||
}
|
}
|
||||||
for (j = 0; j < 5; ++j) mat[k++] = -1;
|
for (j = 0; j < 5; ++j)
|
||||||
|
mat[k++] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate CIGAR when the alignment end points are known
|
// Generate CIGAR when the alignment end points are known
|
||||||
|
|
@ -154,78 +177,119 @@ uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins,
|
||||||
kstring_t str;
|
kstring_t str;
|
||||||
const char *int2base;
|
const char *int2base;
|
||||||
|
|
||||||
if (n_cigar) *n_cigar = 0;
|
if (n_cigar)
|
||||||
if (NM) *NM = -1;
|
*n_cigar = 0;
|
||||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
if (NM)
|
||||||
|
*NM = -1;
|
||||||
|
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac))
|
||||||
|
return 0; // reject if negative length or bridging the forward and reverse strand
|
||||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||||
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
|
if (re - rb != rlen)
|
||||||
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
|
goto ret_gen_cigar; // possible if out of range
|
||||||
for (i = 0; i < l_query>>1; ++i)
|
if (rb >= l_pac)
|
||||||
|
{ // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
|
||||||
|
for (i = 0; i < l_query >> 1; ++i)
|
||||||
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
||||||
for (i = 0; i < rlen>>1; ++i)
|
for (i = 0; i < rlen >> 1; ++i)
|
||||||
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
||||||
}
|
}
|
||||||
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
|
if (l_query == re - rb && w_ == 0)
|
||||||
|
{ // no gap; no need to do DP
|
||||||
// UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
|
// UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
|
||||||
if (n_cigar) {
|
if (n_cigar)
|
||||||
|
{
|
||||||
cigar = malloc(4);
|
cigar = malloc(4);
|
||||||
cigar[0] = l_query<<4 | 0;
|
cigar[0] = l_query << 4 | 0;
|
||||||
*n_cigar = 1;
|
*n_cigar = 1;
|
||||||
}
|
}
|
||||||
for (i = 0, *score = 0; i < l_query; ++i)
|
for (i = 0, *score = 0; i < l_query; ++i)
|
||||||
*score += mat[rseq[i]*5 + query[i]];
|
*score += mat[rseq[i] * 5 + query[i]];
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
int w, max_gap, max_ins, max_del, min_w;
|
int w, max_gap, max_ins, max_del, min_w;
|
||||||
// set the band-width
|
// set the band-width
|
||||||
max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
|
max_ins = (int)((double)(((l_query + 1) >> 1) * mat[0] - o_ins) / e_ins + 1.);
|
||||||
max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
|
max_del = (int)((double)(((l_query + 1) >> 1) * mat[0] - o_del) / e_del + 1.);
|
||||||
max_gap = max_ins > max_del? max_ins : max_del;
|
max_gap = max_ins > max_del ? max_ins : max_del;
|
||||||
max_gap = max_gap > 1? max_gap : 1;
|
max_gap = max_gap > 1 ? max_gap : 1;
|
||||||
w = (max_gap + abs((int)rlen - l_query) + 1) >> 1;
|
w = (max_gap + abs((int)rlen - l_query) + 1) >> 1;
|
||||||
w = w < w_? w : w_;
|
w = w < w_ ? w : w_;
|
||||||
min_w = abs((int)rlen - l_query) + 3;
|
min_w = abs((int)rlen - l_query) + 3;
|
||||||
w = w > min_w? w : min_w;
|
w = w > min_w ? w : min_w;
|
||||||
// NW alignment
|
// NW alignment
|
||||||
if (bwa_verbose >= 4) {
|
if (bwa_verbose >= 4)
|
||||||
|
{
|
||||||
printf("* Global bandwidth: %d\n", w);
|
printf("* Global bandwidth: %d\n", w);
|
||||||
printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
printf("* Global ref: ");
|
||||||
printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
for (i = 0; i < rlen; ++i)
|
||||||
|
putchar("ACGTN"[(int)rseq[i]]);
|
||||||
|
putchar('\n');
|
||||||
|
printf("* Global query: ");
|
||||||
|
for (i = 0; i < l_query; ++i)
|
||||||
|
putchar("ACGTN"[(int)query[i]]);
|
||||||
|
putchar('\n');
|
||||||
}
|
}
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
|
*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_ksw_global2, tmp_diff);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
if (NM && n_cigar) {// compute NM and MD
|
if (NM && n_cigar)
|
||||||
|
{ // compute NM and MD
|
||||||
int k, x, y, u, n_mm = 0, n_gap = 0;
|
int k, x, y, u, n_mm = 0, n_gap = 0;
|
||||||
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
|
str.l = str.m = *n_cigar * 4;
|
||||||
int2base = rb < l_pac? "ACGTN" : "TGCAN";
|
str.s = (char *)cigar; // append MD to CIGAR
|
||||||
for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
|
int2base = rb < l_pac ? "ACGTN" : "TGCAN";
|
||||||
|
for (k = 0, x = y = u = 0; k < *n_cigar; ++k)
|
||||||
|
{
|
||||||
int op, len;
|
int op, len;
|
||||||
cigar = (uint32_t*)str.s;
|
cigar = (uint32_t *)str.s;
|
||||||
op = cigar[k]&0xf, len = cigar[k]>>4;
|
op = cigar[k] & 0xf, len = cigar[k] >> 4;
|
||||||
if (op == 0) { // match
|
if (op == 0)
|
||||||
for (i = 0; i < len; ++i) {
|
{ // match
|
||||||
if (query[x + i] != rseq[y + i]) {
|
for (i = 0; i < len; ++i)
|
||||||
|
{
|
||||||
|
if (query[x + i] != rseq[y + i])
|
||||||
|
{
|
||||||
kputw(u, &str);
|
kputw(u, &str);
|
||||||
kputc(int2base[rseq[y+i]], &str);
|
kputc(int2base[rseq[y + i]], &str);
|
||||||
++n_mm; u = 0;
|
++n_mm;
|
||||||
} else ++u;
|
u = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
++u;
|
||||||
}
|
}
|
||||||
x += len; y += len;
|
x += len;
|
||||||
} else if (op == 2) { // deletion
|
y += len;
|
||||||
if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR
|
}
|
||||||
kputw(u, &str); kputc('^', &str);
|
else if (op == 2)
|
||||||
|
{ // deletion
|
||||||
|
if (k > 0 && k < *n_cigar - 1)
|
||||||
|
{ // don't do the following if D is the first or the last CIGAR
|
||||||
|
kputw(u, &str);
|
||||||
|
kputc('^', &str);
|
||||||
for (i = 0; i < len; ++i)
|
for (i = 0; i < len; ++i)
|
||||||
kputc(int2base[rseq[y+i]], &str);
|
kputc(int2base[rseq[y + i]], &str);
|
||||||
u = 0; n_gap += len;
|
u = 0;
|
||||||
|
n_gap += len;
|
||||||
}
|
}
|
||||||
y += len;
|
y += len;
|
||||||
} else if (op == 1) x += len, n_gap += len; // insertion
|
}
|
||||||
|
else if (op == 1)
|
||||||
|
x += len, n_gap += len; // insertion
|
||||||
}
|
}
|
||||||
kputw(u, &str); kputc(0, &str);
|
kputw(u, &str);
|
||||||
|
kputc(0, &str);
|
||||||
*NM = n_mm + n_gap;
|
*NM = n_mm + n_gap;
|
||||||
cigar = (uint32_t*)str.s;
|
cigar = (uint32_t *)str.s;
|
||||||
}
|
}
|
||||||
if (rb >= l_pac) // reverse back query
|
if (rb >= l_pac) // reverse back query
|
||||||
for (i = 0; i < l_query>>1; ++i)
|
for (i = 0; i < l_query >> 1; ++i)
|
||||||
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
||||||
|
|
||||||
ret_gen_cigar:
|
ret_gen_cigar:
|
||||||
|
|
@ -251,16 +315,22 @@ char *bwa_idx_infer_prefix(const char *hint)
|
||||||
prefix = malloc(l_hint + 3 + 4 + 1);
|
prefix = malloc(l_hint + 3 + 4 + 1);
|
||||||
strcpy(prefix, hint);
|
strcpy(prefix, hint);
|
||||||
strcpy(prefix + l_hint, ".64.bwt");
|
strcpy(prefix + l_hint, ".64.bwt");
|
||||||
if ((fp = fopen(prefix, "rb")) != 0) {
|
if ((fp = fopen(prefix, "rb")) != 0)
|
||||||
|
{
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
prefix[l_hint + 3] = 0;
|
prefix[l_hint + 3] = 0;
|
||||||
return prefix;
|
return prefix;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
strcpy(prefix + l_hint, ".bwt");
|
strcpy(prefix + l_hint, ".bwt");
|
||||||
if ((fp = fopen(prefix, "rb")) == 0) {
|
if ((fp = fopen(prefix, "rb")) == 0)
|
||||||
|
{
|
||||||
free(prefix);
|
free(prefix);
|
||||||
return 0;
|
return 0;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
prefix[l_hint] = 0;
|
prefix[l_hint] = 0;
|
||||||
return prefix;
|
return prefix;
|
||||||
|
|
@ -273,16 +343,19 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
||||||
char *tmp, *prefix;
|
char *tmp, *prefix;
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
prefix = bwa_idx_infer_prefix(hint);
|
prefix = bwa_idx_infer_prefix(hint);
|
||||||
if (prefix == 0) {
|
if (prefix == 0)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
tmp = calloc(strlen(prefix) + 5, 1);
|
tmp = calloc(strlen(prefix) + 5, 1);
|
||||||
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
||||||
bwt = bwt_restore_bwt(tmp);
|
bwt = bwt_restore_bwt(tmp);
|
||||||
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
||||||
bwt_restore_sa(tmp, bwt);
|
bwt_restore_sa(tmp, bwt);
|
||||||
free(tmp); free(prefix);
|
free(tmp);
|
||||||
|
free(prefix);
|
||||||
return bwt;
|
return bwt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -291,22 +364,28 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
||||||
bwaidx_t *idx;
|
bwaidx_t *idx;
|
||||||
char *prefix;
|
char *prefix;
|
||||||
prefix = bwa_idx_infer_prefix(hint);
|
prefix = bwa_idx_infer_prefix(hint);
|
||||||
if (prefix == 0) {
|
if (prefix == 0)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
idx = calloc(1, sizeof(bwaidx_t));
|
idx = calloc(1, sizeof(bwaidx_t));
|
||||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
if (which & BWA_IDX_BWT)
|
||||||
if (which & BWA_IDX_BNS) {
|
idx->bwt = bwa_idx_load_bwt(hint);
|
||||||
|
if (which & BWA_IDX_BNS)
|
||||||
|
{
|
||||||
int i, c;
|
int i, c;
|
||||||
idx->bns = bns_restore(prefix);
|
idx->bns = bns_restore(prefix);
|
||||||
for (i = c = 0; i < idx->bns->n_seqs; ++i)
|
for (i = c = 0; i < idx->bns->n_seqs; ++i)
|
||||||
if (idx->bns->anns[i].is_alt) ++c;
|
if (idx->bns->anns[i].is_alt)
|
||||||
|
++c;
|
||||||
if (bwa_verbose >= 3)
|
if (bwa_verbose >= 3)
|
||||||
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
|
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
|
||||||
if (which & BWA_IDX_PAC) {
|
if (which & BWA_IDX_PAC)
|
||||||
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
{
|
||||||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
idx->pac = calloc(idx->bns->l_pac / 4 + 1, 1);
|
||||||
|
err_fread_noeof(idx->pac, 1, idx->bns->l_pac / 4 + 1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||||
err_fclose(idx->bns->fp_pac);
|
err_fclose(idx->bns->fp_pac);
|
||||||
idx->bns->fp_pac = 0;
|
idx->bns->fp_pac = 0;
|
||||||
}
|
}
|
||||||
|
|
@ -322,14 +401,24 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||||
|
|
||||||
void bwa_idx_destroy(bwaidx_t *idx)
|
void bwa_idx_destroy(bwaidx_t *idx)
|
||||||
{
|
{
|
||||||
if (idx == 0) return;
|
if (idx == 0)
|
||||||
if (idx->mem == 0) {
|
return;
|
||||||
if (idx->bwt) bwt_destroy(idx->bwt);
|
if (idx->mem == 0)
|
||||||
if (idx->bns) bns_destroy(idx->bns);
|
{
|
||||||
if (idx->pac) free(idx->pac);
|
if (idx->bwt)
|
||||||
} else {
|
bwt_destroy(idx->bwt);
|
||||||
free(idx->bwt); free(idx->bns->anns); free(idx->bns);
|
if (idx->bns)
|
||||||
if (!idx->is_shm) free(idx->mem);
|
bns_destroy(idx->bns);
|
||||||
|
if (idx->pac)
|
||||||
|
free(idx->pac);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
free(idx->bwt);
|
||||||
|
free(idx->bns->anns);
|
||||||
|
free(idx->bns);
|
||||||
|
if (!idx->is_shm)
|
||||||
|
free(idx->mem);
|
||||||
}
|
}
|
||||||
free(idx);
|
free(idx);
|
||||||
}
|
}
|
||||||
|
|
@ -340,22 +429,42 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
// generate idx->bwt
|
// generate idx->bwt
|
||||||
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
|
x = sizeof(bwt_t);
|
||||||
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
|
idx->bwt = malloc(x);
|
||||||
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
|
memcpy(idx->bwt, mem + k, x);
|
||||||
|
k += x;
|
||||||
|
x = idx->bwt->bwt_size * 4;
|
||||||
|
idx->bwt->bwt = (uint32_t *)(mem + k);
|
||||||
|
k += x;
|
||||||
|
x = idx->bwt->n_sa * sizeof(bwtint_t);
|
||||||
|
idx->bwt->sa = (bwtint_t *)(mem + k);
|
||||||
|
k += x;
|
||||||
|
|
||||||
// generate idx->bns and idx->pac
|
// generate idx->bns and idx->pac
|
||||||
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
|
x = sizeof(bntseq_t);
|
||||||
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
|
idx->bns = malloc(x);
|
||||||
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
|
memcpy(idx->bns, mem + k, x);
|
||||||
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
k += x;
|
||||||
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
|
x = idx->bns->n_holes * sizeof(bntamb1_t);
|
||||||
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
|
idx->bns->ambs = (bntamb1_t *)(mem + k);
|
||||||
|
k += x;
|
||||||
|
x = idx->bns->n_seqs * sizeof(bntann1_t);
|
||||||
|
idx->bns->anns = malloc(x);
|
||||||
|
memcpy(idx->bns->anns, mem + k, x);
|
||||||
|
k += x;
|
||||||
|
for (i = 0; i < idx->bns->n_seqs; ++i)
|
||||||
|
{
|
||||||
|
idx->bns->anns[i].name = (char *)(mem + k);
|
||||||
|
k += strlen(idx->bns->anns[i].name) + 1;
|
||||||
|
idx->bns->anns[i].anno = (char *)(mem + k);
|
||||||
|
k += strlen(idx->bns->anns[i].anno) + 1;
|
||||||
}
|
}
|
||||||
idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
|
idx->pac = (uint8_t *)(mem + k);
|
||||||
|
k += idx->bns->l_pac / 4 + 1;
|
||||||
assert(k == l_mem);
|
assert(k == l_mem);
|
||||||
|
|
||||||
idx->l_mem = k; idx->mem = mem;
|
idx->l_mem = k;
|
||||||
|
idx->mem = mem;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -367,35 +476,56 @@ int bwa_idx2mem(bwaidx_t *idx)
|
||||||
|
|
||||||
// copy idx->bwt
|
// copy idx->bwt
|
||||||
x = idx->bwt->bwt_size * 4;
|
x = idx->bwt->bwt_size * 4;
|
||||||
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
|
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x);
|
||||||
|
idx->bwt->bwt = 0;
|
||||||
memmove(mem + sizeof(bwt_t), mem, x);
|
memmove(mem + sizeof(bwt_t), mem, x);
|
||||||
memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
|
memcpy(mem, idx->bwt, sizeof(bwt_t));
|
||||||
x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
|
k = sizeof(bwt_t) + x;
|
||||||
|
x = idx->bwt->n_sa * sizeof(bwtint_t);
|
||||||
|
mem = realloc(mem, k + x);
|
||||||
|
memcpy(mem + k, idx->bwt->sa, x);
|
||||||
|
k += x;
|
||||||
free(idx->bwt->sa);
|
free(idx->bwt->sa);
|
||||||
free(idx->bwt); idx->bwt = 0;
|
free(idx->bwt);
|
||||||
|
idx->bwt = 0;
|
||||||
|
|
||||||
// copy idx->bns
|
// copy idx->bns
|
||||||
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
|
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
|
||||||
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
|
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
|
||||||
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
|
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
|
||||||
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
|
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
|
||||||
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
|
x = sizeof(bntseq_t);
|
||||||
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
|
memcpy(mem + k, idx->bns, x);
|
||||||
|
k += x;
|
||||||
|
x = idx->bns->n_holes * sizeof(bntamb1_t);
|
||||||
|
memcpy(mem + k, idx->bns->ambs, x);
|
||||||
|
k += x;
|
||||||
free(idx->bns->ambs);
|
free(idx->bns->ambs);
|
||||||
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
|
x = idx->bns->n_seqs * sizeof(bntann1_t);
|
||||||
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
memcpy(mem + k, idx->bns->anns, x);
|
||||||
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
|
k += x;
|
||||||
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
|
for (i = 0; i < idx->bns->n_seqs; ++i)
|
||||||
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
|
{
|
||||||
|
x = strlen(idx->bns->anns[i].name) + 1;
|
||||||
|
memcpy(mem + k, idx->bns->anns[i].name, x);
|
||||||
|
k += x;
|
||||||
|
x = strlen(idx->bns->anns[i].anno) + 1;
|
||||||
|
memcpy(mem + k, idx->bns->anns[i].anno, x);
|
||||||
|
k += x;
|
||||||
|
free(idx->bns->anns[i].name);
|
||||||
|
free(idx->bns->anns[i].anno);
|
||||||
}
|
}
|
||||||
free(idx->bns->anns);
|
free(idx->bns->anns);
|
||||||
|
|
||||||
// copy idx->pac
|
// copy idx->pac
|
||||||
x = idx->bns->l_pac/4+1;
|
x = idx->bns->l_pac / 4 + 1;
|
||||||
mem = realloc(mem, k + x);
|
mem = realloc(mem, k + x);
|
||||||
memcpy(mem + k, idx->pac, x); k += x;
|
memcpy(mem + k, idx->pac, x);
|
||||||
free(idx->bns); idx->bns = 0;
|
k += x;
|
||||||
free(idx->pac); idx->pac = 0;
|
free(idx->bns);
|
||||||
|
idx->bns = 0;
|
||||||
|
free(idx->pac);
|
||||||
|
idx->pac = 0;
|
||||||
|
|
||||||
return bwa_mem2idx(k, mem, idx);
|
return bwa_mem2idx(k, mem, idx);
|
||||||
}
|
}
|
||||||
|
|
@ -408,46 +538,66 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
|
||||||
{
|
{
|
||||||
int i, n_HD = 0, n_SQ = 0;
|
int i, n_HD = 0, n_SQ = 0;
|
||||||
extern char *bwa_pg;
|
extern char *bwa_pg;
|
||||||
|
|
||||||
if (hdr_line) {
|
if (hdr_line)
|
||||||
|
{
|
||||||
// check for HD line
|
// check for HD line
|
||||||
const char *p = hdr_line;
|
const char *p = hdr_line;
|
||||||
if ((p = strstr(p, "@HD")) != 0) {
|
if ((p = strstr(p, "@HD")) != 0)
|
||||||
|
{
|
||||||
++n_HD;
|
++n_HD;
|
||||||
}
|
}
|
||||||
// check for SQ lines
|
// check for SQ lines
|
||||||
p = hdr_line;
|
p = hdr_line;
|
||||||
while ((p = strstr(p, "@SQ\t")) != 0) {
|
while ((p = strstr(p, "@SQ\t")) != 0)
|
||||||
if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
|
{
|
||||||
|
if (p == hdr_line || *(p - 1) == '\n')
|
||||||
|
++n_SQ;
|
||||||
p += 4;
|
p += 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (n_SQ == 0) {
|
if (n_SQ == 0)
|
||||||
for (i = 0; i < bns->n_seqs; ++i) {
|
{
|
||||||
|
for (i = 0; i < bns->n_seqs; ++i)
|
||||||
|
{
|
||||||
err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len);
|
err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len);
|
||||||
if (bns->anns[i].is_alt) err_printf("\tAH:*\n");
|
if (bns->anns[i].is_alt)
|
||||||
else err_fputc('\n', stdout);
|
err_printf("\tAH:*\n");
|
||||||
|
else
|
||||||
|
err_fputc('\n', stdout);
|
||||||
}
|
}
|
||||||
} else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
|
}
|
||||||
|
else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
|
||||||
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
|
fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
|
||||||
if (n_HD == 0) {
|
if (n_HD == 0)
|
||||||
|
{
|
||||||
err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
|
err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
|
||||||
}
|
}
|
||||||
if (hdr_line) err_printf("%s\n", hdr_line);
|
if (hdr_line)
|
||||||
if (bwa_pg) err_printf("%s\n", bwa_pg);
|
err_printf("%s\n", hdr_line);
|
||||||
|
if (bwa_pg)
|
||||||
|
err_printf("%s\n", bwa_pg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static char *bwa_escape(char *s)
|
static char *bwa_escape(char *s)
|
||||||
{
|
{
|
||||||
char *p, *q;
|
char *p, *q;
|
||||||
for (p = q = s; *p; ++p) {
|
for (p = q = s; *p; ++p)
|
||||||
if (*p == '\\') {
|
{
|
||||||
|
if (*p == '\\')
|
||||||
|
{
|
||||||
++p;
|
++p;
|
||||||
if (*p == 't') *q++ = '\t';
|
if (*p == 't')
|
||||||
else if (*p == 'n') *q++ = '\n';
|
*q++ = '\t';
|
||||||
else if (*p == 'r') *q++ = '\r';
|
else if (*p == 'n')
|
||||||
else if (*p == '\\') *q++ = '\\';
|
*q++ = '\n';
|
||||||
} else *q++ = *p;
|
else if (*p == 'r')
|
||||||
|
*q++ = '\r';
|
||||||
|
else if (*p == '\\')
|
||||||
|
*q++ = '\\';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
*q++ = *p;
|
||||||
}
|
}
|
||||||
*q = '\0';
|
*q = '\0';
|
||||||
return s;
|
return s;
|
||||||
|
|
@ -457,24 +607,33 @@ char *bwa_set_rg(const char *s)
|
||||||
{
|
{
|
||||||
char *p, *q, *r, *rg_line = 0;
|
char *p, *q, *r, *rg_line = 0;
|
||||||
memset(bwa_rg_id, 0, 256);
|
memset(bwa_rg_id, 0, 256);
|
||||||
if (strstr(s, "@RG") != s) {
|
if (strstr(s, "@RG") != s)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
|
||||||
goto err_set_rg;
|
goto err_set_rg;
|
||||||
}
|
}
|
||||||
if (strstr(s, "\t") != NULL) {
|
if (strstr(s, "\t") != NULL)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
|
||||||
goto err_set_rg;
|
goto err_set_rg;
|
||||||
}
|
}
|
||||||
rg_line = strdup(s);
|
rg_line = strdup(s);
|
||||||
bwa_escape(rg_line);
|
bwa_escape(rg_line);
|
||||||
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
if ((p = strstr(rg_line, "\tID:")) == 0)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
|
||||||
goto err_set_rg;
|
goto err_set_rg;
|
||||||
}
|
}
|
||||||
p += 4;
|
p += 4;
|
||||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
for (q = p; *q && *q != '\t' && *q != '\n'; ++q)
|
||||||
if (q - p + 1 > 256) {
|
;
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
|
if (q - p + 1 > 256)
|
||||||
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
|
||||||
goto err_set_rg;
|
goto err_set_rg;
|
||||||
}
|
}
|
||||||
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||||
|
|
@ -489,13 +648,17 @@ err_set_rg:
|
||||||
char *bwa_insert_header(const char *s, char *hdr)
|
char *bwa_insert_header(const char *s, char *hdr)
|
||||||
{
|
{
|
||||||
int len = 0;
|
int len = 0;
|
||||||
if (s == 0 || s[0] != '@') return hdr;
|
if (s == 0 || s[0] != '@')
|
||||||
if (hdr) {
|
return hdr;
|
||||||
|
if (hdr)
|
||||||
|
{
|
||||||
len = strlen(hdr);
|
len = strlen(hdr);
|
||||||
hdr = realloc(hdr, len + strlen(s) + 2);
|
hdr = realloc(hdr, len + strlen(s) + 2);
|
||||||
hdr[len++] = '\n';
|
hdr[len++] = '\n';
|
||||||
strcpy(hdr + len, s);
|
strcpy(hdr + len, s);
|
||||||
} else hdr = strdup(s);
|
}
|
||||||
|
else
|
||||||
|
hdr = strdup(s);
|
||||||
bwa_escape(hdr + len);
|
bwa_escape(hdr + len);
|
||||||
return hdr;
|
return hdr;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
437
bwamem_pair.c
437
bwamem_pair.c
|
|
@ -1,8 +1,8 @@
|
||||||
/* The MIT License
|
/* The MIT License
|
||||||
|
|
||||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||||
2009-2018 Broad Institute, Inc.
|
2009-2018 Broad Institute, Inc.
|
||||||
2008-2009 Genome Research Ltd. (GRL)
|
2008-2009 Genome Research Ltd. (GRL)
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
Permission is hereby granted, free of charge, to any person obtaining
|
||||||
a copy of this software and associated documentation files (the
|
a copy of this software and associated documentation files (the
|
||||||
|
|
@ -35,38 +35,45 @@
|
||||||
#include "ksw.h"
|
#include "ksw.h"
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
#ifdef USE_MALLOC_WRAPPERS
|
||||||
# include "malloc_wrap.h"
|
#include "malloc_wrap.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
extern int64_t get_mseconds();
|
||||||
|
extern int64_t time_ksw_align2;
|
||||||
|
#endif
|
||||||
|
|
||||||
#define MIN_RATIO 0.8
|
#define MIN_RATIO 0.8
|
||||||
#define MIN_DIR_CNT 10
|
#define MIN_DIR_CNT 10
|
||||||
#define MIN_DIR_RATIO 0.05
|
#define MIN_DIR_RATIO 0.05
|
||||||
#define OUTLIER_BOUND 2.0
|
#define OUTLIER_BOUND 2.0
|
||||||
#define MAPPING_BOUND 3.0
|
#define MAPPING_BOUND 3.0
|
||||||
#define MAX_STDDEV 4.0
|
#define MAX_STDDEV 4.0
|
||||||
|
|
||||||
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
|
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
|
||||||
{
|
{
|
||||||
int64_t p2;
|
int64_t p2;
|
||||||
int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);
|
int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);
|
||||||
p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
|
p2 = r1 == r2 ? b2 : (l_pac << 1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
|
||||||
*dist = p2 > b1? p2 - b1 : b1 - p2;
|
*dist = p2 > b1 ? p2 - b1 : b1 - p2;
|
||||||
return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3);
|
return (r1 == r2 ? 0 : 1) ^ (p2 > b1 ? 0 : 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
|
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
|
||||||
{
|
{
|
||||||
int j;
|
int j;
|
||||||
for (j = 1; j < r->n; ++j) { // choose unique alignment
|
for (j = 1; j < r->n; ++j)
|
||||||
int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;
|
{ // choose unique alignment
|
||||||
int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;
|
int b_max = r->a[j].qb > r->a[0].qb ? r->a[j].qb : r->a[0].qb;
|
||||||
if (e_min > b_max) { // have overlap
|
int e_min = r->a[j].qe < r->a[0].qe ? r->a[j].qe : r->a[0].qe;
|
||||||
int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;
|
if (e_min > b_max)
|
||||||
if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap
|
{ // have overlap
|
||||||
|
int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb ? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;
|
||||||
|
if (e_min - b_max >= min_l * opt->mask_level)
|
||||||
|
break; // significant overlap
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
|
return j < r->n ? r->a[j].score : opt->min_seed_len * opt->a;
|
||||||
}
|
}
|
||||||
|
|
||||||
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
|
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
|
||||||
|
|
@ -75,36 +82,48 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
|
||||||
uint64_v isize[4];
|
uint64_v isize[4];
|
||||||
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
||||||
memset(isize, 0, sizeof(kvec_t(int)) * 4);
|
memset(isize, 0, sizeof(kvec_t(int)) * 4);
|
||||||
for (i = 0; i < n>>1; ++i) {
|
for (i = 0; i < n >> 1; ++i)
|
||||||
|
{
|
||||||
int dir;
|
int dir;
|
||||||
int64_t is;
|
int64_t is;
|
||||||
mem_alnreg_v *r[2];
|
mem_alnreg_v *r[2];
|
||||||
r[0] = (mem_alnreg_v*)®s[i<<1|0];
|
r[0] = (mem_alnreg_v *)®s[i << 1 | 0];
|
||||||
r[1] = (mem_alnreg_v*)®s[i<<1|1];
|
r[1] = (mem_alnreg_v *)®s[i << 1 | 1];
|
||||||
if (r[0]->n == 0 || r[1]->n == 0) continue;
|
if (r[0]->n == 0 || r[1]->n == 0)
|
||||||
if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
|
continue;
|
||||||
if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
|
if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score)
|
||||||
if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
|
continue;
|
||||||
|
if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score)
|
||||||
|
continue;
|
||||||
|
if (r[0]->a[0].rid != r[1]->a[0].rid)
|
||||||
|
continue; // not on the same chr
|
||||||
dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
|
dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
|
||||||
if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
|
if (is && is <= opt->max_ins)
|
||||||
|
kv_push(uint64_t, isize[dir], is);
|
||||||
}
|
}
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
|
if (bwa_verbose >= 3)
|
||||||
for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
|
fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
|
||||||
|
for (d = 0; d < 4; ++d)
|
||||||
|
{ // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
|
||||||
mem_pestat_t *r = &pes[d];
|
mem_pestat_t *r = &pes[d];
|
||||||
uint64_v *q = &isize[d];
|
uint64_v *q = &isize[d];
|
||||||
int p25, p50, p75, x;
|
int p25, p50, p75, x;
|
||||||
if (q->n < MIN_DIR_CNT) {
|
if (q->n < MIN_DIR_CNT)
|
||||||
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
|
{
|
||||||
|
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d >> 1 & 1], "FR"[d & 1]);
|
||||||
r->failed = 1;
|
r->failed = 1;
|
||||||
free(q->a);
|
free(q->a);
|
||||||
continue;
|
continue;
|
||||||
} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
|
}
|
||||||
|
else
|
||||||
|
fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d >> 1 & 1], "FR"[d & 1]);
|
||||||
ks_introsort_64(q->n, q->a);
|
ks_introsort_64(q->n, q->a);
|
||||||
p25 = q->a[(int)(.25 * q->n + .499)];
|
p25 = q->a[(int)(.25 * q->n + .499)];
|
||||||
p50 = q->a[(int)(.50 * q->n + .499)];
|
p50 = q->a[(int)(.50 * q->n + .499)];
|
||||||
p75 = q->a[(int)(.75 * q->n + .499)];
|
p75 = q->a[(int)(.75 * q->n + .499)];
|
||||||
r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
|
r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
|
||||||
if (r->low < 1) r->low = 1;
|
if (r->low < 1)
|
||||||
|
r->low = 1;
|
||||||
r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
|
r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
|
||||||
fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
|
fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
|
||||||
fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
|
fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
|
||||||
|
|
@ -117,20 +136,24 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
|
||||||
r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
|
r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
|
||||||
r->std = sqrt(r->std / x);
|
r->std = sqrt(r->std / x);
|
||||||
fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
|
fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
|
||||||
r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
|
r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
|
||||||
r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
|
r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
|
||||||
if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
|
if (r->low > r->avg - MAX_STDDEV * r->std)
|
||||||
if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
|
r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
|
||||||
if (r->low < 1) r->low = 1;
|
if (r->high < r->avg + MAX_STDDEV * r->std)
|
||||||
|
r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
|
||||||
|
if (r->low < 1)
|
||||||
|
r->low = 1;
|
||||||
fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
|
fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
|
||||||
free(q->a);
|
free(q->a);
|
||||||
}
|
}
|
||||||
for (d = 0, max = 0; d < 4; ++d)
|
for (d = 0, max = 0; d < 4; ++d)
|
||||||
max = max > isize[d].n? max : isize[d].n;
|
max = max > isize[d].n ? max : isize[d].n;
|
||||||
for (d = 0; d < 4; ++d)
|
for (d = 0; d < 4; ++d)
|
||||||
if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
|
if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO)
|
||||||
|
{
|
||||||
pes[d].failed = 1;
|
pes[d].failed = 1;
|
||||||
fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
|
fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d >> 1 & 1], "FR"[d & 1]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -140,66 +163,93 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
||||||
int64_t l_pac = bns->l_pac;
|
int64_t l_pac = bns->l_pac;
|
||||||
int i, r, skip[4], n = 0, rid;
|
int i, r, skip[4], n = 0, rid;
|
||||||
for (r = 0; r < 4; ++r)
|
for (r = 0; r < 4; ++r)
|
||||||
skip[r] = pes[r].failed? 1 : 0;
|
skip[r] = pes[r].failed ? 1 : 0;
|
||||||
for (i = 0; i < ma->n; ++i) { // check which orinentation has been found
|
for (i = 0; i < ma->n; ++i)
|
||||||
|
{ // check which orinentation has been found
|
||||||
int64_t dist;
|
int64_t dist;
|
||||||
r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
|
r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
|
||||||
if (dist >= pes[r].low && dist <= pes[r].high)
|
if (dist >= pes[r].low && dist <= pes[r].high)
|
||||||
skip[r] = 1;
|
skip[r] = 1;
|
||||||
}
|
}
|
||||||
if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW
|
if (skip[0] + skip[1] + skip[2] + skip[3] == 4)
|
||||||
for (r = 0; r < 4; ++r) {
|
return 0; // consistent pair exist; no need to perform SW
|
||||||
|
for (r = 0; r < 4; ++r)
|
||||||
|
{
|
||||||
int is_rev, is_larger;
|
int is_rev, is_larger;
|
||||||
uint8_t *seq, *rev = 0, *ref = 0;
|
uint8_t *seq, *rev = 0, *ref = 0;
|
||||||
int64_t rb, re;
|
int64_t rb, re;
|
||||||
if (skip[r]) continue;
|
if (skip[r])
|
||||||
is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
|
continue;
|
||||||
is_larger = !(r>>1); // whether the mate has larger coordinate
|
is_rev = (r >> 1 != (r & 1)); // whether to reverse complement the mate
|
||||||
if (is_rev) {
|
is_larger = !(r >> 1); // whether the mate has larger coordinate
|
||||||
|
if (is_rev)
|
||||||
|
{
|
||||||
rev = malloc(l_ms); // this is the reverse complement of $ms
|
rev = malloc(l_ms); // this is the reverse complement of $ms
|
||||||
for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
|
for (i = 0; i < l_ms; ++i)
|
||||||
|
rev[l_ms - 1 - i] = ms[i] < 4 ? 3 - ms[i] : 4;
|
||||||
seq = rev;
|
seq = rev;
|
||||||
} else seq = (uint8_t*)ms;
|
|
||||||
if (!is_rev) {
|
|
||||||
rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
|
|
||||||
re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
|
|
||||||
} else {
|
|
||||||
rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
|
|
||||||
re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
|
|
||||||
}
|
}
|
||||||
if (rb < 0) rb = 0;
|
else
|
||||||
if (re > l_pac<<1) re = l_pac<<1;
|
seq = (uint8_t *)ms;
|
||||||
if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
|
if (!is_rev)
|
||||||
if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening
|
{
|
||||||
|
rb = is_larger ? a->rb + pes[r].low : a->rb - pes[r].high;
|
||||||
|
re = (is_larger ? a->rb + pes[r].high : a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
rb = (is_larger ? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
|
||||||
|
re = is_larger ? a->rb + pes[r].high : a->rb - pes[r].low;
|
||||||
|
}
|
||||||
|
if (rb < 0)
|
||||||
|
rb = 0;
|
||||||
|
if (re > l_pac << 1)
|
||||||
|
re = l_pac << 1;
|
||||||
|
if (rb < re)
|
||||||
|
ref = bns_fetch_seq(bns, pac, &rb, (rb + re) >> 1, &re, &rid);
|
||||||
|
if (a->rid == rid && re - rb >= opt->min_seed_len)
|
||||||
|
{ // no funny things happening
|
||||||
kswr_t aln;
|
kswr_t aln;
|
||||||
mem_alnreg_t b;
|
mem_alnreg_t b;
|
||||||
int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
|
int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250 ? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
|
aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_ksw_align2, tmp_diff);
|
||||||
|
#endif
|
||||||
memset(&b, 0, sizeof(mem_alnreg_t));
|
memset(&b, 0, sizeof(mem_alnreg_t));
|
||||||
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
|
if (aln.score >= opt->min_seed_len && aln.qb >= 0)
|
||||||
|
{ // something goes wrong if aln.qb < 0
|
||||||
b.rid = a->rid;
|
b.rid = a->rid;
|
||||||
b.is_alt = a->is_alt;
|
b.is_alt = a->is_alt;
|
||||||
b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
|
b.qb = is_rev ? l_ms - (aln.qe + 1) : aln.qb;
|
||||||
b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
|
b.qe = is_rev ? l_ms - aln.qb : aln.qe + 1;
|
||||||
b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
|
b.rb = is_rev ? (l_pac << 1) - (rb + aln.te + 1) : rb + aln.tb;
|
||||||
b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1;
|
b.re = is_rev ? (l_pac << 1) - (rb + aln.tb) : rb + aln.te + 1;
|
||||||
b.score = aln.score;
|
b.score = aln.score;
|
||||||
b.csub = aln.score2;
|
b.csub = aln.score2;
|
||||||
b.secondary = -1;
|
b.secondary = -1;
|
||||||
b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
|
b.seedcov = (b.re - b.rb < b.qe - b.qb ? b.re - b.rb : b.qe - b.qb) >> 1;
|
||||||
// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
|
// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
|
||||||
kv_push(mem_alnreg_t, *ma, b); // make room for a new element
|
kv_push(mem_alnreg_t, *ma, b); // make room for a new element
|
||||||
// move b s.t. ma is sorted
|
// move b s.t. ma is sorted
|
||||||
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
for (i = 0; i < ma->n - 1; ++i) // find the insertion point
|
||||||
if (ma->a[i].score < b.score) break;
|
if (ma->a[i].score < b.score)
|
||||||
|
break;
|
||||||
tmp = i;
|
tmp = i;
|
||||||
for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
|
for (i = ma->n - 1; i > tmp; --i)
|
||||||
|
ma->a[i] = ma->a[i - 1];
|
||||||
ma->a[i] = b;
|
ma->a[i] = b;
|
||||||
}
|
}
|
||||||
++n;
|
++n;
|
||||||
}
|
}
|
||||||
if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
if (n)
|
||||||
if (rev) free(rev);
|
ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
|
||||||
|
if (rev)
|
||||||
|
free(rev);
|
||||||
free(ref);
|
free(ref);
|
||||||
}
|
}
|
||||||
return n;
|
return n;
|
||||||
|
|
@ -210,61 +260,79 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons
|
||||||
pair64_v v, u;
|
pair64_v v, u;
|
||||||
int r, i, k, y[4], ret; // y[] keeps the last hit
|
int r, i, k, y[4], ret; // y[] keeps the last hit
|
||||||
int64_t l_pac = bns->l_pac;
|
int64_t l_pac = bns->l_pac;
|
||||||
kv_init(v); kv_init(u);
|
kv_init(v);
|
||||||
for (r = 0; r < 2; ++r) { // loop through read number
|
kv_init(u);
|
||||||
for (i = 0; i < n_pri[r]; ++i) {
|
for (r = 0; r < 2; ++r)
|
||||||
|
{ // loop through read number
|
||||||
|
for (i = 0; i < n_pri[r]; ++i)
|
||||||
|
{
|
||||||
pair64_t key;
|
pair64_t key;
|
||||||
mem_alnreg_t *e = &a[r].a[i];
|
mem_alnreg_t *e = &a[r].a[i];
|
||||||
key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
|
key.x = e->rb < l_pac ? e->rb : (l_pac << 1) - 1 - e->rb; // forward position
|
||||||
key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset);
|
key.x = (uint64_t)e->rid << 32 | (key.x - bns->anns[e->rid].offset);
|
||||||
key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
|
key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac) << 1 | r;
|
||||||
kv_push(pair64_t, v, key);
|
kv_push(pair64_t, v, key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ks_introsort_128(v.n, v.a);
|
ks_introsort_128(v.n, v.a);
|
||||||
y[0] = y[1] = y[2] = y[3] = -1;
|
y[0] = y[1] = y[2] = y[3] = -1;
|
||||||
//for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
|
// for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
|
||||||
for (i = 0; i < v.n; ++i) {
|
for (i = 0; i < v.n; ++i)
|
||||||
for (r = 0; r < 2; ++r) { // loop through direction
|
{
|
||||||
int dir = r<<1 | (v.a[i].y>>1&1), which;
|
for (r = 0; r < 2; ++r)
|
||||||
if (pes[dir].failed) continue; // invalid orientation
|
{ // loop through direction
|
||||||
which = r<<1 | ((v.a[i].y&1)^1);
|
int dir = r << 1 | (v.a[i].y >> 1 & 1), which;
|
||||||
if (y[which] < 0) continue; // no previous hits
|
if (pes[dir].failed)
|
||||||
for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
|
continue; // invalid orientation
|
||||||
|
which = r << 1 | ((v.a[i].y & 1) ^ 1);
|
||||||
|
if (y[which] < 0)
|
||||||
|
continue; // no previous hits
|
||||||
|
for (k = y[which]; k >= 0; --k)
|
||||||
|
{ // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
|
||||||
int64_t dist;
|
int64_t dist;
|
||||||
int q;
|
int q;
|
||||||
double ns;
|
double ns;
|
||||||
pair64_t *p;
|
pair64_t *p;
|
||||||
if ((v.a[k].y&3) != which) continue;
|
if ((v.a[k].y & 3) != which)
|
||||||
|
continue;
|
||||||
dist = (int64_t)v.a[i].x - v.a[k].x;
|
dist = (int64_t)v.a[i].x - v.a[k].x;
|
||||||
//printf("%d: %lld\n", k, dist);
|
// printf("%d: %lld\n", k, dist);
|
||||||
if (dist > pes[dir].high) break;
|
if (dist > pes[dir].high)
|
||||||
if (dist < pes[dir].low) continue;
|
break;
|
||||||
|
if (dist < pes[dir].low)
|
||||||
|
continue;
|
||||||
ns = (dist - pes[dir].avg) / pes[dir].std;
|
ns = (dist - pes[dir].avg) / pes[dir].std;
|
||||||
q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4)
|
q = (int)((v.a[i].y >> 32) + (v.a[k].y >> 32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4)
|
||||||
if (q < 0) q = 0;
|
if (q < 0)
|
||||||
|
q = 0;
|
||||||
p = kv_pushp(pair64_t, u);
|
p = kv_pushp(pair64_t, u);
|
||||||
p->y = (uint64_t)k<<32 | i;
|
p->y = (uint64_t)k << 32 | i;
|
||||||
p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
|
p->x = (uint64_t)q << 32 | (hash_64(p->y ^ id << 8) & 0xffffffffU);
|
||||||
//printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
|
// printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
y[v.a[i].y&3] = i;
|
y[v.a[i].y & 3] = i;
|
||||||
}
|
}
|
||||||
if (u.n) { // found at least one proper pair
|
if (u.n)
|
||||||
|
{ // found at least one proper pair
|
||||||
int tmp = opt->a + opt->b;
|
int tmp = opt->a + opt->b;
|
||||||
tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del;
|
tmp = tmp > opt->o_del + opt->e_del ? tmp : opt->o_del + opt->e_del;
|
||||||
tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins;
|
tmp = tmp > opt->o_ins + opt->e_ins ? tmp : opt->o_ins + opt->e_ins;
|
||||||
ks_introsort_128(u.n, u.a);
|
ks_introsort_128(u.n, u.a);
|
||||||
i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
|
i = u.a[u.n - 1].y >> 32;
|
||||||
z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
|
k = u.a[u.n - 1].y << 32 >> 32;
|
||||||
z[v.a[k].y&1] = v.a[k].y<<32>>34;
|
z[v.a[i].y & 1] = v.a[i].y << 32 >> 34; // index of the best pair
|
||||||
ret = u.a[u.n-1].x >> 32;
|
z[v.a[k].y & 1] = v.a[k].y << 32 >> 34;
|
||||||
*sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
|
ret = u.a[u.n - 1].x >> 32;
|
||||||
|
*sub = u.n > 1 ? u.a[u.n - 2].x >> 32 : 0;
|
||||||
for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
|
for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
|
||||||
if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
|
if (*sub - (int)(u.a[i].x >> 32) <= tmp)
|
||||||
} else ret = 0, *sub = 0, *n_sub = 0;
|
++*n_sub;
|
||||||
free(u.a); free(v.a);
|
}
|
||||||
|
else
|
||||||
|
ret = 0, *sub = 0, *n_sub = 0;
|
||||||
|
free(u.a);
|
||||||
|
free(v.a);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -284,72 +352,94 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
||||||
kstring_t str;
|
kstring_t str;
|
||||||
mem_aln_t h[2], g[2], aa[2][2];
|
mem_aln_t h[2], g[2], aa[2][2];
|
||||||
|
|
||||||
str.l = str.m = 0; str.s = 0;
|
str.l = str.m = 0;
|
||||||
|
str.s = 0;
|
||||||
memset(h, 0, sizeof(mem_aln_t) * 2);
|
memset(h, 0, sizeof(mem_aln_t) * 2);
|
||||||
memset(g, 0, sizeof(mem_aln_t) * 2);
|
memset(g, 0, sizeof(mem_aln_t) * 2);
|
||||||
n_aa[0] = n_aa[1] = 0;
|
n_aa[0] = n_aa[1] = 0;
|
||||||
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
|
if (!(opt->flag & MEM_F_NO_RESCUE))
|
||||||
|
{ // then perform SW for the best alignment
|
||||||
mem_alnreg_v b[2];
|
mem_alnreg_v b[2];
|
||||||
kv_init(b[0]); kv_init(b[1]);
|
kv_init(b[0]);
|
||||||
|
kv_init(b[1]);
|
||||||
for (i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
for (j = 0; j < a[i].n; ++j)
|
for (j = 0; j < a[i].n; ++j)
|
||||||
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
|
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
|
||||||
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
|
||||||
for (i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
|
||||||
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
|
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t *)s[!i].seq, &a[!i]);
|
||||||
free(b[0].a); free(b[1].a);
|
free(b[0].a);
|
||||||
|
free(b[1].a);
|
||||||
}
|
}
|
||||||
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
|
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id << 1 | 0);
|
||||||
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
|
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id << 1 | 1);
|
||||||
if (opt->flag & MEM_F_PRIMARY5) {
|
if (opt->flag & MEM_F_PRIMARY5)
|
||||||
|
{
|
||||||
mem_reorder_primary5(opt->T, &a[0]);
|
mem_reorder_primary5(opt->T, &a[0]);
|
||||||
mem_reorder_primary5(opt->T, &a[1]);
|
mem_reorder_primary5(opt->T, &a[1]);
|
||||||
}
|
}
|
||||||
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
|
if (opt->flag & MEM_F_NOPAIRING)
|
||||||
|
goto no_pairing;
|
||||||
// pairing single-end hits
|
// pairing single-end hits
|
||||||
if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
|
if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0)
|
||||||
|
{
|
||||||
int is_multi[2], q_pe, score_un, q_se[2];
|
int is_multi[2], q_pe, score_un, q_se[2];
|
||||||
char **XA[2];
|
char **XA[2];
|
||||||
// check if an end has multiple hits even after mate-SW
|
// check if an end has multiple hits even after mate-SW
|
||||||
for (i = 0; i < 2; ++i) {
|
for (i = 0; i < 2; ++i)
|
||||||
|
{
|
||||||
for (j = 1; j < n_pri[i]; ++j)
|
for (j = 1; j < n_pri[i]; ++j)
|
||||||
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
|
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T)
|
||||||
is_multi[i] = j < n_pri[i]? 1 : 0;
|
break;
|
||||||
|
is_multi[i] = j < n_pri[i] ? 1 : 0;
|
||||||
}
|
}
|
||||||
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
|
if (is_multi[0] || is_multi[1])
|
||||||
|
goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
|
||||||
// compute mapQ for the best SE hit
|
// compute mapQ for the best SE hit
|
||||||
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
|
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
|
||||||
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
|
// q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
|
||||||
subo = subo > score_un? subo : score_un;
|
subo = subo > score_un ? subo : score_un;
|
||||||
q_pe = raw_mapq(o - subo, opt->a);
|
q_pe = raw_mapq(o - subo, opt->a);
|
||||||
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
|
if (n_sub > 0)
|
||||||
if (q_pe < 0) q_pe = 0;
|
q_pe -= (int)(4.343 * log(n_sub + 1) + .499);
|
||||||
if (q_pe > 60) q_pe = 60;
|
if (q_pe < 0)
|
||||||
|
q_pe = 0;
|
||||||
|
if (q_pe > 60)
|
||||||
|
q_pe = 60;
|
||||||
q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
|
q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
|
||||||
// the following assumes no split hits
|
// the following assumes no split hits
|
||||||
if (o > score_un) { // paired alignment is preferred
|
if (o > score_un)
|
||||||
|
{ // paired alignment is preferred
|
||||||
mem_alnreg_t *c[2];
|
mem_alnreg_t *c[2];
|
||||||
c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
|
c[0] = &a[0].a[z[0]];
|
||||||
for (i = 0; i < 2; ++i) {
|
c[1] = &a[1].a[z[1]];
|
||||||
|
for (i = 0; i < 2; ++i)
|
||||||
|
{
|
||||||
if (c[i]->secondary >= 0)
|
if (c[i]->secondary >= 0)
|
||||||
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
|
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
|
||||||
q_se[i] = mem_approx_mapq_se(opt, c[i]);
|
q_se[i] = mem_approx_mapq_se(opt, c[i]);
|
||||||
}
|
}
|
||||||
q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;
|
q_se[0] = q_se[0] > q_pe ? q_se[0] : q_pe < q_se[0] + 40 ? q_pe
|
||||||
q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
|
: q_se[0] + 40;
|
||||||
|
q_se[1] = q_se[1] > q_pe ? q_se[1] : q_pe < q_se[1] + 40 ? q_pe
|
||||||
|
: q_se[1] + 40;
|
||||||
extra_flag |= 2;
|
extra_flag |= 2;
|
||||||
// cap at the tandem repeat score
|
// cap at the tandem repeat score
|
||||||
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
|
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a) ? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
|
||||||
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
|
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a) ? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
|
||||||
} else { // the unpaired alignment is preferred
|
}
|
||||||
|
else
|
||||||
|
{ // the unpaired alignment is preferred
|
||||||
z[0] = z[1] = 0;
|
z[0] = z[1] = 0;
|
||||||
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
|
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
|
||||||
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
|
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
|
||||||
}
|
}
|
||||||
for (i = 0; i < 2; ++i) {
|
for (i = 0; i < 2; ++i)
|
||||||
|
{
|
||||||
int k = a[i].a[z[i]].secondary_all;
|
int k = a[i].a[z[i]].secondary_all;
|
||||||
if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
|
if (k >= 0 && k < n_pri[i])
|
||||||
|
{ // switch secondary and primary if both of them are non-ALT
|
||||||
assert(a[i].a[k].secondary_all < 0);
|
assert(a[i].a[k].secondary_all < 0);
|
||||||
for (j = 0; j < a[i].n; ++j)
|
for (j = 0; j < a[i].n; ++j)
|
||||||
if (a[i].a[j].secondary_all == k || j == k)
|
if (a[i].a[j].secondary_all == k || j == k)
|
||||||
|
|
@ -357,63 +447,86 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
||||||
a[i].a[z[i]].secondary_all = -1;
|
a[i].a[z[i]].secondary_all = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!(opt->flag & MEM_F_ALL)) {
|
if (!(opt->flag & MEM_F_ALL))
|
||||||
|
{
|
||||||
for (i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
|
XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
|
||||||
} else XA[0] = XA[1] = 0;
|
}
|
||||||
|
else
|
||||||
|
XA[0] = XA[1] = 0;
|
||||||
// write SAM
|
// write SAM
|
||||||
for (i = 0; i < 2; ++i) {
|
for (i = 0; i < 2; ++i)
|
||||||
|
{
|
||||||
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
|
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
|
||||||
h[i].mapq = q_se[i];
|
h[i].mapq = q_se[i];
|
||||||
h[i].flag |= 0x40<<i | extra_flag;
|
h[i].flag |= 0x40 << i | extra_flag;
|
||||||
h[i].XA = XA[i]? XA[i][z[i]] : 0;
|
h[i].XA = XA[i] ? XA[i][z[i]] : 0;
|
||||||
aa[i][n_aa[i]++] = h[i];
|
aa[i][n_aa[i]++] = h[i];
|
||||||
if (n_pri[i] < a[i].n) { // the read has ALT hits
|
if (n_pri[i] < a[i].n)
|
||||||
|
{ // the read has ALT hits
|
||||||
mem_alnreg_t *p = &a[i].a[n_pri[i]];
|
mem_alnreg_t *p = &a[i].a[n_pri[i]];
|
||||||
if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) continue;
|
if (p->score < opt->T || p->secondary >= 0 || !p->is_alt)
|
||||||
|
continue;
|
||||||
g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
|
g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
|
||||||
g[i].flag |= 0x800 | 0x40<<i | extra_flag;
|
g[i].flag |= 0x800 | 0x40 << i | extra_flag;
|
||||||
g[i].XA = XA[i]? XA[i][n_pri[i]] : 0;
|
g[i].XA = XA[i] ? XA[i][n_pri[i]] : 0;
|
||||||
aa[i][n_aa[i]++] = g[i];
|
aa[i][n_aa[i]++] = g[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (i = 0; i < n_aa[0]; ++i)
|
for (i = 0; i < n_aa[0]; ++i)
|
||||||
mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
|
mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
|
||||||
s[0].sam = strdup(str.s); str.l = 0;
|
s[0].sam = strdup(str.s);
|
||||||
|
str.l = 0;
|
||||||
for (i = 0; i < n_aa[1]; ++i)
|
for (i = 0; i < n_aa[1]; ++i)
|
||||||
mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
|
mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
|
||||||
s[1].sam = str.s;
|
s[1].sam = str.s;
|
||||||
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
if (strcmp(s[0].name, s[1].name) != 0)
|
||||||
|
err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
||||||
// free
|
// free
|
||||||
for (i = 0; i < 2; ++i) {
|
for (i = 0; i < 2; ++i)
|
||||||
free(h[i].cigar); free(g[i].cigar);
|
{
|
||||||
if (XA[i] == 0) continue;
|
free(h[i].cigar);
|
||||||
for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
|
free(g[i].cigar);
|
||||||
|
if (XA[i] == 0)
|
||||||
|
continue;
|
||||||
|
for (j = 0; j < a[i].n; ++j)
|
||||||
|
free(XA[i][j]);
|
||||||
free(XA[i]);
|
free(XA[i]);
|
||||||
}
|
}
|
||||||
} else goto no_pairing;
|
}
|
||||||
|
else
|
||||||
|
goto no_pairing;
|
||||||
return n;
|
return n;
|
||||||
|
|
||||||
no_pairing:
|
no_pairing:
|
||||||
for (i = 0; i < 2; ++i) {
|
for (i = 0; i < 2; ++i)
|
||||||
|
{
|
||||||
int which = -1;
|
int which = -1;
|
||||||
if (a[i].n) {
|
if (a[i].n)
|
||||||
if (a[i].a[0].score >= opt->T) which = 0;
|
{
|
||||||
|
if (a[i].a[0].score >= opt->T)
|
||||||
|
which = 0;
|
||||||
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
|
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
|
||||||
which = n_pri[i];
|
which = n_pri[i];
|
||||||
}
|
}
|
||||||
if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
|
if (which >= 0)
|
||||||
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
|
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
|
||||||
|
else
|
||||||
|
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
|
||||||
}
|
}
|
||||||
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
|
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0)
|
||||||
|
{ // if the top hits from the two ends constitute a proper pair, flag it.
|
||||||
int64_t dist;
|
int64_t dist;
|
||||||
int d;
|
int d;
|
||||||
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
|
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
|
||||||
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
|
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high)
|
||||||
|
extra_flag |= 2;
|
||||||
}
|
}
|
||||||
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
|
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1]);
|
||||||
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
|
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0]);
|
||||||
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
if (strcmp(s[0].name, s[1].name) != 0)
|
||||||
free(h[0].cigar); free(h[1].cigar);
|
err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
||||||
|
free(h[0].cigar);
|
||||||
|
free(h[1].cigar);
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
357
bwt.c
357
bwt.c
|
|
@ -36,16 +36,24 @@
|
||||||
#include "kvec.h"
|
#include "kvec.h"
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
#ifdef USE_MALLOC_WRAPPERS
|
||||||
# include "malloc_wrap.h"
|
#include "malloc_wrap.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
extern int64_t get_mseconds();
|
||||||
|
extern int64_t time_bwt_sa,
|
||||||
|
time_bwt_occ4,
|
||||||
|
time_bwt_smem1a;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void bwt_gen_cnt_table(bwt_t *bwt)
|
void bwt_gen_cnt_table(bwt_t *bwt)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
for (i = 0; i != 256; ++i) {
|
for (i = 0; i != 256; ++i)
|
||||||
|
{
|
||||||
uint32_t x = 0;
|
uint32_t x = 0;
|
||||||
for (j = 0; j != 4; ++j)
|
for (j = 0; j != 4; ++j)
|
||||||
x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
|
x |= (((i & 3) == j) + ((i >> 2 & 3) == j) + ((i >> 4 & 3) == j) + (i >> 6 == j)) << (j << 3);
|
||||||
bwt->cnt_table[i] = x;
|
bwt->cnt_table[i] = x;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -55,7 +63,7 @@ static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inver
|
||||||
bwtint_t x = k - (k > bwt->primary);
|
bwtint_t x = k - (k > bwt->primary);
|
||||||
x = bwt_B0(bwt, x);
|
x = bwt_B0(bwt, x);
|
||||||
x = bwt->L2[x] + bwt_occ(bwt, k, x);
|
x = bwt->L2[x] + bwt_occ(bwt, k, x);
|
||||||
return k == bwt->primary? 0 : x;
|
return k == bwt->primary ? 0 : x;
|
||||||
}
|
}
|
||||||
|
|
||||||
// bwt->bwt and bwt->occ must be precalculated
|
// bwt->bwt and bwt->occ must be precalculated
|
||||||
|
|
@ -68,37 +76,50 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||||
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
||||||
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
|
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
|
||||||
|
|
||||||
if (bwt->sa) free(bwt->sa);
|
if (bwt->sa)
|
||||||
|
free(bwt->sa);
|
||||||
bwt->sa_intv = intv;
|
bwt->sa_intv = intv;
|
||||||
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
||||||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
|
bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||||||
// calculate SA value
|
// calculate SA value
|
||||||
isa = 0; sa = bwt->seq_len;
|
isa = 0;
|
||||||
for (i = 0; i < bwt->seq_len; ++i) {
|
sa = bwt->seq_len;
|
||||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
for (i = 0; i < bwt->seq_len; ++i)
|
||||||
|
{
|
||||||
|
if (isa % intv == 0)
|
||||||
|
bwt->sa[isa / intv] = sa;
|
||||||
--sa;
|
--sa;
|
||||||
isa = bwt_invPsi(bwt, isa);
|
isa = bwt_invPsi(bwt, isa);
|
||||||
}
|
}
|
||||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
if (isa % intv == 0)
|
||||||
|
bwt->sa[isa / intv] = sa;
|
||||||
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
||||||
}
|
}
|
||||||
|
|
||||||
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
||||||
{
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
bwtint_t sa = 0, mask = bwt->sa_intv - 1;
|
bwtint_t sa = 0, mask = bwt->sa_intv - 1;
|
||||||
while (k & mask) {
|
while (k & mask)
|
||||||
|
{
|
||||||
++sa;
|
++sa;
|
||||||
k = bwt_invPsi(bwt, k);
|
k = bwt_invPsi(bwt, k);
|
||||||
}
|
}
|
||||||
/* without setting bwt->sa[0] = -1, the following line should be
|
/* without setting bwt->sa[0] = -1, the following line should be
|
||||||
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
|
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
|
||||||
return sa + bwt->sa[k/bwt->sa_intv];
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_bwt_sa, tmp_diff);
|
||||||
|
#endif
|
||||||
|
return sa + bwt->sa[k / bwt->sa_intv];
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int __occ_aux(uint64_t y, int c)
|
static inline int __occ_aux(uint64_t y, int c)
|
||||||
{
|
{
|
||||||
// reduce nucleotide counting to bits counting
|
// reduce nucleotide counting to bits counting
|
||||||
y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
|
y = ((c & 2) ? y : ~y) >> 1 & ((c & 1) ? y : ~y) & 0x5555555555555555ull;
|
||||||
// count the number of 1s in y
|
// count the number of 1s in y
|
||||||
y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
|
y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
|
||||||
return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
|
return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
|
||||||
|
|
@ -109,21 +130,25 @@ bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
|
||||||
bwtint_t n;
|
bwtint_t n;
|
||||||
uint32_t *p, *end;
|
uint32_t *p, *end;
|
||||||
|
|
||||||
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
|
if (k == bwt->seq_len)
|
||||||
if (k == (bwtint_t)(-1)) return 0;
|
return bwt->L2[c + 1] - bwt->L2[c];
|
||||||
|
if (k == (bwtint_t)(-1))
|
||||||
|
return 0;
|
||||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||||
|
|
||||||
// retrieve Occ at k/OCC_INTERVAL
|
// retrieve Occ at k/OCC_INTERVAL
|
||||||
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
|
n = ((bwtint_t *)(p = bwt_occ_intv(bwt, k)))[c];
|
||||||
p += sizeof(bwtint_t); // jump to the start of the first BWT cell
|
p += sizeof(bwtint_t); // jump to the start of the first BWT cell
|
||||||
|
|
||||||
// calculate Occ up to the last k/32
|
// calculate Occ up to the last k/32
|
||||||
end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
|
end = p + (((k >> 5) - ((k & ~OCC_INTV_MASK) >> 5)) << 1);
|
||||||
for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
for (; p < end; p += 2)
|
||||||
|
n += __occ_aux((uint64_t)p[0] << 32 | p[1], c);
|
||||||
|
|
||||||
// calculate Occ
|
// calculate Occ
|
||||||
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
|
n += __occ_aux(((uint64_t)p[0] << 32 | p[1]) & ~((1ull << ((~k & 31) << 1)) - 1), c);
|
||||||
if (c == 0) n -= ~k&31; // corrected for the masked bits
|
if (c == 0)
|
||||||
|
n -= ~k & 31; // corrected for the masked bits
|
||||||
|
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
@ -132,69 +157,86 @@ bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
|
||||||
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
|
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
|
||||||
{
|
{
|
||||||
bwtint_t _k, _l;
|
bwtint_t _k, _l;
|
||||||
_k = (k >= bwt->primary)? k-1 : k;
|
_k = (k >= bwt->primary) ? k - 1 : k;
|
||||||
_l = (l >= bwt->primary)? l-1 : l;
|
_l = (l >= bwt->primary) ? l - 1 : l;
|
||||||
if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
|
if (_l / OCC_INTERVAL != _k / OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
|
||||||
|
{
|
||||||
*ok = bwt_occ(bwt, k, c);
|
*ok = bwt_occ(bwt, k, c);
|
||||||
*ol = bwt_occ(bwt, l, c);
|
*ol = bwt_occ(bwt, l, c);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
bwtint_t m, n, i, j;
|
bwtint_t m, n, i, j;
|
||||||
uint32_t *p;
|
uint32_t *p;
|
||||||
if (k >= bwt->primary) --k;
|
if (k >= bwt->primary)
|
||||||
if (l >= bwt->primary) --l;
|
--k;
|
||||||
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
|
if (l >= bwt->primary)
|
||||||
|
--l;
|
||||||
|
n = ((bwtint_t *)(p = bwt_occ_intv(bwt, k)))[c];
|
||||||
p += sizeof(bwtint_t);
|
p += sizeof(bwtint_t);
|
||||||
// calculate *ok
|
// calculate *ok
|
||||||
j = k >> 5 << 5;
|
j = k >> 5 << 5;
|
||||||
for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
|
for (i = k / OCC_INTERVAL * OCC_INTERVAL; i < j; i += 32, p += 2)
|
||||||
n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
n += __occ_aux((uint64_t)p[0] << 32 | p[1], c);
|
||||||
m = n;
|
m = n;
|
||||||
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
|
n += __occ_aux(((uint64_t)p[0] << 32 | p[1]) & ~((1ull << ((~k & 31) << 1)) - 1), c);
|
||||||
if (c == 0) n -= ~k&31; // corrected for the masked bits
|
if (c == 0)
|
||||||
|
n -= ~k & 31; // corrected for the masked bits
|
||||||
*ok = n;
|
*ok = n;
|
||||||
// calculate *ol
|
// calculate *ol
|
||||||
j = l >> 5 << 5;
|
j = l >> 5 << 5;
|
||||||
for (; i < j; i += 32, p += 2)
|
for (; i < j; i += 32, p += 2)
|
||||||
m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
m += __occ_aux((uint64_t)p[0] << 32 | p[1], c);
|
||||||
m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
|
m += __occ_aux(((uint64_t)p[0] << 32 | p[1]) & ~((1ull << ((~l & 31) << 1)) - 1), c);
|
||||||
if (c == 0) m -= ~l&31; // corrected for the masked bits
|
if (c == 0)
|
||||||
|
m -= ~l & 31; // corrected for the masked bits
|
||||||
*ol = m;
|
*ol = m;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define __occ_aux4(bwt, b) \
|
#define __occ_aux4(bwt, b) \
|
||||||
((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
|
((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24])
|
||||||
+ (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
|
|
||||||
|
|
||||||
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
|
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
|
||||||
{
|
{
|
||||||
bwtint_t x;
|
bwtint_t x;
|
||||||
uint32_t *p, tmp, *end;
|
uint32_t *p, tmp, *end;
|
||||||
if (k == (bwtint_t)(-1)) {
|
if (k == (bwtint_t)(-1))
|
||||||
|
{
|
||||||
memset(cnt, 0, 4 * sizeof(bwtint_t));
|
memset(cnt, 0, 4 * sizeof(bwtint_t));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||||
p = bwt_occ_intv(bwt, k);
|
p = bwt_occ_intv(bwt, k);
|
||||||
memcpy(cnt, p, 4 * sizeof(bwtint_t));
|
memcpy(cnt, p, 4 * sizeof(bwtint_t));
|
||||||
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
||||||
end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
|
end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop
|
||||||
for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
|
for (x = 0; p < end; ++p)
|
||||||
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
|
x += __occ_aux4(bwt, *p);
|
||||||
x += __occ_aux4(bwt, tmp) - (~k&15);
|
tmp = *p & ~((1U << ((~k & 15) << 1)) - 1);
|
||||||
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
|
x += __occ_aux4(bwt, tmp) - (~k & 15);
|
||||||
|
cnt[0] += x & 0xff;
|
||||||
|
cnt[1] += x >> 8 & 0xff;
|
||||||
|
cnt[2] += x >> 16 & 0xff;
|
||||||
|
cnt[3] += x >> 24;
|
||||||
}
|
}
|
||||||
|
|
||||||
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
||||||
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
|
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
|
||||||
{
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
// int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
bwtint_t _k, _l;
|
bwtint_t _k, _l;
|
||||||
_k = k - (k >= bwt->primary);
|
_k = k - (k >= bwt->primary);
|
||||||
_l = l - (l >= bwt->primary);
|
_l = l - (l >= bwt->primary);
|
||||||
if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
|
if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1))
|
||||||
|
{
|
||||||
bwt_occ4(bwt, k, cntk);
|
bwt_occ4(bwt, k, cntk);
|
||||||
bwt_occ4(bwt, l, cntl);
|
bwt_occ4(bwt, l, cntl);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
bwtint_t x, y;
|
bwtint_t x, y;
|
||||||
uint32_t *p, tmp, *endk, *endl;
|
uint32_t *p, tmp, *endk, *endl;
|
||||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||||
|
|
@ -203,38 +245,57 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin
|
||||||
memcpy(cntk, p, 4 * sizeof(bwtint_t));
|
memcpy(cntk, p, 4 * sizeof(bwtint_t));
|
||||||
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
||||||
// prepare cntk[]
|
// prepare cntk[]
|
||||||
endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
|
endk = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4));
|
||||||
endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
|
endl = p + ((l >> 4) - ((l & ~OCC_INTV_MASK) >> 4));
|
||||||
for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
|
for (x = 0; p < endk; ++p)
|
||||||
|
x += __occ_aux4(bwt, *p);
|
||||||
y = x;
|
y = x;
|
||||||
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
|
tmp = *p & ~((1U << ((~k & 15) << 1)) - 1);
|
||||||
x += __occ_aux4(bwt, tmp) - (~k&15);
|
x += __occ_aux4(bwt, tmp) - (~k & 15);
|
||||||
// calculate cntl[] and finalize cntk[]
|
// calculate cntl[] and finalize cntk[]
|
||||||
for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
|
for (; p < endl; ++p)
|
||||||
tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
|
y += __occ_aux4(bwt, *p);
|
||||||
y += __occ_aux4(bwt, tmp) - (~l&15);
|
tmp = *p & ~((1U << ((~l & 15) << 1)) - 1);
|
||||||
|
y += __occ_aux4(bwt, tmp) - (~l & 15);
|
||||||
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
|
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
|
||||||
cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
|
cntk[0] += x & 0xff;
|
||||||
cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
|
cntk[1] += x >> 8 & 0xff;
|
||||||
|
cntk[2] += x >> 16 & 0xff;
|
||||||
|
cntk[3] += x >> 24;
|
||||||
|
cntl[0] += y & 0xff;
|
||||||
|
cntl[1] += y >> 8 & 0xff;
|
||||||
|
cntl[2] += y >> 16 & 0xff;
|
||||||
|
cntl[3] += y >> 24;
|
||||||
}
|
}
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
// int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
// __sync_fetch_and_add(&time_bwt_occ4, tmp_diff);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
|
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
|
||||||
{
|
{
|
||||||
bwtint_t k, l, ok, ol;
|
bwtint_t k, l, ok, ol;
|
||||||
int i;
|
int i;
|
||||||
k = 0; l = bwt->seq_len;
|
k = 0;
|
||||||
for (i = len - 1; i >= 0; --i) {
|
l = bwt->seq_len;
|
||||||
|
for (i = len - 1; i >= 0; --i)
|
||||||
|
{
|
||||||
ubyte_t c = str[i];
|
ubyte_t c = str[i];
|
||||||
if (c > 3) return 0; // no match
|
if (c > 3)
|
||||||
|
return 0; // no match
|
||||||
bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
|
bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
|
||||||
k = bwt->L2[c] + ok + 1;
|
k = bwt->L2[c] + ok + 1;
|
||||||
l = bwt->L2[c] + ol;
|
l = bwt->L2[c] + ol;
|
||||||
if (k > l) break; // no match
|
if (k > l)
|
||||||
|
break; // no match
|
||||||
}
|
}
|
||||||
if (k > l) return 0; // no match
|
if (k > l)
|
||||||
if (sa_begin) *sa_begin = k;
|
return 0; // no match
|
||||||
if (sa_end) *sa_end = l;
|
if (sa_begin)
|
||||||
|
*sa_begin = k;
|
||||||
|
if (sa_end)
|
||||||
|
*sa_end = l;
|
||||||
return l - k + 1;
|
return l - k + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -242,16 +303,21 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
bwtint_t k, l, ok, ol;
|
bwtint_t k, l, ok, ol;
|
||||||
k = *k0; l = *l0;
|
k = *k0;
|
||||||
for (i = len - 1; i >= 0; --i) {
|
l = *l0;
|
||||||
|
for (i = len - 1; i >= 0; --i)
|
||||||
|
{
|
||||||
ubyte_t c = str[i];
|
ubyte_t c = str[i];
|
||||||
if (c > 3) return 0; // there is an N here. no match
|
if (c > 3)
|
||||||
|
return 0; // there is an N here. no match
|
||||||
bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
|
bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
|
||||||
k = bwt->L2[c] + ok + 1;
|
k = bwt->L2[c] + ok + 1;
|
||||||
l = bwt->L2[c] + ol;
|
l = bwt->L2[c] + ol;
|
||||||
if (k > l) return 0; // no match
|
if (k > l)
|
||||||
|
return 0; // no match
|
||||||
}
|
}
|
||||||
*k0 = k; *l0 = l;
|
*k0 = k;
|
||||||
|
*l0 = l;
|
||||||
return l - k + 1;
|
return l - k + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -264,7 +330,8 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
|
||||||
bwtint_t tk[4], tl[4];
|
bwtint_t tk[4], tl[4];
|
||||||
int i;
|
int i;
|
||||||
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
|
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
|
||||||
for (i = 0; i != 4; ++i) {
|
for (i = 0; i != 4; ++i)
|
||||||
|
{
|
||||||
ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i];
|
ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i];
|
||||||
ok[i].x[2] = tl[i] - tk[i];
|
ok[i].x[2] = tl[i] - tk[i];
|
||||||
}
|
}
|
||||||
|
|
@ -276,9 +343,11 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
|
||||||
|
|
||||||
static void bwt_reverse_intvs(bwtintv_v *p)
|
static void bwt_reverse_intvs(bwtintv_v *p)
|
||||||
{
|
{
|
||||||
if (p->n > 1) {
|
if (p->n > 1)
|
||||||
|
{
|
||||||
int j;
|
int j;
|
||||||
for (j = 0; j < p->n>>1; ++j) {
|
for (j = 0; j < p->n >> 1; ++j)
|
||||||
|
{
|
||||||
bwtintv_t tmp = p->a[p->n - 1 - j];
|
bwtintv_t tmp = p->a[p->n - 1 - j];
|
||||||
p->a[p->n - 1 - j] = p->a[j];
|
p->a[p->n - 1 - j] = p->a[j];
|
||||||
p->a[j] = tmp;
|
p->a[j] = tmp;
|
||||||
|
|
@ -288,65 +357,102 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
||||||
// NOTE: $max_intv is not currently used in BWA-MEM
|
// NOTE: $max_intv is not currently used in BWA-MEM
|
||||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||||
{
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
int i, j, c, ret;
|
int i, j, c, ret;
|
||||||
bwtintv_t ik, ok[4];
|
bwtintv_t ik, ok[4];
|
||||||
bwtintv_v a[2], *prev, *curr, *swap;
|
bwtintv_v a[2], *prev, *curr, *swap;
|
||||||
|
|
||||||
mem->n = 0;
|
mem->n = 0;
|
||||||
if (q[x] > 3) return x + 1;
|
if (q[x] > 3)
|
||||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
return x + 1;
|
||||||
kv_init(a[0]); kv_init(a[1]);
|
if (min_intv < 1)
|
||||||
prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
|
min_intv = 1; // the interval size should be at least 1
|
||||||
curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
|
kv_init(a[0]);
|
||||||
|
kv_init(a[1]);
|
||||||
|
prev = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided
|
||||||
|
curr = tmpvec && tmpvec[1] ? tmpvec[1] : &a[1];
|
||||||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
|
|
||||||
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
|
for (i = x + 1, curr->n = 0; i < len; ++i)
|
||||||
if (ik.x[2] < max_intv) { // an interval small enough
|
{ // forward search
|
||||||
|
if (ik.x[2] < max_intv)
|
||||||
|
{ // an interval small enough
|
||||||
kv_push(bwtintv_t, *curr, ik);
|
kv_push(bwtintv_t, *curr, ik);
|
||||||
break;
|
break;
|
||||||
} else if (q[i] < 4) { // an A/C/G/T base
|
}
|
||||||
|
else if (q[i] < 4)
|
||||||
|
{ // an A/C/G/T base
|
||||||
c = 3 - q[i]; // complement of q[i]
|
c = 3 - q[i]; // complement of q[i]
|
||||||
bwt_extend(bwt, &ik, ok, 0);
|
bwt_extend(bwt, &ik, ok, 0);
|
||||||
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
if (ok[c].x[2] != ik.x[2])
|
||||||
|
{ // change of the interval size
|
||||||
kv_push(bwtintv_t, *curr, ik);
|
kv_push(bwtintv_t, *curr, ik);
|
||||||
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
if (ok[c].x[2] < min_intv)
|
||||||
|
break; // the interval size is too small to be extended further
|
||||||
}
|
}
|
||||||
ik = ok[c]; ik.info = i + 1;
|
ik = ok[c];
|
||||||
} else { // an ambiguous base
|
ik.info = i + 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{ // an ambiguous base
|
||||||
kv_push(bwtintv_t, *curr, ik);
|
kv_push(bwtintv_t, *curr, ik);
|
||||||
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
|
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
if (i == len)
|
||||||
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
||||||
ret = curr->a[0].info; // this will be the returned value
|
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||||
swap = curr; curr = prev; prev = swap;
|
ret = curr->a[0].info; // this will be the returned value
|
||||||
|
swap = curr;
|
||||||
|
curr = prev;
|
||||||
|
prev = swap;
|
||||||
|
|
||||||
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
|
for (i = x - 1; i >= -1; --i)
|
||||||
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
{ // backward search for MEMs
|
||||||
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
c = i < 0 ? -1 : q[i] < 4 ? q[i]
|
||||||
|
: -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
||||||
|
for (j = 0, curr->n = 0; j < prev->n; ++j)
|
||||||
|
{
|
||||||
bwtintv_t *p = &prev->a[j];
|
bwtintv_t *p = &prev->a[j];
|
||||||
if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
|
if (c >= 0 && ik.x[2] >= max_intv)
|
||||||
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
bwt_extend(bwt, p, ok, 1);
|
||||||
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv)
|
||||||
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
{ // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||||
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
if (curr->n == 0)
|
||||||
|
{ // test curr->n>0 to make sure there are no longer matches
|
||||||
|
if (mem->n == 0 || i + 1 < mem->a[mem->n - 1].info >> 32)
|
||||||
|
{ // skip contained matches
|
||||||
|
ik = *p;
|
||||||
|
ik.info |= (uint64_t)(i + 1) << 32;
|
||||||
kv_push(bwtintv_t, *mem, ik);
|
kv_push(bwtintv_t, *mem, ik);
|
||||||
}
|
}
|
||||||
} // otherwise the match is contained in another longer match
|
} // otherwise the match is contained in another longer match
|
||||||
} else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
|
}
|
||||||
|
else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n - 1].x[2])
|
||||||
|
{
|
||||||
ok[c].info = p->info;
|
ok[c].info = p->info;
|
||||||
kv_push(bwtintv_t, *curr, ok[c]);
|
kv_push(bwtintv_t, *curr, ok[c]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (curr->n == 0) break;
|
if (curr->n == 0)
|
||||||
swap = curr; curr = prev; prev = swap;
|
break;
|
||||||
|
swap = curr;
|
||||||
|
curr = prev;
|
||||||
|
prev = swap;
|
||||||
}
|
}
|
||||||
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||||
|
|
||||||
if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
|
if (tmpvec == 0 || tmpvec[0] == 0)
|
||||||
if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
|
free(a[0].a);
|
||||||
|
if (tmpvec == 0 || tmpvec[1] == 0)
|
||||||
|
free(a[1].a);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_bwt_smem1a, tmp_diff);
|
||||||
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -361,19 +467,25 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m
|
||||||
bwtintv_t ik, ok[4];
|
bwtintv_t ik, ok[4];
|
||||||
|
|
||||||
memset(mem, 0, sizeof(bwtintv_t));
|
memset(mem, 0, sizeof(bwtintv_t));
|
||||||
if (q[x] > 3) return x + 1;
|
if (q[x] > 3)
|
||||||
|
return x + 1;
|
||||||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||||
for (i = x + 1; i < len; ++i) { // forward search
|
for (i = x + 1; i < len; ++i)
|
||||||
if (q[i] < 4) { // an A/C/G/T base
|
{ // forward search
|
||||||
|
if (q[i] < 4)
|
||||||
|
{ // an A/C/G/T base
|
||||||
c = 3 - q[i]; // complement of q[i]
|
c = 3 - q[i]; // complement of q[i]
|
||||||
bwt_extend(bwt, &ik, ok, 0);
|
bwt_extend(bwt, &ik, ok, 0);
|
||||||
if (ok[c].x[2] < max_intv && i - x >= min_len) {
|
if (ok[c].x[2] < max_intv && i - x >= min_len)
|
||||||
|
{
|
||||||
*mem = ok[c];
|
*mem = ok[c];
|
||||||
mem->info = (uint64_t)x<<32 | (i + 1);
|
mem->info = (uint64_t)x << 32 | (i + 1);
|
||||||
return i + 1;
|
return i + 1;
|
||||||
}
|
}
|
||||||
ik = ok[c];
|
ik = ok[c];
|
||||||
} else return i + 1;
|
}
|
||||||
|
else
|
||||||
|
return i + 1;
|
||||||
}
|
}
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
@ -387,7 +499,7 @@ void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
fp = xopen(fn, "wb");
|
fp = xopen(fn, "wb");
|
||||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||||
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
|
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||||
err_fflush(fp);
|
err_fflush(fp);
|
||||||
err_fclose(fp);
|
err_fclose(fp);
|
||||||
|
|
@ -398,7 +510,7 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
fp = xopen(fn, "wb");
|
fp = xopen(fn, "wb");
|
||||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||||
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
||||||
|
|
@ -407,13 +519,16 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
||||||
}
|
}
|
||||||
|
|
||||||
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
|
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
|
||||||
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
|
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
|
||||||
const int bufsize = 0x1000000; // 16M block
|
const int bufsize = 0x1000000; // 16M block
|
||||||
bwtint_t offset = 0;
|
bwtint_t offset = 0;
|
||||||
while (size) {
|
while (size)
|
||||||
int x = bufsize < size? bufsize : size;
|
{
|
||||||
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break;
|
int x = bufsize < size ? bufsize : size;
|
||||||
size -= x; offset += x;
|
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0)
|
||||||
|
break;
|
||||||
|
size -= x;
|
||||||
|
offset += x;
|
||||||
}
|
}
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
@ -433,7 +548,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
|
||||||
xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
|
xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
|
||||||
|
|
||||||
bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
|
bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
|
||||||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
|
bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||||||
bwt->sa[0] = -1;
|
bwt->sa[0] = -1;
|
||||||
|
|
||||||
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
|
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
|
||||||
|
|
@ -445,15 +560,15 @@ bwt_t *bwt_restore_bwt(const char *fn)
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
|
||||||
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
bwt = (bwt_t *)calloc(1, sizeof(bwt_t));
|
||||||
fp = xopen(fn, "rb");
|
fp = xopen(fn, "rb");
|
||||||
err_fseek(fp, 0, SEEK_END);
|
err_fseek(fp, 0, SEEK_END);
|
||||||
bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
|
bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
|
||||||
bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
|
bwt->bwt = (uint32_t *)calloc(bwt->bwt_size, 4);
|
||||||
err_fseek(fp, 0, SEEK_SET);
|
err_fseek(fp, 0, SEEK_SET);
|
||||||
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||||
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
err_fread_noeof(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||||
fread_fix(fp, bwt->bwt_size<<2, bwt->bwt);
|
fread_fix(fp, bwt->bwt_size << 2, bwt->bwt);
|
||||||
bwt->seq_len = bwt->L2[4];
|
bwt->seq_len = bwt->L2[4];
|
||||||
err_fclose(fp);
|
err_fclose(fp);
|
||||||
bwt_gen_cnt_table(bwt);
|
bwt_gen_cnt_table(bwt);
|
||||||
|
|
@ -463,7 +578,9 @@ bwt_t *bwt_restore_bwt(const char *fn)
|
||||||
|
|
||||||
void bwt_destroy(bwt_t *bwt)
|
void bwt_destroy(bwt_t *bwt)
|
||||||
{
|
{
|
||||||
if (bwt == 0) return;
|
if (bwt == 0)
|
||||||
free(bwt->sa); free(bwt->bwt);
|
return;
|
||||||
|
free(bwt->sa);
|
||||||
|
free(bwt->bwt);
|
||||||
free(bwt);
|
free(bwt);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
218
bwtindex.c
218
bwtindex.c
|
|
@ -1,8 +1,8 @@
|
||||||
/* The MIT License
|
/* The MIT License
|
||||||
|
|
||||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||||
2009-2018 Broad Institute, Inc.
|
2009-2018 Broad Institute, Inc.
|
||||||
2008-2009 Genome Research Ltd. (GRL)
|
2008-2009 Genome Research Ltd. (GRL)
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
Permission is hereby granted, free of charge, to any person obtaining
|
||||||
a copy of this software and associated documentation files (the
|
a copy of this software and associated documentation files (the
|
||||||
|
|
@ -42,10 +42,9 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
#ifdef USE_MALLOC_WRAPPERS
|
||||||
# include "malloc_wrap.h"
|
#include "malloc_wrap.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
int is_bwt(ubyte_t *T, int n);
|
int is_bwt(ubyte_t *T, int n);
|
||||||
|
|
||||||
int64_t bwa_seq_len(const char *fn_pac)
|
int64_t bwa_seq_len(const char *fn_pac)
|
||||||
|
|
@ -69,46 +68,55 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
|
||||||
// initialization
|
// initialization
|
||||||
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
bwt = (bwt_t *)calloc(1, sizeof(bwt_t));
|
||||||
bwt->seq_len = bwa_seq_len(fn_pac);
|
bwt->seq_len = bwa_seq_len(fn_pac);
|
||||||
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
||||||
fp = xopen(fn_pac, "rb");
|
fp = xopen(fn_pac, "rb");
|
||||||
|
|
||||||
// prepare sequence
|
// prepare sequence
|
||||||
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
pac_size = (bwt->seq_len >> 2) + ((bwt->seq_len & 3) == 0 ? 0 : 1);
|
||||||
buf2 = (ubyte_t*)calloc(pac_size, 1);
|
buf2 = (ubyte_t *)calloc(pac_size, 1);
|
||||||
err_fread_noeof(buf2, 1, pac_size, fp);
|
err_fread_noeof(buf2, 1, pac_size, fp);
|
||||||
err_fclose(fp);
|
err_fclose(fp);
|
||||||
memset(bwt->L2, 0, 5 * 4);
|
memset(bwt->L2, 0, 5 * 4);
|
||||||
buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
|
buf = (ubyte_t *)calloc(bwt->seq_len + 1, 1);
|
||||||
for (i = 0; i < bwt->seq_len; ++i) {
|
for (i = 0; i < bwt->seq_len; ++i)
|
||||||
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
{
|
||||||
++bwt->L2[1+buf[i]];
|
buf[i] = buf2[i >> 2] >> ((3 - (i & 3)) << 1) & 3;
|
||||||
|
++bwt->L2[1 + buf[i]];
|
||||||
}
|
}
|
||||||
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
for (i = 2; i <= 4; ++i)
|
||||||
|
bwt->L2[i] += bwt->L2[i - 1];
|
||||||
free(buf2);
|
free(buf2);
|
||||||
|
|
||||||
// Burrows-Wheeler Transform
|
// Burrows-Wheeler Transform
|
||||||
if (use_is) {
|
if (use_is)
|
||||||
|
{
|
||||||
bwt->primary = is_bwt(buf, bwt->seq_len);
|
bwt->primary = is_bwt(buf, bwt->seq_len);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
rope_t *r;
|
rope_t *r;
|
||||||
int64_t x;
|
int64_t x;
|
||||||
rpitr_t itr;
|
rpitr_t itr;
|
||||||
const uint8_t *blk;
|
const uint8_t *blk;
|
||||||
|
|
||||||
r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
|
r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
|
||||||
for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
|
for (i = bwt->seq_len - 1, x = 0; i >= 0; --i)
|
||||||
|
{
|
||||||
int c = buf[i] + 1;
|
int c = buf[i] + 1;
|
||||||
x = rope_insert_run(r, x, c, 1, 0) + 1;
|
x = rope_insert_run(r, x, c, 1, 0) + 1;
|
||||||
while (--c >= 0) x += r->c[c];
|
while (--c >= 0)
|
||||||
|
x += r->c[c];
|
||||||
}
|
}
|
||||||
bwt->primary = x;
|
bwt->primary = x;
|
||||||
rope_itr_first(r, &itr);
|
rope_itr_first(r, &itr);
|
||||||
x = 0;
|
x = 0;
|
||||||
while ((blk = rope_itr_next_block(&itr)) != 0) {
|
while ((blk = rope_itr_next_block(&itr)) != 0)
|
||||||
|
{
|
||||||
const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
|
const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
|
||||||
while (q < end) {
|
while (q < end)
|
||||||
|
{
|
||||||
int c = 0;
|
int c = 0;
|
||||||
int64_t l;
|
int64_t l;
|
||||||
rle_dec1(q, c, l);
|
rle_dec1(q, c, l);
|
||||||
|
|
@ -118,9 +126,9 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
||||||
}
|
}
|
||||||
rope_destroy(r);
|
rope_destroy(r);
|
||||||
}
|
}
|
||||||
bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
|
bwt->bwt = (uint32_t *)calloc(bwt->bwt_size, 4);
|
||||||
for (i = 0; i < bwt->seq_len; ++i)
|
for (i = 0; i < bwt->seq_len; ++i)
|
||||||
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
bwt->bwt[i >> 4] |= buf[i] << ((15 - (i & 15)) << 1);
|
||||||
free(buf);
|
free(buf);
|
||||||
return bwt;
|
return bwt;
|
||||||
}
|
}
|
||||||
|
|
@ -129,23 +137,29 @@ int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt
|
||||||
{
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
int c, use_is = 1;
|
int c, use_is = 1;
|
||||||
while ((c = getopt(argc, argv, "d")) >= 0) {
|
while ((c = getopt(argc, argv, "d")) >= 0)
|
||||||
switch (c) {
|
{
|
||||||
case 'd': use_is = 0; break;
|
switch (c)
|
||||||
default: return 1;
|
{
|
||||||
|
case 'd':
|
||||||
|
use_is = 0;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (optind + 2 > argc) {
|
if (optind + 2 > argc)
|
||||||
|
{
|
||||||
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
bwt = bwt_pac2bwt(argv[optind], use_is);
|
bwt = bwt_pac2bwt(argv[optind], use_is);
|
||||||
bwt_dump_bwt(argv[optind+1], bwt);
|
bwt_dump_bwt(argv[optind + 1], bwt);
|
||||||
bwt_destroy(bwt);
|
bwt_destroy(bwt);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k)&0xf) << 1) & 3)
|
||||||
|
|
||||||
void bwt_bwtupdate_core(bwt_t *bwt)
|
void bwt_bwtupdate_core(bwt_t *bwt)
|
||||||
{
|
{
|
||||||
|
|
@ -153,28 +167,33 @@ void bwt_bwtupdate_core(bwt_t *bwt)
|
||||||
uint32_t *buf;
|
uint32_t *buf;
|
||||||
|
|
||||||
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
||||||
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
|
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
|
||||||
buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
|
buf = (uint32_t *)calloc(bwt->bwt_size, 4); // will be the new bwt
|
||||||
c[0] = c[1] = c[2] = c[3] = 0;
|
c[0] = c[1] = c[2] = c[3] = 0;
|
||||||
for (i = k = 0; i < bwt->seq_len; ++i) {
|
for (i = k = 0; i < bwt->seq_len; ++i)
|
||||||
if (i % OCC_INTERVAL == 0) {
|
{
|
||||||
|
if (i % OCC_INTERVAL == 0)
|
||||||
|
{
|
||||||
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
||||||
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
|
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
|
||||||
}
|
}
|
||||||
if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
|
if (i % 16 == 0)
|
||||||
|
buf[k++] = bwt->bwt[i / 16]; // 16 == sizeof(uint32_t)/2
|
||||||
++c[bwt_B00(bwt, i)];
|
++c[bwt_B00(bwt, i)];
|
||||||
}
|
}
|
||||||
// the last element
|
// the last element
|
||||||
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
||||||
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
|
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
|
||||||
// update bwt
|
// update bwt
|
||||||
free(bwt->bwt); bwt->bwt = buf;
|
free(bwt->bwt);
|
||||||
|
bwt->bwt = buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
|
int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
|
||||||
{
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
if (argc != 2) {
|
if (argc != 2)
|
||||||
|
{
|
||||||
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
@ -189,19 +208,25 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
||||||
{
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
int c, sa_intv = 32;
|
int c, sa_intv = 32;
|
||||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
while ((c = getopt(argc, argv, "i:")) >= 0)
|
||||||
switch (c) {
|
{
|
||||||
case 'i': sa_intv = atoi(optarg); break;
|
switch (c)
|
||||||
default: return 1;
|
{
|
||||||
|
case 'i':
|
||||||
|
sa_intv = atoi(optarg);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (optind + 2 > argc) {
|
if (optind + 2 > argc)
|
||||||
|
{
|
||||||
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
bwt = bwt_restore_bwt(argv[optind]);
|
bwt = bwt_restore_bwt(argv[optind]);
|
||||||
bwt_cal_sa(bwt, sa_intv);
|
bwt_cal_sa(bwt, sa_intv);
|
||||||
bwt_dump_sa(argv[optind+1], bwt);
|
bwt_dump_sa(argv[optind + 1], bwt);
|
||||||
bwt_destroy(bwt);
|
bwt_destroy(bwt);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -210,27 +235,42 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
||||||
{
|
{
|
||||||
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
|
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
|
||||||
char *prefix = 0, *str;
|
char *prefix = 0, *str;
|
||||||
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
|
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0)
|
||||||
switch (c) {
|
{
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
case 'a': // if -a is not set, algo_type will be determined later
|
case 'a': // if -a is not set, algo_type will be determined later
|
||||||
if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2;
|
if (strcmp(optarg, "rb2") == 0)
|
||||||
else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW;
|
algo_type = BWTALGO_RB2;
|
||||||
else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS;
|
else if (strcmp(optarg, "bwtsw") == 0)
|
||||||
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
algo_type = BWTALGO_BWTSW;
|
||||||
|
else if (strcmp(optarg, "is") == 0)
|
||||||
|
algo_type = BWTALGO_IS;
|
||||||
|
else
|
||||||
|
err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
||||||
|
break;
|
||||||
|
case 'p':
|
||||||
|
prefix = strdup(optarg);
|
||||||
|
break;
|
||||||
|
case '6':
|
||||||
|
is_64 = 1;
|
||||||
break;
|
break;
|
||||||
case 'p': prefix = strdup(optarg); break;
|
|
||||||
case '6': is_64 = 1; break;
|
|
||||||
case 'b':
|
case 'b':
|
||||||
block_size = strtol(optarg, &str, 10);
|
block_size = strtol(optarg, &str, 10);
|
||||||
if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
|
if (*str == 'G' || *str == 'g')
|
||||||
else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
|
block_size *= 1024 * 1024 * 1024;
|
||||||
else if (*str == 'K' || *str == 'k') block_size *= 1024;
|
else if (*str == 'M' || *str == 'm')
|
||||||
|
block_size *= 1024 * 1024;
|
||||||
|
else if (*str == 'K' || *str == 'k')
|
||||||
|
block_size *= 1024;
|
||||||
break;
|
break;
|
||||||
default: return 1;
|
default:
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (optind + 1 > argc) {
|
if (optind + 1 > argc)
|
||||||
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
|
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
|
||||||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
|
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
|
||||||
|
|
@ -238,14 +278,16 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
||||||
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
|
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
|
||||||
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
|
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
|
||||||
fprintf(stderr, " `-a div' do not work not for long genomes.\n\n");
|
fprintf(stderr, " `-a div' do not work not for long genomes.\n\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (prefix == 0) {
|
if (prefix == 0)
|
||||||
|
{
|
||||||
prefix = malloc(strlen(argv[optind]) + 4);
|
prefix = malloc(strlen(argv[optind]) + 4);
|
||||||
strcpy(prefix, argv[optind]);
|
strcpy(prefix, argv[optind]);
|
||||||
if (is_64) strcat(prefix, ".64");
|
if (is_64)
|
||||||
|
strcat(prefix, ".64");
|
||||||
}
|
}
|
||||||
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
|
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
|
||||||
free(prefix);
|
free(prefix);
|
||||||
|
|
@ -260,64 +302,84 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
|
||||||
clock_t t;
|
clock_t t;
|
||||||
int64_t l_pac;
|
int64_t l_pac;
|
||||||
|
|
||||||
str = (char*)calloc(strlen(prefix) + 10, 1);
|
str = (char *)calloc(strlen(prefix) + 10, 1);
|
||||||
str2 = (char*)calloc(strlen(prefix) + 10, 1);
|
str2 = (char *)calloc(strlen(prefix) + 10, 1);
|
||||||
str3 = (char*)calloc(strlen(prefix) + 10, 1);
|
str3 = (char *)calloc(strlen(prefix) + 10, 1);
|
||||||
|
|
||||||
{ // nucleotide indexing
|
{ // nucleotide indexing
|
||||||
gzFile fp = xzopen(fa, "r");
|
gzFile fp = xzopen(fa, "r");
|
||||||
t = clock();
|
t = clock();
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
||||||
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
err_gzclose(fp);
|
err_gzclose(fp);
|
||||||
}
|
}
|
||||||
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
|
if (algo_type == 0)
|
||||||
|
algo_type = l_pac > 50000000 ? 2 : 3; // set the algorithm for generating BWT
|
||||||
{
|
{
|
||||||
strcpy(str, prefix); strcat(str, ".pac");
|
strcpy(str, prefix);
|
||||||
strcpy(str2, prefix); strcat(str2, ".bwt");
|
strcat(str, ".pac");
|
||||||
|
strcpy(str2, prefix);
|
||||||
|
strcat(str2, ".bwt");
|
||||||
t = clock();
|
t = clock();
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
|
if (bwa_verbose >= 3)
|
||||||
if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
|
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
|
||||||
else if (algo_type == 1 || algo_type == 3) {
|
if (algo_type == 2)
|
||||||
|
bwt_bwtgen2(str, str2, block_size);
|
||||||
|
else if (algo_type == 1 || algo_type == 3)
|
||||||
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
bwt = bwt_pac2bwt(str, algo_type == 3);
|
bwt = bwt_pac2bwt(str, algo_type == 3);
|
||||||
bwt_dump_bwt(str2, bwt);
|
bwt_dump_bwt(str2, bwt);
|
||||||
bwt_destroy(bwt);
|
bwt_destroy(bwt);
|
||||||
}
|
}
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
strcpy(str, prefix); strcat(str, ".bwt");
|
strcpy(str, prefix);
|
||||||
|
strcat(str, ".bwt");
|
||||||
t = clock();
|
t = clock();
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... ");
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[bwa_index] Update BWT... ");
|
||||||
bwt = bwt_restore_bwt(str);
|
bwt = bwt_restore_bwt(str);
|
||||||
bwt_bwtupdate_core(bwt);
|
bwt_bwtupdate_core(bwt);
|
||||||
bwt_dump_bwt(str, bwt);
|
bwt_dump_bwt(str, bwt);
|
||||||
bwt_destroy(bwt);
|
bwt_destroy(bwt);
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
gzFile fp = xzopen(fa, "r");
|
gzFile fp = xzopen(fa, "r");
|
||||||
t = clock();
|
t = clock();
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
|
||||||
l_pac = bns_fasta2bntseq(fp, prefix, 1);
|
l_pac = bns_fasta2bntseq(fp, prefix, 1);
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
err_gzclose(fp);
|
err_gzclose(fp);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
strcpy(str, prefix); strcat(str, ".bwt");
|
strcpy(str, prefix);
|
||||||
strcpy(str3, prefix); strcat(str3, ".sa");
|
strcat(str, ".bwt");
|
||||||
|
strcpy(str3, prefix);
|
||||||
|
strcat(str3, ".sa");
|
||||||
t = clock();
|
t = clock();
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
|
||||||
bwt = bwt_restore_bwt(str);
|
bwt = bwt_restore_bwt(str);
|
||||||
bwt_cal_sa(bwt, 32);
|
bwt_cal_sa(bwt, 32);
|
||||||
bwt_dump_sa(str3, bwt);
|
bwt_dump_sa(str3, bwt);
|
||||||
bwt_destroy(bwt);
|
bwt_destroy(bwt);
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
}
|
}
|
||||||
free(str3); free(str2); free(str);
|
free(str3);
|
||||||
|
free(str2);
|
||||||
|
free(str);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
623
fastmap.c
623
fastmap.c
|
|
@ -1,8 +1,8 @@
|
||||||
/* The MIT License
|
/* The MIT License
|
||||||
|
|
||||||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||||||
2009-2018 Broad Institute, Inc.
|
2009-2018 Broad Institute, Inc.
|
||||||
2008-2009 Genome Research Ltd. (GRL)
|
2008-2009 Genome Research Ltd. (GRL)
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
Permission is hereby granted, free of charge, to any person obtaining
|
||||||
a copy of this software and associated documentation files (the
|
a copy of this software and associated documentation files (the
|
||||||
|
|
@ -40,13 +40,48 @@
|
||||||
#include "kseq.h"
|
#include "kseq.h"
|
||||||
KSEQ_DECLARE(gzFile)
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
|
FILE *query_f = 0, *target_f = 0, *info_f = 0;
|
||||||
|
|
||||||
|
// 记录运行时间的变量
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
// 用来调试,计算感兴趣部分的运行时间
|
||||||
|
#include "sys/time.h"
|
||||||
|
// 获取当前毫秒数
|
||||||
|
int64_t get_mseconds()
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return (int64_t)1000 * (tv.tv_sec + ((1e-6) * tv.tv_usec));
|
||||||
|
}
|
||||||
|
int64_t time_ksw_extend2 = 0,
|
||||||
|
time_ksw_global2 = 0,
|
||||||
|
time_ksw_align2 = 0,
|
||||||
|
time_bwt_smem1a = 0,
|
||||||
|
time_bwt_occ4 = 0,
|
||||||
|
time_bwt_sa = 0,
|
||||||
|
time_mem_chain = 0,
|
||||||
|
time_worker1 = 0,
|
||||||
|
time_worker2 = 0,
|
||||||
|
time_process_step0 = 0,
|
||||||
|
time_process_step1 = 0,
|
||||||
|
time_process_step2 = 0,
|
||||||
|
time_before_process = 0,
|
||||||
|
time_process = 0,
|
||||||
|
time_after_process = 0,
|
||||||
|
count_process_step2 = 0,
|
||||||
|
count_ksw_extend2 = 0;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
//////////////////////////////////
|
||||||
|
|
||||||
extern unsigned char nst_nt4_table[256];
|
extern unsigned char nst_nt4_table[256];
|
||||||
|
|
||||||
void *kopen(const char *fn, int *_fd);
|
void *kopen(const char *fn, int *_fd);
|
||||||
int kclose(void *a);
|
int kclose(void *a);
|
||||||
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
|
void kt_pipeline(int n_threads, void *(*func)(void *, int, void *), void *shared_data, int n_steps);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
kseq_t *ks, *ks2;
|
kseq_t *ks, *ks2;
|
||||||
mem_opt_t *opt;
|
mem_opt_t *opt;
|
||||||
mem_pestat_t *pes0;
|
mem_pestat_t *pes0;
|
||||||
|
|
@ -55,7 +90,8 @@ typedef struct {
|
||||||
bwaidx_t *idx;
|
bwaidx_t *idx;
|
||||||
} ktp_aux_t;
|
} ktp_aux_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct
|
||||||
|
{
|
||||||
ktp_aux_t *aux;
|
ktp_aux_t *aux;
|
||||||
int n_seqs;
|
int n_seqs;
|
||||||
bseq1_t *seqs;
|
bseq1_t *seqs;
|
||||||
|
|
@ -63,60 +99,106 @@ typedef struct {
|
||||||
|
|
||||||
static void *process(void *shared, int step, void *_data)
|
static void *process(void *shared, int step, void *_data)
|
||||||
{
|
{
|
||||||
ktp_aux_t *aux = (ktp_aux_t*)shared;
|
ktp_aux_t *aux = (ktp_aux_t *)shared;
|
||||||
ktp_data_t *data = (ktp_data_t*)_data;
|
ktp_data_t *data = (ktp_data_t *)_data;
|
||||||
int i;
|
int i;
|
||||||
if (step == 0) {
|
if (step == 0)
|
||||||
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
ktp_data_t *ret;
|
ktp_data_t *ret;
|
||||||
int64_t size = 0;
|
int64_t size = 0;
|
||||||
ret = calloc(1, sizeof(ktp_data_t));
|
ret = calloc(1, sizeof(ktp_data_t));
|
||||||
ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
|
ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
|
||||||
if (ret->seqs == 0) {
|
if (ret->seqs == 0)
|
||||||
|
{
|
||||||
free(ret);
|
free(ret);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_process_step0, tmp_diff);
|
||||||
|
#endif
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (!aux->copy_comment)
|
if (!aux->copy_comment)
|
||||||
for (i = 0; i < ret->n_seqs; ++i) {
|
for (i = 0; i < ret->n_seqs; ++i)
|
||||||
|
{
|
||||||
free(ret->seqs[i].comment);
|
free(ret->seqs[i].comment);
|
||||||
ret->seqs[i].comment = 0;
|
ret->seqs[i].comment = 0;
|
||||||
}
|
}
|
||||||
for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
|
for (i = 0; i < ret->n_seqs; ++i)
|
||||||
|
size += ret->seqs[i].l_seq;
|
||||||
if (bwa_verbose >= 3)
|
if (bwa_verbose >= 3)
|
||||||
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
|
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_process_step0, tmp_diff);
|
||||||
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
} else if (step == 1) {
|
}
|
||||||
|
else if (step == 1)
|
||||||
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
const mem_opt_t *opt = aux->opt;
|
const mem_opt_t *opt = aux->opt;
|
||||||
const bwaidx_t *idx = aux->idx;
|
const bwaidx_t *idx = aux->idx;
|
||||||
if (opt->flag & MEM_F_SMARTPE) {
|
if (opt->flag & MEM_F_SMARTPE)
|
||||||
|
{
|
||||||
bseq1_t *sep[2];
|
bseq1_t *sep[2];
|
||||||
int n_sep[2];
|
int n_sep[2];
|
||||||
mem_opt_t tmp_opt = *opt;
|
mem_opt_t tmp_opt = *opt;
|
||||||
bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
|
bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
|
||||||
if (bwa_verbose >= 3)
|
if (bwa_verbose >= 3)
|
||||||
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
|
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
|
||||||
if (n_sep[0]) {
|
if (n_sep[0])
|
||||||
|
{
|
||||||
tmp_opt.flag &= ~MEM_F_PE;
|
tmp_opt.flag &= ~MEM_F_PE;
|
||||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
|
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
|
||||||
for (i = 0; i < n_sep[0]; ++i)
|
for (i = 0; i < n_sep[0]; ++i)
|
||||||
data->seqs[sep[0][i].id].sam = sep[0][i].sam;
|
data->seqs[sep[0][i].id].sam = sep[0][i].sam;
|
||||||
}
|
}
|
||||||
if (n_sep[1]) {
|
if (n_sep[1])
|
||||||
|
{
|
||||||
tmp_opt.flag |= MEM_F_PE;
|
tmp_opt.flag |= MEM_F_PE;
|
||||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
|
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
|
||||||
for (i = 0; i < n_sep[1]; ++i)
|
for (i = 0; i < n_sep[1]; ++i)
|
||||||
data->seqs[sep[1][i].id].sam = sep[1][i].sam;
|
data->seqs[sep[1][i].id].sam = sep[1][i].sam;
|
||||||
}
|
}
|
||||||
free(sep[0]); free(sep[1]);
|
free(sep[0]);
|
||||||
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
|
free(sep[1]);
|
||||||
aux->n_processed += data->n_seqs;
|
|
||||||
return data;
|
|
||||||
} else if (step == 2) {
|
|
||||||
for (i = 0; i < data->n_seqs; ++i) {
|
|
||||||
if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
|
|
||||||
free(data->seqs[i].name); free(data->seqs[i].comment);
|
|
||||||
free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
|
|
||||||
}
|
}
|
||||||
free(data->seqs); free(data);
|
else
|
||||||
|
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
|
||||||
|
aux->n_processed += data->n_seqs;
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_process_step1, tmp_diff);
|
||||||
|
__sync_fetch_and_add(&count_process_step2, 1);
|
||||||
|
#endif
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
else if (step == 2)
|
||||||
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
|
for (i = 0; i < data->n_seqs; ++i)
|
||||||
|
{
|
||||||
|
if (data->seqs[i].sam)
|
||||||
|
err_fputs(data->seqs[i].sam, stdout);
|
||||||
|
free(data->seqs[i].name);
|
||||||
|
free(data->seqs[i].comment);
|
||||||
|
free(data->seqs[i].seq);
|
||||||
|
free(data->seqs[i].qual);
|
||||||
|
free(data->seqs[i].sam);
|
||||||
|
}
|
||||||
|
free(data->seqs);
|
||||||
|
free(data);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_process_step2, tmp_diff);
|
||||||
|
#endif
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -124,22 +206,36 @@ static void *process(void *shared, int step, void *_data)
|
||||||
|
|
||||||
static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
|
static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
|
||||||
{
|
{
|
||||||
if (opt0->a) { // matching score is changed
|
if (opt0->a)
|
||||||
if (!opt0->b) opt->b *= opt->a;
|
{ // matching score is changed
|
||||||
if (!opt0->T) opt->T *= opt->a;
|
if (!opt0->b)
|
||||||
if (!opt0->o_del) opt->o_del *= opt->a;
|
opt->b *= opt->a;
|
||||||
if (!opt0->e_del) opt->e_del *= opt->a;
|
if (!opt0->T)
|
||||||
if (!opt0->o_ins) opt->o_ins *= opt->a;
|
opt->T *= opt->a;
|
||||||
if (!opt0->e_ins) opt->e_ins *= opt->a;
|
if (!opt0->o_del)
|
||||||
if (!opt0->zdrop) opt->zdrop *= opt->a;
|
opt->o_del *= opt->a;
|
||||||
if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a;
|
if (!opt0->e_del)
|
||||||
if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a;
|
opt->e_del *= opt->a;
|
||||||
if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a;
|
if (!opt0->o_ins)
|
||||||
|
opt->o_ins *= opt->a;
|
||||||
|
if (!opt0->e_ins)
|
||||||
|
opt->e_ins *= opt->a;
|
||||||
|
if (!opt0->zdrop)
|
||||||
|
opt->zdrop *= opt->a;
|
||||||
|
if (!opt0->pen_clip5)
|
||||||
|
opt->pen_clip5 *= opt->a;
|
||||||
|
if (!opt0->pen_clip3)
|
||||||
|
opt->pen_clip3 *= opt->a;
|
||||||
|
if (!opt0->pen_unpaired)
|
||||||
|
opt->pen_unpaired *= opt->a;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int main_mem(int argc, char *argv[])
|
int main_mem(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
int64_t start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
mem_opt_t *opt, opt0;
|
mem_opt_t *opt, opt0;
|
||||||
int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
|
int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
|
||||||
int fixed_chunk_size = -1;
|
int fixed_chunk_size = -1;
|
||||||
|
|
@ -152,119 +248,187 @@ int main_mem(int argc, char *argv[])
|
||||||
|
|
||||||
memset(&aux, 0, sizeof(ktp_aux_t));
|
memset(&aux, 0, sizeof(ktp_aux_t));
|
||||||
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
||||||
for (i = 0; i < 4; ++i) pes[i].failed = 1;
|
for (i = 0; i < 4; ++i)
|
||||||
|
pes[i].failed = 1;
|
||||||
|
|
||||||
|
query_f = fopen("query.fa", "w");
|
||||||
|
target_f = fopen("target.fa", "w");
|
||||||
|
info_f = fopen("info.txt", "w");
|
||||||
|
|
||||||
aux.opt = opt = mem_opt_init();
|
aux.opt = opt = mem_opt_init();
|
||||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||||
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
|
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0)
|
||||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
{
|
||||||
else if (c == '1') no_mt_io = 1;
|
if (c == 'k')
|
||||||
else if (c == 'x') mode = optarg;
|
opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
else if (c == '1')
|
||||||
else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
|
no_mt_io = 1;
|
||||||
else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1;
|
else if (c == 'x')
|
||||||
else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
|
mode = optarg;
|
||||||
else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
|
else if (c == 'w')
|
||||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
opt->w = atoi(optarg), opt0.w = 1;
|
||||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
else if (c == 'A')
|
||||||
else if (c == 'a') opt->flag |= MEM_F_ALL;
|
opt->a = atoi(optarg), opt0.a = 1;
|
||||||
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
|
else if (c == 'B')
|
||||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
opt->b = atoi(optarg), opt0.b = 1;
|
||||||
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
|
else if (c == 'T')
|
||||||
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
|
opt->T = atoi(optarg), opt0.T = 1;
|
||||||
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
|
else if (c == 'U')
|
||||||
else if (c == '5') opt->flag |= MEM_F_PRIMARY5 | MEM_F_KEEP_SUPP_MAPQ; // always apply MEM_F_KEEP_SUPP_MAPQ with -5
|
opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
|
||||||
else if (c == 'q') opt->flag |= MEM_F_KEEP_SUPP_MAPQ;
|
else if (c == 't')
|
||||||
else if (c == 'u') opt->flag |= MEM_F_XB;
|
opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1 ? opt->n_threads : 1;
|
||||||
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
|
else if (c == 'P')
|
||||||
else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
|
opt->flag |= MEM_F_NOPAIRING;
|
||||||
else if (c == 'v') bwa_verbose = atoi(optarg);
|
else if (c == 'a')
|
||||||
else if (c == 'j') ignore_alt = 1;
|
opt->flag |= MEM_F_ALL;
|
||||||
else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
|
else if (c == 'p')
|
||||||
else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
|
opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
|
||||||
else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
|
else if (c == 'M')
|
||||||
else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
|
opt->flag |= MEM_F_NO_MULTI;
|
||||||
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
|
else if (c == 'S')
|
||||||
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
opt->flag |= MEM_F_NO_RESCUE;
|
||||||
else if (c == 'o' || c == 'f') xreopen(optarg, "wb", stdout);
|
else if (c == 'Y')
|
||||||
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
opt->flag |= MEM_F_SOFTCLIP;
|
||||||
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
else if (c == 'V')
|
||||||
else if (c == 'C') aux.copy_comment = 1;
|
opt->flag |= MEM_F_REF_HDR;
|
||||||
else if (c == 'K') fixed_chunk_size = atoi(optarg);
|
else if (c == '5')
|
||||||
else if (c == 'X') opt->mask_level = atof(optarg);
|
opt->flag |= MEM_F_PRIMARY5 | MEM_F_KEEP_SUPP_MAPQ; // always apply MEM_F_KEEP_SUPP_MAPQ with -5
|
||||||
else if (c == 'F') bwa_dbg = atoi(optarg);
|
else if (c == 'q')
|
||||||
else if (c == 'h') {
|
opt->flag |= MEM_F_KEEP_SUPP_MAPQ;
|
||||||
|
else if (c == 'u')
|
||||||
|
opt->flag |= MEM_F_XB;
|
||||||
|
else if (c == 'c')
|
||||||
|
opt->max_occ = atoi(optarg), opt0.max_occ = 1;
|
||||||
|
else if (c == 'd')
|
||||||
|
opt->zdrop = atoi(optarg), opt0.zdrop = 1;
|
||||||
|
else if (c == 'v')
|
||||||
|
bwa_verbose = atoi(optarg);
|
||||||
|
else if (c == 'j')
|
||||||
|
ignore_alt = 1;
|
||||||
|
else if (c == 'r')
|
||||||
|
opt->split_factor = atof(optarg), opt0.split_factor = 1.;
|
||||||
|
else if (c == 'D')
|
||||||
|
opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
|
||||||
|
else if (c == 'm')
|
||||||
|
opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
|
||||||
|
else if (c == 's')
|
||||||
|
opt->split_width = atoi(optarg), opt0.split_width = 1;
|
||||||
|
else if (c == 'G')
|
||||||
|
opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
|
||||||
|
else if (c == 'N')
|
||||||
|
opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
||||||
|
else if (c == 'o' || c == 'f')
|
||||||
|
xreopen(optarg, "wb", stdout);
|
||||||
|
else if (c == 'W')
|
||||||
|
opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
||||||
|
else if (c == 'y')
|
||||||
|
opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
||||||
|
else if (c == 'C')
|
||||||
|
aux.copy_comment = 1;
|
||||||
|
else if (c == 'K')
|
||||||
|
fixed_chunk_size = atoi(optarg);
|
||||||
|
else if (c == 'X')
|
||||||
|
opt->mask_level = atof(optarg);
|
||||||
|
else if (c == 'F')
|
||||||
|
bwa_dbg = atoi(optarg);
|
||||||
|
else if (c == 'h')
|
||||||
|
{
|
||||||
opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
|
opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
|
||||||
opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
|
opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
opt->max_XA_hits_alt = strtol(p+1, &p, 10);
|
opt->max_XA_hits_alt = strtol(p + 1, &p, 10);
|
||||||
}
|
}
|
||||||
else if (c == 'z') opt->XA_drop_ratio = atof(optarg);
|
else if (c == 'z')
|
||||||
else if (c == 'Q') {
|
opt->XA_drop_ratio = atof(optarg);
|
||||||
|
else if (c == 'Q')
|
||||||
|
{
|
||||||
opt0.mapQ_coef_len = 1;
|
opt0.mapQ_coef_len = 1;
|
||||||
opt->mapQ_coef_len = atoi(optarg);
|
opt->mapQ_coef_len = atoi(optarg);
|
||||||
opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
|
opt->mapQ_coef_fac = opt->mapQ_coef_len > 0 ? log(opt->mapQ_coef_len) : 0;
|
||||||
} else if (c == 'O') {
|
}
|
||||||
|
else if (c == 'O')
|
||||||
|
{
|
||||||
opt0.o_del = opt0.o_ins = 1;
|
opt0.o_del = opt0.o_ins = 1;
|
||||||
opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
|
opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
opt->o_ins = strtol(p+1, &p, 10);
|
opt->o_ins = strtol(p + 1, &p, 10);
|
||||||
} else if (c == 'E') {
|
}
|
||||||
|
else if (c == 'E')
|
||||||
|
{
|
||||||
opt0.e_del = opt0.e_ins = 1;
|
opt0.e_del = opt0.e_ins = 1;
|
||||||
opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
|
opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
opt->e_ins = strtol(p+1, &p, 10);
|
opt->e_ins = strtol(p + 1, &p, 10);
|
||||||
} else if (c == 'L') {
|
}
|
||||||
|
else if (c == 'L')
|
||||||
|
{
|
||||||
opt0.pen_clip5 = opt0.pen_clip3 = 1;
|
opt0.pen_clip5 = opt0.pen_clip3 = 1;
|
||||||
opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
|
opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
opt->pen_clip3 = strtol(p+1, &p, 10);
|
opt->pen_clip3 = strtol(p + 1, &p, 10);
|
||||||
} else if (c == 'R') {
|
}
|
||||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
|
else if (c == 'R')
|
||||||
} else if (c == 'H') {
|
{
|
||||||
if (optarg[0] != '@') {
|
if ((rg_line = bwa_set_rg(optarg)) == 0)
|
||||||
|
return 1; // FIXME: memory leak
|
||||||
|
}
|
||||||
|
else if (c == 'H')
|
||||||
|
{
|
||||||
|
if (optarg[0] != '@')
|
||||||
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
if ((fp = fopen(optarg, "r")) != 0) {
|
if ((fp = fopen(optarg, "r")) != 0)
|
||||||
|
{
|
||||||
char *buf;
|
char *buf;
|
||||||
buf = calloc(1, 0x10000);
|
buf = calloc(1, 0x10000);
|
||||||
while (fgets(buf, 0xffff, fp)) {
|
while (fgets(buf, 0xffff, fp))
|
||||||
|
{
|
||||||
i = strlen(buf);
|
i = strlen(buf);
|
||||||
assert(buf[i-1] == '\n'); // a long line
|
assert(buf[i - 1] == '\n'); // a long line
|
||||||
buf[i-1] = 0;
|
buf[i - 1] = 0;
|
||||||
hdr_line = bwa_insert_header(buf, hdr_line);
|
hdr_line = bwa_insert_header(buf, hdr_line);
|
||||||
}
|
}
|
||||||
free(buf);
|
free(buf);
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
} else hdr_line = bwa_insert_header(optarg, hdr_line);
|
}
|
||||||
} else if (c == 'I') { // specify the insert size distribution
|
else
|
||||||
|
hdr_line = bwa_insert_header(optarg, hdr_line);
|
||||||
|
}
|
||||||
|
else if (c == 'I')
|
||||||
|
{ // specify the insert size distribution
|
||||||
aux.pes0 = pes;
|
aux.pes0 = pes;
|
||||||
pes[1].failed = 0;
|
pes[1].failed = 0;
|
||||||
pes[1].avg = strtod(optarg, &p);
|
pes[1].avg = strtod(optarg, &p);
|
||||||
pes[1].std = pes[1].avg * .1;
|
pes[1].std = pes[1].avg * .1;
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
pes[1].std = strtod(p+1, &p);
|
pes[1].std = strtod(p + 1, &p);
|
||||||
pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
|
pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
|
||||||
pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499);
|
pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499);
|
||||||
if (pes[1].low < 1) pes[1].low = 1;
|
if (pes[1].low < 1)
|
||||||
|
pes[1].low = 1;
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
pes[1].high = (int)(strtod(p+1, &p) + .499);
|
pes[1].high = (int)(strtod(p + 1, &p) + .499);
|
||||||
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
if (*p != 0 && ispunct(*p) && isdigit(p[1]))
|
||||||
pes[1].low = (int)(strtod(p+1, &p) + .499);
|
pes[1].low = (int)(strtod(p + 1, &p) + .499);
|
||||||
if (bwa_verbose >= 3)
|
if (bwa_verbose >= 3)
|
||||||
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
|
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
|
||||||
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
|
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
|
||||||
}
|
}
|
||||||
else return 1;
|
else
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rg_line) {
|
if (rg_line)
|
||||||
|
{
|
||||||
hdr_line = bwa_insert_header(rg_line, hdr_line);
|
hdr_line = bwa_insert_header(rg_line, hdr_line);
|
||||||
free(rg_line);
|
free(rg_line);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opt->n_threads < 1) opt->n_threads = 1;
|
if (opt->n_threads < 1)
|
||||||
if (optind + 1 >= argc || optind + 3 < argc) {
|
opt->n_threads = 1;
|
||||||
|
if (optind + 1 >= argc || optind + 3 < argc)
|
||||||
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
|
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
|
||||||
fprintf(stderr, "Algorithm options:\n\n");
|
fprintf(stderr, "Algorithm options:\n\n");
|
||||||
|
|
@ -274,7 +438,7 @@ int main_mem(int argc, char *argv[])
|
||||||
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
||||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||||
fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
|
fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
|
||||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
||||||
fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n");
|
fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n");
|
||||||
|
|
@ -304,7 +468,7 @@ int main_mem(int argc, char *argv[])
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, " -v INT verbosity level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
fprintf(stderr, " -v INT verbosity level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
||||||
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
|
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
|
||||||
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >%.2f%% of the max score, output all in XA [%d,%d]\n",
|
fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >%.2f%% of the max score, output all in XA [%d,%d]\n",
|
||||||
opt->XA_drop_ratio * 100.0,
|
opt->XA_drop_ratio * 100.0,
|
||||||
opt->max_XA_hits, opt->max_XA_hits_alt);
|
opt->max_XA_hits, opt->max_XA_hits_alt);
|
||||||
fprintf(stderr, " A second value may be given for alternate sequences.\n");
|
fprintf(stderr, " A second value may be given for alternate sequences.\n");
|
||||||
|
|
@ -327,62 +491,103 @@ int main_mem(int argc, char *argv[])
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mode) {
|
if (mode)
|
||||||
if (strcmp(mode, "intractg") == 0) {
|
{
|
||||||
if (!opt0.o_del) opt->o_del = 16;
|
if (strcmp(mode, "intractg") == 0)
|
||||||
if (!opt0.o_ins) opt->o_ins = 16;
|
{
|
||||||
if (!opt0.b) opt->b = 9;
|
if (!opt0.o_del)
|
||||||
if (!opt0.pen_clip5) opt->pen_clip5 = 5;
|
opt->o_del = 16;
|
||||||
if (!opt0.pen_clip3) opt->pen_clip3 = 5;
|
if (!opt0.o_ins)
|
||||||
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) {
|
opt->o_ins = 16;
|
||||||
if (!opt0.o_del) opt->o_del = 1;
|
if (!opt0.b)
|
||||||
if (!opt0.e_del) opt->e_del = 1;
|
opt->b = 9;
|
||||||
if (!opt0.o_ins) opt->o_ins = 1;
|
if (!opt0.pen_clip5)
|
||||||
if (!opt0.e_ins) opt->e_ins = 1;
|
opt->pen_clip5 = 5;
|
||||||
if (!opt0.b) opt->b = 1;
|
if (!opt0.pen_clip3)
|
||||||
if (opt0.split_factor == 0.) opt->split_factor = 10.;
|
opt->pen_clip3 = 5;
|
||||||
if (strcmp(mode, "ont2d") == 0) {
|
}
|
||||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
|
else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0)
|
||||||
if (!opt0.min_seed_len) opt->min_seed_len = 14;
|
{
|
||||||
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
|
if (!opt0.o_del)
|
||||||
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
|
opt->o_del = 1;
|
||||||
} else {
|
if (!opt0.e_del)
|
||||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
|
opt->e_del = 1;
|
||||||
if (!opt0.min_seed_len) opt->min_seed_len = 17;
|
if (!opt0.o_ins)
|
||||||
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
|
opt->o_ins = 1;
|
||||||
if (!opt0.pen_clip3) opt->pen_clip3 = 0;
|
if (!opt0.e_ins)
|
||||||
|
opt->e_ins = 1;
|
||||||
|
if (!opt0.b)
|
||||||
|
opt->b = 1;
|
||||||
|
if (opt0.split_factor == 0.)
|
||||||
|
opt->split_factor = 10.;
|
||||||
|
if (strcmp(mode, "ont2d") == 0)
|
||||||
|
{
|
||||||
|
if (!opt0.min_chain_weight)
|
||||||
|
opt->min_chain_weight = 20;
|
||||||
|
if (!opt0.min_seed_len)
|
||||||
|
opt->min_seed_len = 14;
|
||||||
|
if (!opt0.pen_clip5)
|
||||||
|
opt->pen_clip5 = 0;
|
||||||
|
if (!opt0.pen_clip3)
|
||||||
|
opt->pen_clip3 = 0;
|
||||||
}
|
}
|
||||||
} else {
|
else
|
||||||
|
{
|
||||||
|
if (!opt0.min_chain_weight)
|
||||||
|
opt->min_chain_weight = 40;
|
||||||
|
if (!opt0.min_seed_len)
|
||||||
|
opt->min_seed_len = 17;
|
||||||
|
if (!opt0.pen_clip5)
|
||||||
|
opt->pen_clip5 = 0;
|
||||||
|
if (!opt0.pen_clip3)
|
||||||
|
opt->pen_clip3 = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode);
|
fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode);
|
||||||
return 1; // FIXME memory leak
|
return 1; // FIXME memory leak
|
||||||
}
|
}
|
||||||
} else update_a(opt, &opt0);
|
}
|
||||||
|
else
|
||||||
|
update_a(opt, &opt0);
|
||||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||||
|
|
||||||
aux.idx = bwa_idx_load_from_shm(argv[optind]);
|
aux.idx = bwa_idx_load_from_shm(argv[optind]);
|
||||||
if (aux.idx == 0) {
|
if (aux.idx == 0)
|
||||||
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
{
|
||||||
} else if (bwa_verbose >= 3)
|
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0)
|
||||||
|
return 1; // FIXME: memory leak
|
||||||
|
}
|
||||||
|
else if (bwa_verbose >= 3)
|
||||||
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
|
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
|
||||||
if (ignore_alt)
|
if (ignore_alt)
|
||||||
for (i = 0; i < aux.idx->bns->n_seqs; ++i)
|
for (i = 0; i < aux.idx->bns->n_seqs; ++i)
|
||||||
aux.idx->bns->anns[i].is_alt = 0;
|
aux.idx->bns->anns[i].is_alt = 0;
|
||||||
|
|
||||||
ko = kopen(argv[optind + 1], &fd);
|
ko = kopen(argv[optind + 1], &fd);
|
||||||
if (ko == 0) {
|
if (ko == 0)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
fp = gzdopen(fd, "r");
|
fp = gzdopen(fd, "r");
|
||||||
aux.ks = kseq_init(fp);
|
aux.ks = kseq_init(fp);
|
||||||
if (optind + 2 < argc) {
|
if (optind + 2 < argc)
|
||||||
if (opt->flag&MEM_F_PE) {
|
{
|
||||||
|
if (opt->flag & MEM_F_PE)
|
||||||
|
{
|
||||||
if (bwa_verbose >= 2)
|
if (bwa_verbose >= 2)
|
||||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
|
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
ko2 = kopen(argv[optind + 2], &fd2);
|
ko2 = kopen(argv[optind + 2], &fd2);
|
||||||
if (ko2 == 0) {
|
if (ko2 == 0)
|
||||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]);
|
{
|
||||||
|
if (bwa_verbose >= 1)
|
||||||
|
fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
fp2 = gzdopen(fd2, "r");
|
fp2 = gzdopen(fd2, "r");
|
||||||
|
|
@ -391,17 +596,64 @@ int main_mem(int argc, char *argv[])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
|
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
|
||||||
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
|
aux.actual_chunk_size = fixed_chunk_size > 0 ? fixed_chunk_size : opt->chunk_size * opt->n_threads;
|
||||||
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
|
#ifdef SHOW_PERF
|
||||||
|
int64_t tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_before_process, tmp_diff);
|
||||||
|
start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
|
kt_pipeline(no_mt_io ? 1 : 2, process, &aux, 3);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_process, tmp_diff);
|
||||||
|
start_time = get_mseconds();
|
||||||
|
#endif
|
||||||
free(hdr_line);
|
free(hdr_line);
|
||||||
free(opt);
|
free(opt);
|
||||||
bwa_idx_destroy(aux.idx);
|
bwa_idx_destroy(aux.idx);
|
||||||
kseq_destroy(aux.ks);
|
kseq_destroy(aux.ks);
|
||||||
err_gzclose(fp); kclose(ko);
|
err_gzclose(fp);
|
||||||
if (aux.ks2) {
|
kclose(ko);
|
||||||
|
if (aux.ks2)
|
||||||
|
{
|
||||||
kseq_destroy(aux.ks2);
|
kseq_destroy(aux.ks2);
|
||||||
err_gzclose(fp2); kclose(ko2);
|
err_gzclose(fp2);
|
||||||
|
kclose(ko2);
|
||||||
}
|
}
|
||||||
|
if (query_f != 0)
|
||||||
|
fclose(query_f);
|
||||||
|
if (target_f != 0)
|
||||||
|
fclose(target_f);
|
||||||
|
if (info_f != 0)
|
||||||
|
fclose(info_f);
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
tmp_diff = get_mseconds() - start_time;
|
||||||
|
__sync_fetch_and_add(&time_after_process, tmp_diff);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef SHOW_PERF
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads);
|
||||||
|
fprintf(stderr, "time_ksw_global2: %f s\n", time_ksw_global2 / 1000.0 / opt->n_threads);
|
||||||
|
fprintf(stderr, "time_ksw_align2: %f s\n", time_ksw_align2 / 1000.0 / opt->n_threads);
|
||||||
|
fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads);
|
||||||
|
// fprintf(stderr, "time_bwt_occ4: %f s\n", time_bwt_occ4 / 1000.0 / opt->n_threads);
|
||||||
|
fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads);
|
||||||
|
fprintf(stderr, "time_mem_chain: %f s\n", time_mem_chain / 1000.0 / opt->n_threads);
|
||||||
|
fprintf(stderr, "time_worker1: %f s\n", time_worker1 / 1000.0);
|
||||||
|
fprintf(stderr, "time_worker2: %f s\n", time_worker2 / 1000.0);
|
||||||
|
fprintf(stderr, "time_process_step0: %f s\n", time_process_step0 / 1000.0);
|
||||||
|
fprintf(stderr, "time_process_step1: %f s\n", time_process_step1 / 1000.0);
|
||||||
|
fprintf(stderr, "time_process_step2: %f s\n", time_process_step2 / 1000.0);
|
||||||
|
fprintf(stderr, "time_before_process: %f s\n", time_before_process / 1000.0);
|
||||||
|
fprintf(stderr, "time_process: %f s\n", time_process / 1000.0);
|
||||||
|
fprintf(stderr, "time_after_process: %f s\n", time_after_process / 1000.0);
|
||||||
|
fprintf(stderr, "count_process_step2: %ld cnts\n", count_process_step2);
|
||||||
|
fprintf(stderr, "count_ksw_extend2: %ld cnts\n", count_ksw_extend2);
|
||||||
|
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -416,18 +668,34 @@ int main_fastmap(int argc, char *argv[])
|
||||||
const bwtintv_v *a;
|
const bwtintv_v *a;
|
||||||
bwaidx_t *idx;
|
bwaidx_t *idx;
|
||||||
|
|
||||||
while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
|
while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0)
|
||||||
switch (c) {
|
{
|
||||||
case 'p': print_seq = 1; break;
|
switch (c)
|
||||||
case 'w': min_iwidth = atoi(optarg); break;
|
{
|
||||||
case 'l': min_len = atoi(optarg); break;
|
case 'p':
|
||||||
case 'i': min_intv = atoi(optarg); break;
|
print_seq = 1;
|
||||||
case 'I': max_intv = atol(optarg); break;
|
break;
|
||||||
case 'L': max_len = atoi(optarg); break;
|
case 'w':
|
||||||
default: return 1;
|
min_iwidth = atoi(optarg);
|
||||||
|
break;
|
||||||
|
case 'l':
|
||||||
|
min_len = atoi(optarg);
|
||||||
|
break;
|
||||||
|
case 'i':
|
||||||
|
min_intv = atoi(optarg);
|
||||||
|
break;
|
||||||
|
case 'I':
|
||||||
|
max_intv = atol(optarg);
|
||||||
|
break;
|
||||||
|
case 'L':
|
||||||
|
max_len = atoi(optarg);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (optind + 1 >= argc) {
|
if (optind + 1 >= argc)
|
||||||
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
|
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
|
||||||
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
|
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
|
||||||
|
|
@ -441,34 +709,47 @@ int main_fastmap(int argc, char *argv[])
|
||||||
|
|
||||||
fp = xzopen(argv[optind + 1], "r");
|
fp = xzopen(argv[optind + 1], "r");
|
||||||
seq = kseq_init(fp);
|
seq = kseq_init(fp);
|
||||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
|
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT | BWA_IDX_BNS)) == 0)
|
||||||
|
return 1;
|
||||||
itr = smem_itr_init(idx->bwt);
|
itr = smem_itr_init(idx->bwt);
|
||||||
smem_config(itr, min_intv, max_len, max_intv);
|
smem_config(itr, min_intv, max_len, max_intv);
|
||||||
while (kseq_read(seq) >= 0) {
|
while (kseq_read(seq) >= 0)
|
||||||
|
{
|
||||||
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
|
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
|
||||||
if (print_seq) {
|
if (print_seq)
|
||||||
|
{
|
||||||
err_putchar('\t');
|
err_putchar('\t');
|
||||||
err_puts(seq->seq.s);
|
err_puts(seq->seq.s);
|
||||||
} else err_putchar('\n');
|
}
|
||||||
|
else
|
||||||
|
err_putchar('\n');
|
||||||
for (i = 0; i < seq->seq.l; ++i)
|
for (i = 0; i < seq->seq.l; ++i)
|
||||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||||
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
|
smem_set_query(itr, seq->seq.l, (uint8_t *)seq->seq.s);
|
||||||
while ((a = smem_next(itr)) != 0) {
|
while ((a = smem_next(itr)) != 0)
|
||||||
for (i = 0; i < a->n; ++i) {
|
{
|
||||||
|
for (i = 0; i < a->n; ++i)
|
||||||
|
{
|
||||||
bwtintv_t *p = &a->a[i];
|
bwtintv_t *p = &a->a[i];
|
||||||
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
if ((uint32_t)p->info - (p->info >> 32) < min_len)
|
||||||
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
|
continue;
|
||||||
if (p->x[2] <= min_iwidth) {
|
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info >> 32), (uint32_t)p->info, (long)p->x[2]);
|
||||||
for (k = 0; k < p->x[2]; ++k) {
|
if (p->x[2] <= min_iwidth)
|
||||||
|
{
|
||||||
|
for (k = 0; k < p->x[2]; ++k)
|
||||||
|
{
|
||||||
bwtint_t pos;
|
bwtint_t pos;
|
||||||
int len, is_rev, ref_id;
|
int len, is_rev, ref_id;
|
||||||
len = (uint32_t)p->info - (p->info>>32);
|
len = (uint32_t)p->info - (p->info >> 32);
|
||||||
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
|
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
|
||||||
if (is_rev) pos -= len - 1;
|
if (is_rev)
|
||||||
|
pos -= len - 1;
|
||||||
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
|
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
|
||||||
err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
|
err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
|
||||||
}
|
}
|
||||||
} else err_puts("\t*");
|
}
|
||||||
|
else
|
||||||
|
err_puts("\t*");
|
||||||
err_putchar('\n');
|
err_putchar('\n');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,450 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <emmintrin.h>
|
||||||
|
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#define LIKELY(x) __builtin_expect((x), 1)
|
||||||
|
#define UNLIKELY(x) __builtin_expect((x), 0)
|
||||||
|
#else
|
||||||
|
#define LIKELY(x) (x)
|
||||||
|
#define UNLIKELY(x) (x)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef MAX
|
||||||
|
#undef MIN
|
||||||
|
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||||
|
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||||
|
#define SIMD_WIDTH 32
|
||||||
|
|
||||||
|
static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
||||||
|
{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0},
|
||||||
|
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}};
|
||||||
|
|
||||||
|
// static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
|
||||||
|
static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
|
||||||
|
|
||||||
|
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
|
||||||
|
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
|
||||||
|
// 初始化变量
|
||||||
|
#define SIMD_INIT \
|
||||||
|
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
|
||||||
|
__m256i zero_vec; \
|
||||||
|
__m256i max_vec; \
|
||||||
|
__m256i oe_del_vec; \
|
||||||
|
__m256i oe_ins_vec; \
|
||||||
|
__m256i e_del_vec; \
|
||||||
|
__m256i e_ins_vec; \
|
||||||
|
__m256i h_vec_mask[SIMD_WIDTH]; \
|
||||||
|
__m256i reverse_mask_vec; \
|
||||||
|
zero_vec = _mm256_setzero_si256(); \
|
||||||
|
oe_del_vec = _mm256_set1_epi8(oe_del); \
|
||||||
|
oe_ins_vec = _mm256_set1_epi8(oe_ins); \
|
||||||
|
e_del_vec = _mm256_set1_epi8(e_del); \
|
||||||
|
e_ins_vec = _mm256_set1_epi8(e_ins); \
|
||||||
|
__m256i match_sc_vec = _mm256_set1_epi8(a); \
|
||||||
|
__m256i mis_sc_vec = _mm256_set1_epi8(b); \
|
||||||
|
__m256i amb_sc_vec = _mm256_set1_epi8(1); \
|
||||||
|
__m256i amb_vec = _mm256_set1_epi8(4); \
|
||||||
|
reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask)); \
|
||||||
|
for (i = 0; i < SIMD_WIDTH; ++i) \
|
||||||
|
h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* e 表示当前ref的碱基被删除
|
||||||
|
* f 表示当前seq的碱基插入
|
||||||
|
* m 表示当前碱基匹配(可以相等,也可以不想等)
|
||||||
|
* h 表示最大值
|
||||||
|
*/
|
||||||
|
// load向量化数据
|
||||||
|
#define SIMD_LOAD \
|
||||||
|
__m256i m1 = _mm256_loadu_si256((__m256i *)(&mA1[j])); \
|
||||||
|
__m256i e1 = _mm256_loadu_si256((__m256i *)(&eA1[j])); \
|
||||||
|
__m256i m1j1 = _mm256_loadu_si256((__m256i *)(&mA1[j - 1])); \
|
||||||
|
__m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \
|
||||||
|
__m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \
|
||||||
|
__m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j - 1])); \
|
||||||
|
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));
|
||||||
|
|
||||||
|
// 比对ref和seq的序列,计算罚分
|
||||||
|
#define SIMD_CMP_SEQ \
|
||||||
|
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
|
||||||
|
ts_vec = _mm256_shuffle_epi8(ts_vec, reverse_mask_vec); \
|
||||||
|
__m256i match_mask_vec = _mm256_cmpeq_epi8(qs_vec, ts_vec); \
|
||||||
|
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
|
||||||
|
__m256i match_score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
|
||||||
|
__m256i q_amb_mask_vec = _mm256_cmpeq_epi8(qs_vec, amb_vec); \
|
||||||
|
__m256i t_amb_mask_vec = _mm256_cmpeq_epi8(ts_vec, amb_vec); \
|
||||||
|
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
|
||||||
|
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
|
||||||
|
mis_score_vec = _mm256_andnot_si256(amb_mask_vec, mis_score_vec); \
|
||||||
|
mis_score_vec = _mm256_or_si256(amb_score_vec, mis_score_vec); \
|
||||||
|
match_score_vec = _mm256_andnot_si256(amb_mask_vec, match_score_vec);
|
||||||
|
|
||||||
|
// 向量化计算h, e, f, m
|
||||||
|
#define SIMD_COMPUTE \
|
||||||
|
__m256i en_vec0 = _mm256_max_epu8(m1, oe_del_vec); \
|
||||||
|
en_vec0 = _mm256_subs_epu8(en_vec0, oe_del_vec); \
|
||||||
|
__m256i en_vec1 = _mm256_max_epu8(e1, e_del_vec); \
|
||||||
|
en_vec1 = _mm256_subs_epu8(en_vec1, e_del_vec); \
|
||||||
|
__m256i en_vec = _mm256_max_epu8(en_vec0, en_vec1); \
|
||||||
|
__m256i fn_vec0 = _mm256_max_epu8(m1j1, oe_ins_vec); \
|
||||||
|
fn_vec0 = _mm256_subs_epu8(fn_vec0, oe_ins_vec); \
|
||||||
|
__m256i fn_vec1 = _mm256_max_epu8(f1j1, e_ins_vec); \
|
||||||
|
fn_vec1 = _mm256_subs_epu8(fn_vec1, e_ins_vec); \
|
||||||
|
__m256i fn_vec = _mm256_max_epu8(fn_vec0, fn_vec1); \
|
||||||
|
__m256i mn_vec0 = _mm256_adds_epu8(h0j1, match_score_vec); \
|
||||||
|
mn_vec0 = _mm256_max_epu8(mn_vec0, mis_score_vec); \
|
||||||
|
mn_vec0 = _mm256_subs_epu8(mn_vec0, mis_score_vec); \
|
||||||
|
__m256i mn_mask = _mm256_cmpeq_epi8(h0j1, zero_vec); \
|
||||||
|
__m256i mn_vec = _mm256_andnot_si256(mn_mask, mn_vec0); \
|
||||||
|
__m256i hn_vec0 = _mm256_max_epu8(en_vec, fn_vec); \
|
||||||
|
__m256i hn_vec = _mm256_max_epu8(hn_vec0, mn_vec);
|
||||||
|
|
||||||
|
// 存储向量化结果
|
||||||
|
#define SIMD_STORE \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, hn_vec); \
|
||||||
|
_mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \
|
||||||
|
_mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \
|
||||||
|
_mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \
|
||||||
|
_mm256_storeu_si256((__m256i *)&hA2[j], hn_vec);
|
||||||
|
|
||||||
|
// 去除多余的部分
|
||||||
|
#define SIMD_REMOVE_EXTRA \
|
||||||
|
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \
|
||||||
|
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \
|
||||||
|
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \
|
||||||
|
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
|
||||||
|
|
||||||
|
// 找最大值和位置
|
||||||
|
#define SIMD_FIND_MAX \
|
||||||
|
uint8_t *maxVal = (uint8_t *)&max_vec; \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
|
||||||
|
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
|
||||||
|
m = maxVal[0]; \
|
||||||
|
if (m > 0) \
|
||||||
|
{ \
|
||||||
|
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
|
||||||
|
{ \
|
||||||
|
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
|
||||||
|
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
|
||||||
|
uint32_t mask = _mm256_movemask_epi8(vcmp); \
|
||||||
|
if (mask > 0) \
|
||||||
|
{ \
|
||||||
|
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
|
||||||
|
mj = j - 1 + pos; \
|
||||||
|
mi = i - 1 - pos; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
// 每轮迭代后,交换数组
|
||||||
|
#define SWAP_DATA_POINTER \
|
||||||
|
uint8_t *tmp = hA0; \
|
||||||
|
hA0 = hA1; \
|
||||||
|
hA1 = hA2; \
|
||||||
|
hA2 = tmp; \
|
||||||
|
tmp = eA1; \
|
||||||
|
eA1 = eA2; \
|
||||||
|
eA2 = tmp; \
|
||||||
|
tmp = fA1; \
|
||||||
|
fA1 = fA2; \
|
||||||
|
fA2 = tmp; \
|
||||||
|
tmp = mA1; \
|
||||||
|
mA1 = mA2; \
|
||||||
|
mA2 = tmp;
|
||||||
|
|
||||||
|
int ksw_extend2_avx2_u8(int qlen, // query length 待匹配段碱基的query长度
|
||||||
|
const uint8_t *query, // read碱基序列
|
||||||
|
int tlen, // target length reference的长度
|
||||||
|
const uint8_t *target, // reference序列
|
||||||
|
int is_left, // 是不是向左扩展
|
||||||
|
int m, // 碱基种类 (5)
|
||||||
|
const int8_t *mat, // 每个位置的query和target的匹配得分 m*m
|
||||||
|
int o_del, // deletion 错配开始的惩罚系数
|
||||||
|
int e_del, // deletion extension的惩罚系数
|
||||||
|
int o_ins, // insertion 错配开始的惩罚系数
|
||||||
|
int e_ins, // insertion extension的惩罚系数
|
||||||
|
int a, // 碱基match时的分数
|
||||||
|
int b, // 碱基mismatch时的惩罚分数(正数)
|
||||||
|
int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离
|
||||||
|
int end_bonus,
|
||||||
|
int zdrop,
|
||||||
|
int h0, // 该seed的初始得分(完全匹配query的碱基数)
|
||||||
|
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
|
||||||
|
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
|
||||||
|
int *_gtle, // query全部匹配上的target的长度
|
||||||
|
int *_gscore, // query的端到端匹配得分
|
||||||
|
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
|
||||||
|
{
|
||||||
|
uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M
|
||||||
|
uint8_t *seq, *ref;
|
||||||
|
uint8_t *mem, *qtmem, *vmem;
|
||||||
|
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
|
||||||
|
int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
|
||||||
|
int Dloop = tlen + qlen; // 循环跳出条件
|
||||||
|
int span, beg1, end1; // 边界条件计算
|
||||||
|
int col_size = qlen + 2 + SIMD_WIDTH;
|
||||||
|
int val_mem_size = (col_size * 9 + 31) >> 5 << 5; // 32字节的整数倍
|
||||||
|
int mem_size = seq_size + ref_size + val_mem_size;
|
||||||
|
|
||||||
|
SIMD_INIT; // 初始化simd用的数据
|
||||||
|
|
||||||
|
assert(h0 > 0);
|
||||||
|
|
||||||
|
// allocate memory
|
||||||
|
mem = malloc(mem_size);
|
||||||
|
qtmem = &mem[0];
|
||||||
|
seq = (uint8_t *)&qtmem[0];
|
||||||
|
ref = (uint8_t *)&qtmem[seq_size];
|
||||||
|
if (is_left)
|
||||||
|
{
|
||||||
|
for (i = 0; i < qlen; ++i)
|
||||||
|
seq[i] = query[qlen - 1 - i];
|
||||||
|
for (i = 0; i < tlen; ++i)
|
||||||
|
ref[i + SIMD_WIDTH] = target[tlen - 1 - i];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (i = 0; i < qlen; ++i)
|
||||||
|
seq[i] = query[i];
|
||||||
|
for (i = 0; i < tlen; ++i)
|
||||||
|
ref[i + SIMD_WIDTH] = target[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
vmem = &ref[ref_size];
|
||||||
|
for (i = 0; i < val_mem_size; i += SIMD_WIDTH)
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256((__m256i *)&vmem[i], zero_vec);
|
||||||
|
}
|
||||||
|
|
||||||
|
hA = &vmem[0];
|
||||||
|
mA = &vmem[col_size * 3];
|
||||||
|
eA = &vmem[col_size * 5];
|
||||||
|
fA = &vmem[col_size * 7];
|
||||||
|
|
||||||
|
hA0 = &hA[0];
|
||||||
|
hA1 = &hA[col_size];
|
||||||
|
hA2 = &hA1[col_size];
|
||||||
|
mA1 = &mA[0];
|
||||||
|
mA2 = &mA[col_size];
|
||||||
|
eA1 = &eA[0];
|
||||||
|
eA2 = &eA[col_size];
|
||||||
|
fA1 = &fA[0];
|
||||||
|
fA2 = &fA[col_size];
|
||||||
|
|
||||||
|
// adjust $w if it is too large
|
||||||
|
k = m * m;
|
||||||
|
// get the max score
|
||||||
|
for (i = 0, max = 0; i < k; ++i)
|
||||||
|
max = max > mat[i] ? max : mat[i];
|
||||||
|
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
|
||||||
|
max_ins = max_ins > 1 ? max_ins : 1;
|
||||||
|
w = w < max_ins ? w : max_ins;
|
||||||
|
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
|
||||||
|
max_del = max_del > 1 ? max_del : 1;
|
||||||
|
w = w < max_del ? w : max_del; // TODO: is this necessary?
|
||||||
|
if (tlen < qlen)
|
||||||
|
w = MIN(tlen - 1, w);
|
||||||
|
|
||||||
|
// DP loop
|
||||||
|
max = h0, max_i = max_j = -1;
|
||||||
|
max_ie = -1, gscore = -1;
|
||||||
|
;
|
||||||
|
max_off = 0;
|
||||||
|
beg = 1;
|
||||||
|
end = qlen;
|
||||||
|
// init h0
|
||||||
|
hA0[0] = h0; // 左上角
|
||||||
|
|
||||||
|
if (qlen == 0 || tlen == 0)
|
||||||
|
Dloop = 0; // 防止意外情况
|
||||||
|
if (w >= qlen)
|
||||||
|
{
|
||||||
|
max_ie = 0;
|
||||||
|
gscore = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int m_last = 0;
|
||||||
|
int iend;
|
||||||
|
|
||||||
|
for (D = 1; LIKELY(D < Dloop); ++D)
|
||||||
|
{
|
||||||
|
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
|
||||||
|
if (D > tlen)
|
||||||
|
{
|
||||||
|
span = MIN(Dloop - D, w);
|
||||||
|
beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
span = MIN(D - 1, w);
|
||||||
|
beg1 = MAX(1, ((D - w) / 2) + 1);
|
||||||
|
}
|
||||||
|
end1 = MIN(qlen, beg1 + span);
|
||||||
|
|
||||||
|
if (beg < beg1)
|
||||||
|
beg = beg1;
|
||||||
|
if (end > end1)
|
||||||
|
end = end1;
|
||||||
|
if (beg > end)
|
||||||
|
break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug
|
||||||
|
|
||||||
|
iend = D - (beg - 1); // ref开始计算的位置,倒序
|
||||||
|
span = end - beg;
|
||||||
|
iStart = iend - span - 1; // 0开始的ref索引位置
|
||||||
|
|
||||||
|
// 每一轮需要记录的数据
|
||||||
|
int m = 0, mj = -1, mi = -1;
|
||||||
|
max_vec = zero_vec;
|
||||||
|
|
||||||
|
// 要处理边界
|
||||||
|
// 左边界 处理f (insert)
|
||||||
|
if (iStart == 0)
|
||||||
|
{
|
||||||
|
hA1[end] = MAX(0, h0 - (o_ins + e_ins * end));
|
||||||
|
}
|
||||||
|
// 上边界
|
||||||
|
if (beg == 1)
|
||||||
|
{
|
||||||
|
hA1[0] = MAX(0, h0 - (o_del + e_del * iend));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
hA1[beg - 1] = 0;
|
||||||
|
eA1[beg - 1] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
|
||||||
|
{
|
||||||
|
// 取数据
|
||||||
|
SIMD_LOAD;
|
||||||
|
// 比对seq,计算罚分
|
||||||
|
SIMD_CMP_SEQ;
|
||||||
|
// 计算
|
||||||
|
SIMD_COMPUTE;
|
||||||
|
// 存储结果
|
||||||
|
SIMD_STORE;
|
||||||
|
}
|
||||||
|
// 剩下的计算单元
|
||||||
|
if (j <= end)
|
||||||
|
{
|
||||||
|
// 取数据
|
||||||
|
SIMD_LOAD;
|
||||||
|
// 比对seq,计算罚分
|
||||||
|
SIMD_CMP_SEQ;
|
||||||
|
// 计算
|
||||||
|
SIMD_COMPUTE;
|
||||||
|
// 去除多余计算的部分
|
||||||
|
SIMD_REMOVE_EXTRA;
|
||||||
|
// 存储结果
|
||||||
|
SIMD_STORE;
|
||||||
|
}
|
||||||
|
|
||||||
|
SIMD_FIND_MAX;
|
||||||
|
|
||||||
|
// 注意最后跳出循环j的值
|
||||||
|
j = end + 1;
|
||||||
|
|
||||||
|
if (j == qlen + 1)
|
||||||
|
{
|
||||||
|
max_ie = gscore > hA2[qlen] ? max_ie : iStart;
|
||||||
|
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
|
||||||
|
}
|
||||||
|
if (m == 0 && m_last == 0)
|
||||||
|
break; // 一定要注意,斜对角遍历和按列遍历的不同点
|
||||||
|
if (m > max)
|
||||||
|
{
|
||||||
|
max = m, max_i = mi, max_j = mj;
|
||||||
|
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
|
||||||
|
}
|
||||||
|
else if (zdrop > 0)
|
||||||
|
{
|
||||||
|
if (mi - max_i > mj - max_j)
|
||||||
|
{
|
||||||
|
if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 调整计算的边界
|
||||||
|
for (j = beg; LIKELY(j <= end); ++j)
|
||||||
|
{
|
||||||
|
int has_val = hA1[j - 1] | hA2[j];
|
||||||
|
if (has_val)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
beg = j;
|
||||||
|
for (j = end + 1; LIKELY(j >= beg); --j)
|
||||||
|
{
|
||||||
|
int has_val = hA1[j - 1] | hA2[j];
|
||||||
|
if (has_val)
|
||||||
|
break;
|
||||||
|
else
|
||||||
|
hA0[j - 1] = 0;
|
||||||
|
}
|
||||||
|
end = j + 1 <= qlen ? j + 1 : qlen;
|
||||||
|
|
||||||
|
m_last = m;
|
||||||
|
// swap m, h, e, f
|
||||||
|
SWAP_DATA_POINTER;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(mem);
|
||||||
|
if (_qle)
|
||||||
|
*_qle = max_j + 1;
|
||||||
|
if (_tle)
|
||||||
|
*_tle = max_i + 1;
|
||||||
|
if (_gtle)
|
||||||
|
*_gtle = max_ie + 1;
|
||||||
|
if (_gscore)
|
||||||
|
*_gscore = gscore;
|
||||||
|
if (_max_off)
|
||||||
|
*_max_off = max_off;
|
||||||
|
return max;
|
||||||
|
}
|
||||||
23
run.sh
23
run.sh
|
|
@ -1,23 +1,26 @@
|
||||||
time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||||
/home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
/home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||||
/home/zzh/data/fastq/nm1.fq \
|
/home/zzh/data/fastq/nm1.fq \
|
||||||
/home/zzh/data/fastq/nm2.fq \
|
/home/zzh/data/fastq/nm2.fq \
|
||||||
-o /dev/null
|
-o /dev/null
|
||||||
|
|
||||||
|
# time ./bwa mem -t 1 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||||
|
# /public/home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||||
|
# /public/home/zzh/data/fastq/ZY2003109152013000/nm1.fq \
|
||||||
|
# /public/home/zzh/data/fastq/ZY2003109152013000/nm2.fq \
|
||||||
|
# -o /dev/null
|
||||||
|
|
||||||
# time ./bwa mem -t 64 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
# time ./bwa mem -t 64 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||||
# /public/home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
# /public/home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||||
# /public/home/zzh/data/fastq/ZY2003109152013000/nm1.fq \
|
# /public/home/zzh/data/fastq/ZY2003109152013000/nm1.fq \
|
||||||
# /public/home/zzh/data/fastq/ZY2003109152013000/nm2.fq \
|
# /public/home/zzh/data/fastq/ZY2003109152013000/nm2.fq \
|
||||||
# -o /dev/null
|
# -o /dev/null
|
||||||
|
|
||||||
#/public/home/zzh/data/fastq/n1.fq \
|
#/public/home/zzh/data/fastq/n1.fq \
|
||||||
#/public/home/zzh/data/fastq/n2.fq \
|
#/public/home/zzh/data/fastq/n2.fq \
|
||||||
|
#/share_nas3/zyseq-release-v1.1.3/zyseq/wes/resource/reference/human_g1k_v37_decoy.fasta \
|
||||||
|
#/share_nas3/zyseq-release-v1.1.3/zyseq/data/n1.fq \
|
||||||
#/share_nas3/zyseq-release-v1.1.3/zyseq/wes/resource/reference/human_g1k_v37_decoy.fasta \
|
#/share_nas3/zyseq-release-v1.1.3/zyseq/data/n2.fq \
|
||||||
#/share_nas3/zyseq-release-v1.1.3/zyseq/data/n1.fq \
|
#-o reads_mapping.sam
|
||||||
#/share_nas3/zyseq-release-v1.1.3/zyseq/data/n2.fq \
|
|
||||||
|
|
||||||
#-o reads_mapping.sam
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue