添加了一些注释
This commit is contained in:
parent
e000ecbf45
commit
44fe799773
|
|
@ -0,0 +1,41 @@
|
||||||
|
{
|
||||||
|
// 使用 IntelliSense 了解相关属性。
|
||||||
|
// 悬停以查看现有属性的描述。
|
||||||
|
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "bwa-mem",
|
||||||
|
"preLaunchTask": "Build",
|
||||||
|
"type": "cppdbg",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceRoot}/bwa",
|
||||||
|
"args": [
|
||||||
|
"mem",
|
||||||
|
"-t",
|
||||||
|
"1",
|
||||||
|
"-M",
|
||||||
|
"-R",
|
||||||
|
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||||
|
"/home/zzh/data/reference/human_g1k_v37_decoy.fasta",
|
||||||
|
"/home/zzh/data/fastq/n_s1.fq",
|
||||||
|
"/home/zzh/data/fastq/n_s2.fq",
|
||||||
|
"-o",
|
||||||
|
"/dev/null"
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "index",
|
||||||
|
"preLaunchTask": "Build",
|
||||||
|
"type": "cppdbg",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceRoot}/bwa",
|
||||||
|
"args": [
|
||||||
|
"index",
|
||||||
|
"/mnt/d/data/reference/human_g1k_v37_decoy.fasta"
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"files.associations": {
|
||||||
|
"bwt.h": "c",
|
||||||
|
"bwa.h": "c",
|
||||||
|
"malloc_wrap.h": "c",
|
||||||
|
"bntseq.h": "c",
|
||||||
|
"utils.h": "c",
|
||||||
|
"rle.h": "c",
|
||||||
|
"rope.h": "c",
|
||||||
|
"random": "c",
|
||||||
|
"kseq.h": "c"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
// See https://go.microsoft.com/fwlink/?LinkId=733558
|
||||||
|
// for the documentation about the tasks.json format
|
||||||
|
"version": "2.0.0",
|
||||||
|
"tasks": [
|
||||||
|
{
|
||||||
|
"label": "Build",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "make clean; make -j 16",
|
||||||
|
"problemMatcher": [],
|
||||||
|
"group": {
|
||||||
|
"kind": "build",
|
||||||
|
"isDefault": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
3
Makefile
3
Makefile
|
|
@ -1,6 +1,7 @@
|
||||||
CC= gcc
|
CC= gcc
|
||||||
#CC= clang --analyze
|
#CC= clang --analyze
|
||||||
CFLAGS= -g -Wall -Wno-unused-function -O2
|
# CFLAGS= -g -Wall -Wno-unused-function -O2
|
||||||
|
CFLAGS= -g -Wall -Wno-unused-function #-O2
|
||||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||||
|
|
|
||||||
36
bntseq.c
36
bntseq.c
|
|
@ -231,39 +231,39 @@ void bns_destroy(bntseq_t *bns)
|
||||||
|
|
||||||
static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
|
static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
|
||||||
{
|
{
|
||||||
bntann1_t *p;
|
bntann1_t *p; // 染色体,contig
|
||||||
int i, lasts;
|
int i, lasts;
|
||||||
if (bns->n_seqs == *m_seqs) {
|
if (bns->n_seqs == *m_seqs) { // 空间不够,重新开辟空间,n_seqs表示contig(染色体)数量
|
||||||
*m_seqs <<= 1;
|
*m_seqs <<= 1;
|
||||||
bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
|
bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
|
||||||
}
|
}
|
||||||
p = bns->anns + bns->n_seqs;
|
p = bns->anns + bns->n_seqs; // p表示当前要读入的contig
|
||||||
p->name = strdup((char*)seq->name.s);
|
p->name = strdup((char*)seq->name.s); // contig名字,1,2,3... X,Y
|
||||||
p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
|
p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); // 染色体注释,名称等信息
|
||||||
p->gi = 0; p->len = seq->seq.l;
|
p->gi = 0; p->len = seq->seq.l; // contig长度
|
||||||
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; // offset表示该contig在所有序列中的偏移位置
|
||||||
p->n_ambs = 0;
|
p->n_ambs = 0; // 模棱两可碱基的个数
|
||||||
for (i = lasts = 0; i < seq->seq.l; ++i) {
|
for (i = lasts = 0; i < seq->seq.l; ++i) { // 挨个读取该contig的碱基
|
||||||
int c = nst_nt4_table[(int)seq->seq.s[i]];
|
int c = nst_nt4_table[(int)seq->seq.s[i]]; // 碱基编码
|
||||||
if (c >= 4) { // N
|
if (c >= 4) { // N
|
||||||
if (lasts == seq->seq.s[i]) { // contiguous N
|
if (lasts == seq->seq.s[i]) { // contiguous N
|
||||||
++(*q)->len;
|
++(*q)->len; // 该连续的模棱两可碱基长度+1
|
||||||
} else {
|
} else { // 新一串模棱两可碱基
|
||||||
if (bns->n_holes == *m_holes) {
|
if (bns->n_holes == *m_holes) { // 模棱两可碱基串容量不够,扩容
|
||||||
(*m_holes) <<= 1;
|
(*m_holes) <<= 1;
|
||||||
bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
|
bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
|
||||||
}
|
}
|
||||||
*q = bns->ambs + bns->n_holes;
|
*q = bns->ambs + bns->n_holes;
|
||||||
(*q)->len = 1;
|
(*q)->len = 1;
|
||||||
(*q)->offset = p->offset + i;
|
(*q)->offset = p->offset + i; // 模棱两可碱基偏移
|
||||||
(*q)->amb = seq->seq.s[i];
|
(*q)->amb = seq->seq.s[i];
|
||||||
++p->n_ambs;
|
++p->n_ambs; // 该contig包括的模棱两可碱基数量
|
||||||
++bns->n_holes;
|
++bns->n_holes; // 模棱两可碱基串数量
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lasts = seq->seq.s[i];
|
lasts = seq->seq.s[i]; // 保存当前字符,用来下一轮比较
|
||||||
{ // fill buffer
|
{ // fill buffer
|
||||||
if (c >= 4) c = lrand48()&3;
|
if (c >= 4) c = lrand48()&3; // 如果是模棱两可碱基,那就随机给分配一个ATGC
|
||||||
if (bns->l_pac == *m_pac) { // double the pac size
|
if (bns->l_pac == *m_pac) { // double the pac size
|
||||||
*m_pac <<= 1;
|
*m_pac <<= 1;
|
||||||
pac = realloc(pac, *m_pac/4);
|
pac = realloc(pac, *m_pac/4);
|
||||||
|
|
|
||||||
4
bntseq.h
4
bntseq.h
|
|
@ -57,9 +57,9 @@ typedef struct {
|
||||||
int64_t l_pac;
|
int64_t l_pac;
|
||||||
int32_t n_seqs;
|
int32_t n_seqs;
|
||||||
uint32_t seed;
|
uint32_t seed;
|
||||||
bntann1_t *anns; // n_seqs elements
|
bntann1_t *anns; // n_seqs elements 染色体
|
||||||
int32_t n_holes;
|
int32_t n_holes;
|
||||||
bntamb1_t *ambs; // n_holes elements
|
bntamb1_t *ambs; // n_holes elements 非AGCT字符
|
||||||
FILE *fp_pac;
|
FILE *fp_pac;
|
||||||
} bntseq_t;
|
} bntseq_t;
|
||||||
|
|
||||||
|
|
|
||||||
26
bwt.c
26
bwt.c
|
|
@ -39,6 +39,7 @@
|
||||||
# include "malloc_wrap.h"
|
# include "malloc_wrap.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// 计算一个字节构成的A,T,C,G序列,对应的每个碱基的个数,因为最多有4个相同的碱基,所以每次左移3位就行
|
||||||
void bwt_gen_cnt_table(bwt_t *bwt)
|
void bwt_gen_cnt_table(bwt_t *bwt)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
@ -52,17 +53,23 @@ void bwt_gen_cnt_table(bwt_t *bwt)
|
||||||
|
|
||||||
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
|
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
|
||||||
{
|
{
|
||||||
bwtint_t x = k - (k > bwt->primary);
|
bwtint_t x = k - (k > bwt->primary); // bwt中不包含$,所以位置超过序列长度后,要减掉1,(因为后边加上了互补链)
|
||||||
x = bwt_B0(bwt, x);
|
// x = bwt_B0(bwt, x); // 获取x位置对应的字符
|
||||||
x = bwt->L2[x] + bwt_occ(bwt, k, x);
|
|
||||||
|
uint32_t t1 = (bwt->bwt[((x) >> 7 << 4) + sizeof(bwtint_t) + (((x) & 0x7f) >> 4)]);
|
||||||
|
uint32_t t2 = t1 >> ((~(x) & 0xf) << 1) & 3;
|
||||||
|
|
||||||
|
x = t2;
|
||||||
|
|
||||||
|
x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置(后缀索引),
|
||||||
return k == bwt->primary? 0 : x;
|
return k == bwt->primary? 0 : x;
|
||||||
}
|
}
|
||||||
|
|
||||||
// bwt->bwt and bwt->occ must be precalculated
|
// bwt->bwt and bwt->occ must be precalculated
|
||||||
void bwt_cal_sa(bwt_t *bwt, int intv)
|
void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||||
{
|
{
|
||||||
bwtint_t isa, sa, i; // S(isa) = sa
|
bwtint_t isa, sa, i; // S(isa) = sa isa是后缀数组的索引,sa表示位置
|
||||||
int intv_round = intv;
|
int intv_round = intv; // 间隔多少来保存一个位置信息
|
||||||
|
|
||||||
kv_roundup32(intv_round);
|
kv_roundup32(intv_round);
|
||||||
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
||||||
|
|
@ -71,13 +78,13 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||||
if (bwt->sa) free(bwt->sa);
|
if (bwt->sa) free(bwt->sa);
|
||||||
bwt->sa_intv = intv;
|
bwt->sa_intv = intv;
|
||||||
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
||||||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
|
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); // 用64位无符号整数来保存位置信息,其实用33位就够了
|
||||||
// calculate SA value
|
// calculate SA value
|
||||||
isa = 0; sa = bwt->seq_len;
|
isa = 0; sa = bwt->seq_len;
|
||||||
for (i = 0; i < bwt->seq_len; ++i) {
|
for (i = 0; i < bwt->seq_len; ++i) {
|
||||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
if (isa % intv == 0) bwt->sa[isa/intv] = sa; // 第一个位置是$,所以位置就是序列长度
|
||||||
--sa;
|
--sa; // 从后往前,一个位置一个位置的找对应的后缀数组,isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
|
||||||
isa = bwt_invPsi(bwt, isa);
|
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
|
||||||
}
|
}
|
||||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
||||||
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
||||||
|
|
@ -286,6 +293,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// NOTE: $max_intv is not currently used in BWA-MEM
|
// NOTE: $max_intv is not currently used in BWA-MEM
|
||||||
|
// 找smem(seed)
|
||||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||||
{
|
{
|
||||||
int i, j, c, ret;
|
int i, j, c, ret;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue