添加了一些注释

This commit is contained in:
zzh 2023-12-24 17:23:14 +08:00
parent e000ecbf45
commit 44fe799773
7 changed files with 110 additions and 30 deletions

41
.vscode/launch.json vendored 100644
View File

@ -0,0 +1,41 @@
{
// 使 IntelliSense
//
// 访: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "bwa-mem",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/bwa",
"args": [
"mem",
"-t",
"1",
"-M",
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"/home/zzh/data/reference/human_g1k_v37_decoy.fasta",
"/home/zzh/data/fastq/n_s1.fq",
"/home/zzh/data/fastq/n_s2.fq",
"-o",
"/dev/null"
],
"cwd": "${workspaceFolder}", //
},
{
"name": "index",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/bwa",
"args": [
"index",
"/mnt/d/data/reference/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", //
}
]
}

13
.vscode/settings.json vendored 100644
View File

@ -0,0 +1,13 @@
{
"files.associations": {
"bwt.h": "c",
"bwa.h": "c",
"malloc_wrap.h": "c",
"bntseq.h": "c",
"utils.h": "c",
"rle.h": "c",
"rope.h": "c",
"random": "c",
"kseq.h": "c"
}
}

17
.vscode/tasks.json vendored 100644
View File

@ -0,0 +1,17 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
"version": "2.0.0",
"tasks": [
{
"label": "Build",
"type": "shell",
"command": "make clean; make -j 16",
"problemMatcher": [],
"group": {
"kind": "build",
"isDefault": true
}
}
]
}

View File

@ -1,6 +1,7 @@
CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -Wno-unused-function -O2
# CFLAGS= -g -Wall -Wno-unused-function -O2
CFLAGS= -g -Wall -Wno-unused-function #-O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)

View File

@ -231,39 +231,39 @@ void bns_destroy(bntseq_t *bns)
static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
{
bntann1_t *p;
bntann1_t *p; // 染色体contig
int i, lasts;
if (bns->n_seqs == *m_seqs) {
if (bns->n_seqs == *m_seqs) { // 空间不够重新开辟空间n_seqs表示contig染色体数量
*m_seqs <<= 1;
bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
}
p = bns->anns + bns->n_seqs;
p->name = strdup((char*)seq->name.s);
p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
p->gi = 0; p->len = seq->seq.l;
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0;
for (i = lasts = 0; i < seq->seq.l; ++i) {
int c = nst_nt4_table[(int)seq->seq.s[i]];
p = bns->anns + bns->n_seqs; // p表示当前要读入的contig
p->name = strdup((char*)seq->name.s); // contig名字1,2,3... X,Y
p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); // 染色体注释,名称等信息
p->gi = 0; p->len = seq->seq.l; // contig长度
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; // offset表示该contig在所有序列中的偏移位置
p->n_ambs = 0; // 模棱两可碱基的个数
for (i = lasts = 0; i < seq->seq.l; ++i) { // 挨个读取该contig的碱基
int c = nst_nt4_table[(int)seq->seq.s[i]]; // 碱基编码
if (c >= 4) { // N
if (lasts == seq->seq.s[i]) { // contiguous N
++(*q)->len;
} else {
if (bns->n_holes == *m_holes) {
++(*q)->len; // 该连续的模棱两可碱基长度+1
} else { // 新一串模棱两可碱基
if (bns->n_holes == *m_holes) { // 模棱两可碱基串容量不够,扩容
(*m_holes) <<= 1;
bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
}
*q = bns->ambs + bns->n_holes;
(*q)->len = 1;
(*q)->offset = p->offset + i;
(*q)->offset = p->offset + i; // 模棱两可碱基偏移
(*q)->amb = seq->seq.s[i];
++p->n_ambs;
++bns->n_holes;
++p->n_ambs; // 该contig包括的模棱两可碱基数量
++bns->n_holes; // 模棱两可碱基串数量
}
}
lasts = seq->seq.s[i];
lasts = seq->seq.s[i]; // 保存当前字符,用来下一轮比较
{ // fill buffer
if (c >= 4) c = lrand48()&3;
if (c >= 4) c = lrand48()&3; // 如果是模棱两可碱基那就随机给分配一个ATGC
if (bns->l_pac == *m_pac) { // double the pac size
*m_pac <<= 1;
pac = realloc(pac, *m_pac/4);

View File

@ -57,9 +57,9 @@ typedef struct {
int64_t l_pac;
int32_t n_seqs;
uint32_t seed;
bntann1_t *anns; // n_seqs elements
bntann1_t *anns; // n_seqs elements 染色体
int32_t n_holes;
bntamb1_t *ambs; // n_holes elements
bntamb1_t *ambs; // n_holes elements 非AGCT字符
FILE *fp_pac;
} bntseq_t;

26
bwt.c
View File

@ -39,6 +39,7 @@
# include "malloc_wrap.h"
#endif
// 计算一个字节构成的A,T,C,G序列对应的每个碱基的个数因为最多有4个相同的碱基所以每次左移3位就行
void bwt_gen_cnt_table(bwt_t *bwt)
{
int i, j;
@ -52,17 +53,23 @@ void bwt_gen_cnt_table(bwt_t *bwt)
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
{
bwtint_t x = k - (k > bwt->primary);
x = bwt_B0(bwt, x);
x = bwt->L2[x] + bwt_occ(bwt, k, x);
bwtint_t x = k - (k > bwt->primary); // bwt中不包含$所以位置超过序列长度后要减掉1因为后边加上了互补链
// x = bwt_B0(bwt, x); // 获取x位置对应的字符
uint32_t t1 = (bwt->bwt[((x) >> 7 << 4) + sizeof(bwtint_t) + (((x) & 0x7f) >> 4)]);
uint32_t t2 = t1 >> ((~(x) & 0xf) << 1) & 3;
x = t2;
x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置后缀索引
return k == bwt->primary? 0 : x;
}
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_sa(bwt_t *bwt, int intv)
{
bwtint_t isa, sa, i; // S(isa) = sa
int intv_round = intv;
bwtint_t isa, sa, i; // S(isa) = sa isa是后缀数组的索引sa表示位置
int intv_round = intv; // 间隔多少来保存一个位置信息
kv_roundup32(intv_round);
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
@ -71,13 +78,13 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
if (bwt->sa) free(bwt->sa);
bwt->sa_intv = intv;
bwt->n_sa = (bwt->seq_len + intv) / intv;
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); // 用64位无符号整数来保存位置信息其实用33位就够了
// calculate SA value
isa = 0; sa = bwt->seq_len;
for (i = 0; i < bwt->seq_len; ++i) {
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
--sa;
isa = bwt_invPsi(bwt, isa);
if (isa % intv == 0) bwt->sa[isa/intv] = sa; // 第一个位置是$,所以位置就是序列长度
--sa; // 从后往前一个位置一个位置的找对应的后缀数组isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
}
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
@ -286,6 +293,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
// NOTE: $max_intv is not currently used in BWA-MEM
// 找smemseed
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;