From 44fe79977303cb523eba8e8f8e4e0d0ce4291f4b Mon Sep 17 00:00:00 2001 From: zzh Date: Sun, 24 Dec 2023 17:23:14 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 41 +++++++++++++++++++++++++++++++++++++++++ .vscode/settings.json | 13 +++++++++++++ .vscode/tasks.json | 17 +++++++++++++++++ Makefile | 3 ++- bntseq.c | 36 ++++++++++++++++++------------------ bntseq.h | 4 ++-- bwt.c | 26 +++++++++++++++++--------- 7 files changed, 110 insertions(+), 30 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..791fe75 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,41 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "bwa-mem", + "preLaunchTask": "Build", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceRoot}/bwa", + "args": [ + "mem", + "-t", + "1", + "-M", + "-R", + "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", + "/home/zzh/data/reference/human_g1k_v37_decoy.fasta", + "/home/zzh/data/fastq/n_s1.fq", + "/home/zzh/data/fastq/n_s2.fq", + "-o", + "/dev/null" + ], + "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 + }, + { + "name": "index", + "preLaunchTask": "Build", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceRoot}/bwa", + "args": [ + "index", + "/mnt/d/data/reference/human_g1k_v37_decoy.fasta" + ], + "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..433979a --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,13 @@ +{ + "files.associations": { + "bwt.h": "c", + "bwa.h": "c", + "malloc_wrap.h": "c", + "bntseq.h": "c", + "utils.h": "c", + "rle.h": "c", + "rope.h": "c", + "random": "c", + "kseq.h": "c" + } +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..f76ae19 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,17 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "2.0.0", + "tasks": [ + { + "label": "Build", + "type": "shell", + "command": "make clean; make -j 16", + "problemMatcher": [], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} \ No newline at end of file diff --git a/Makefile b/Makefile index 5480536..3169076 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ CC= gcc #CC= clang --analyze -CFLAGS= -g -Wall -Wno-unused-function -O2 +# CFLAGS= -g -Wall -Wno-unused-function -O2 +CFLAGS= -g -Wall -Wno-unused-function #-O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) diff --git a/bntseq.c b/bntseq.c index 6380689..c844992 100644 --- a/bntseq.c +++ b/bntseq.c @@ -231,39 +231,39 @@ void bns_destroy(bntseq_t *bns) static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) { - bntann1_t *p; + bntann1_t *p; // 染色体,contig int i, lasts; - if (bns->n_seqs == *m_seqs) { + if (bns->n_seqs == *m_seqs) { // 空间不够,重新开辟空间,n_seqs表示contig(染色体)数量 *m_seqs <<= 1; bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); } - p = bns->anns + bns->n_seqs; - p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); - p->gi = 0; p->len = seq->seq.l; - p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; - p->n_ambs = 0; - for (i = lasts = 0; i < seq->seq.l; ++i) { - int c = nst_nt4_table[(int)seq->seq.s[i]]; + p = bns->anns + bns->n_seqs; // p表示当前要读入的contig + p->name = strdup((char*)seq->name.s); // contig名字,1,2,3... X,Y + p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); // 染色体注释,名称等信息 + p->gi = 0; p->len = seq->seq.l; // contig长度 + p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; // offset表示该contig在所有序列中的偏移位置 + p->n_ambs = 0; // 模棱两可碱基的个数 + for (i = lasts = 0; i < seq->seq.l; ++i) { // 挨个读取该contig的碱基 + int c = nst_nt4_table[(int)seq->seq.s[i]]; // 碱基编码 if (c >= 4) { // N if (lasts == seq->seq.s[i]) { // contiguous N - ++(*q)->len; - } else { - if (bns->n_holes == *m_holes) { + ++(*q)->len; // 该连续的模棱两可碱基长度+1 + } else { // 新一串模棱两可碱基 + if (bns->n_holes == *m_holes) { // 模棱两可碱基串容量不够,扩容 (*m_holes) <<= 1; bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); } *q = bns->ambs + bns->n_holes; (*q)->len = 1; - (*q)->offset = p->offset + i; + (*q)->offset = p->offset + i; // 模棱两可碱基偏移 (*q)->amb = seq->seq.s[i]; - ++p->n_ambs; - ++bns->n_holes; + ++p->n_ambs; // 该contig包括的模棱两可碱基数量 + ++bns->n_holes; // 模棱两可碱基串数量 } } - lasts = seq->seq.s[i]; + lasts = seq->seq.s[i]; // 保存当前字符,用来下一轮比较 { // fill buffer - if (c >= 4) c = lrand48()&3; + if (c >= 4) c = lrand48()&3; // 如果是模棱两可碱基,那就随机给分配一个ATGC if (bns->l_pac == *m_pac) { // double the pac size *m_pac <<= 1; pac = realloc(pac, *m_pac/4); diff --git a/bntseq.h b/bntseq.h index 92e3afc..354adb6 100644 --- a/bntseq.h +++ b/bntseq.h @@ -57,9 +57,9 @@ typedef struct { int64_t l_pac; int32_t n_seqs; uint32_t seed; - bntann1_t *anns; // n_seqs elements + bntann1_t *anns; // n_seqs elements 染色体 int32_t n_holes; - bntamb1_t *ambs; // n_holes elements + bntamb1_t *ambs; // n_holes elements 非AGCT字符 FILE *fp_pac; } bntseq_t; diff --git a/bwt.c b/bwt.c index 9083654..6fa38fd 100644 --- a/bwt.c +++ b/bwt.c @@ -39,6 +39,7 @@ # include "malloc_wrap.h" #endif +// 计算一个字节构成的A,T,C,G序列,对应的每个碱基的个数,因为最多有4个相同的碱基,所以每次左移3位就行 void bwt_gen_cnt_table(bwt_t *bwt) { int i, j; @@ -52,17 +53,23 @@ void bwt_gen_cnt_table(bwt_t *bwt) static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA { - bwtint_t x = k - (k > bwt->primary); - x = bwt_B0(bwt, x); - x = bwt->L2[x] + bwt_occ(bwt, k, x); + bwtint_t x = k - (k > bwt->primary); // bwt中不包含$,所以位置超过序列长度后,要减掉1,(因为后边加上了互补链) + // x = bwt_B0(bwt, x); // 获取x位置对应的字符 + + uint32_t t1 = (bwt->bwt[((x) >> 7 << 4) + sizeof(bwtint_t) + (((x) & 0x7f) >> 4)]); + uint32_t t2 = t1 >> ((~(x) & 0xf) << 1) & 3; + + x = t2; + + x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置(后缀索引), return k == bwt->primary? 0 : x; } // bwt->bwt and bwt->occ must be precalculated void bwt_cal_sa(bwt_t *bwt, int intv) { - bwtint_t isa, sa, i; // S(isa) = sa - int intv_round = intv; + bwtint_t isa, sa, i; // S(isa) = sa isa是后缀数组的索引,sa表示位置 + int intv_round = intv; // 间隔多少来保存一个位置信息 kv_roundup32(intv_round); xassert(intv_round == intv, "SA sample interval is not a power of 2."); @@ -71,13 +78,13 @@ void bwt_cal_sa(bwt_t *bwt, int intv) if (bwt->sa) free(bwt->sa); bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); // 用64位无符号整数来保存位置信息,其实用33位就够了 // calculate SA value isa = 0; sa = bwt->seq_len; for (i = 0; i < bwt->seq_len; ++i) { - if (isa % intv == 0) bwt->sa[isa/intv] = sa; - --sa; - isa = bwt_invPsi(bwt, isa); + if (isa % intv == 0) bwt->sa[isa/intv] = sa; // 第一个位置是$,所以位置就是序列长度 + --sa; // 从后往前,一个位置一个位置的找对应的后缀数组,isa就是与sa对应的后缀数组排序后在sa数组中的相对位置 + isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置 } if (isa % intv == 0) bwt->sa[isa/intv] = sa; bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len @@ -286,6 +293,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } // NOTE: $max_intv is not currently used in BWA-MEM +// 找smem(seed) int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret;