到了读取和解析known vcf部分了，性能还需优化

2025-12-20 16:35:45 +08:00 · 2025-12-20 16:35:45 +08:00 · 1e9b58fac1
parent 0fca937fab
commit 1e9b58fac1
17 changed files with 2397 additions and 161 deletions
--- a/src/bqsr/baq.cpp
+++ b/src/bqsr/baq.cpp
@ -0,0 +1,17 @@
+#include "baq.h"
+
+#include <spdlog/spdlog.h>
+
+vector<double> BAQ::qual2prob(256); // 质量分数转化概率
+vector<vector<vector<double>>> BAQ::EPSILONS(256, vector<vector<double>>(256, vector<double>(SAM_MAX_PHRED_SCORE + 1)));  // [ref][read][qual]
+
+// 计算baq数组，返回成功与否
+bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray) {
+    // 检测ref是否覆盖了read
+    if (ref.size() < refOffset + ad.read_len) {
+        spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})",
+                      ref.size(), refOffset + ad.read_len, refOffset,  ad.read_len);
+        return false;
+    }
+    return true;
+}
--- a/src/bqsr/baq.h
+++ b/src/bqsr/baq.h
@ -0,0 +1,88 @@
+// ####################################################################################################
+//
+// NOTE -- THIS CODE IS SYNCHRONIZED WITH CODE IN THE SAMTOOLS REPOSITORY.  CHANGES TO THIS CODE SHOULD BE
+// NOTE -- PUSHED BACK TO HENG LI
+//
+// ####################################################################################################
+// NOTE -- this code is a cpp version of code originally written by Heng Li in JAVA for the samtools project
+// ####################################################################################################
+#pragma once
+
+#include <cctype>
+#include <cmath>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "util/bam_wrap.h"
+
+using std::vector;
+using std::string;
+
+// base alignment quality (BAQ)
+
+struct BAQ {
+    static vector<double> qual2prob;  // 质量分数转化概率
+    static constexpr double EM = 0.33333333333;
+    static constexpr double EI = 0.25;
+    // Phred scaled now (changed 1/10/2011)
+    static constexpr double DEFAULT_GOP = 40;
+    static constexpr int DEFAULT_BANDWIDTH = 7;
+    static constexpr int SAM_MAX_PHRED_SCORE = 93;
+    // 94 = max qual score + 1
+    static vector<vector<vector<double>>> EPSILONS;  // [ref][read][qual]
+    
+    double cd = -1;   // gap open probability [1e-3]
+    double ce = 0.1;  // gap extension probability [0.1]
+    int cb = DEFAULT_BANDWIDTH;  // band width [7]
+
+    /**
+     * Any bases with Q < MIN_BASE_QUAL are raised up to this base quality
+     */
+    uint8_t minBaseQual = 4;
+
+    /**
+     * Use defaults for everything
+     */
+    BAQ() : BAQ(DEFAULT_GOP) {}
+
+    /**
+     * Use defaults for everything
+     */
+    BAQ(const double gapOpenPenalty) {
+        cd = convertFromPhredScale(gapOpenPenalty);
+        initializeCachedData();
+    }
+
+    /*  Takes a Phred Scale quality score and returns the error probability.
+     *
+     *  Quick conversion function to maintain internal structure of BAQ calculation on
+     *  probability scale, but take the user entered parameter in phred-scale.
+     *
+     *  @param x phred scaled score
+     *  @return probability of incorrect base call
+     */
+    static double convertFromPhredScale(double x) { return (std::pow(10, (-x) / 10.)); }
+
+    // 初始化一些静态全局数据
+    void initializeCachedData() {
+        for (int i = 0; i < 256; i++)
+            for (int j = 0; j < 256; j++)
+                for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) EPSILONS[i][j][q] = 1.0;
+
+        for (char b1 : "ACGTacgt") {
+            for (char b2 : "ACGTacgt") {
+                for (int q = 0; q <= SAM_MAX_PHRED_SCORE; q++) {
+                    double qual = qual2prob[q < minBaseQual ? minBaseQual : q];
+                    double e = std::tolower(b1) == std::tolower(b2) ? 1 - qual : qual * EM;
+                    EPSILONS[(uint8_t)b1][(uint8_t)b2][q] = e;
+                }
+            }
+        }
+    }
+
+    double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; }
+
+    // 计算baq数组，返回成功与否
+    bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector<int>& baqArray);
+};
--- a/src/bqsr/bqsr_args.h
+++ b/src/bqsr/bqsr_args.h
@ -48,6 +48,9 @@ struct BQSRArg {

    string PROGRAM_RECORD_ID = "FastBQSR";

+    // reference file
+    string REFERENCE_FILE;
+
    // known sites vcf files
    vector<string> KNOWN_SITES_VCFS;

@ -123,7 +126,7 @@ struct BQSRArg {
    int PRESERVE_QSCORES_LESS_THAN = 6;

    /**
-     * enable-baq, do BAQ correction"
+     * enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了，用不到了？
     */
    bool enableBAQ = false;

--- a/src/bqsr/bqsr_entry.cpp
+++ b/src/bqsr/bqsr_entry.cpp
@ -7,36 +7,93 @@ Copyright : All right reserved by ICT
 Author : Zhang Zhonghai
 Date : 2023/10/23
 */
+#include <htslib/faidx.h>
+#include <htslib/kstring.h>
 #include <htslib/sam.h>
 #include <htslib/synced_bcf_reader.h>
 #include <htslib/thread_pool.h>
+#include <header.h>
 #include <spdlog/spdlog.h>

 #include <iomanip>
+#include <numeric>
 #include <vector>
+#include <queue>

+#include "baq.h"
 #include "bqsr_args.h"
 #include "bqsr_funcs.h"
 #include "bqsr_pipeline.h"
+#include "covariate.h"
 #include "dup_metrics.h"
 #include "fastbqsr_version.h"
 #include "read_name_parser.h"
+#include "util/interval.h"
 #include "util/profiling.h"
 #include "util/utils.h"
+#include "util/linear_index.h"
+
+using std::deque;

 #define BAM_BLOCK_SIZE 16L * 1024 * 1024

+const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
+
+// 解析knownSites
+struct VCFParser {
+    deque<Interval> knownSites;  // 已知的变异位点
+    char* buf = nullptr;         // // 数据buffer
+    uint32_t bufLen = 4 * 1024;  // 数据buffer长度
+    LinearIndex index;           // vcf文件索引
+    ifstream inStm;              // vcf文件流
+
+    VCFParser() { Init(); }
+    VCFParser(const string& vcfFileName) { Init(vcfFileName); }
+    VCFParser(const string& vcfFileName, sam_hdr_t* samHeader) { Init(vcfFileName, samHeader); }
+
+    void Init() { buf = (char*)malloc(bufLen); }
+    void Init(const string& vcfFileName) {
+        Init();
+        inStm.open(vcfFileName, ifstream::in);
+        string idxFileName = vcfFileName + ".idx";
+        if (!index.ReadIndex(idxFileName))
+            error("[%s] fail to load the %s index file\n", __func__, idxFileName.c_str());
+    }
+    void Init(const string& vcfFileName, sam_hdr_t *samHeader) {
+        index.SetHeader(samHeader);
+        Init(vcfFileName);
+    }
+};
+
+// 解析后的一些参数，文件，数据等
+struct AuxVar {
+    const static int REF_CONTEXT_PAD = 3;                 // 需要做一些填充
+    const static int REFERENCE_HALF_WINDOW_LENGTH = 150;  // 需要额外多取出一些ref序列，防止边界效应
+
+    sam_hdr_t* header = nullptr;  // bam header
+    faidx_t* faidx = nullptr;     // reference index
+    char* ref_seq = nullptr;      // reference sequence
+    int ref_len = 0;              // reference sequence length
+    int offset = 0;               // 在要求的ref序列两边，多余取出的碱基数量
+
+    vector<VCFParser> vcfArr;     // 从vcf中获取已知位点
+};
+
 namespace nsgv {

+// 全局变量 for bqsr
+BQSRArg gBqsrArg;         // bqsr arguments
+samFile* gInBamFp;        // input BAM file pointer
+sam_hdr_t* gInBamHeader;  // input BAM header
+vector<AuxVar> gAuxVars;  // auxiliary variables，保存一些文件，数据等，每个线程对应一个

+
+// 下面是需要删除或修改的变量
 std::vector<ReadNameParser> gNameParsers;  // read name parser
 DuplicationMetrics gMetrics;               //
 DupResult gDupRes;
 PipelineArg gPipe(&gDupRes);

-BQSRArg gBqsrArg;                     //
-samFile *gInBamFp;                    // bam
-sam_hdr_t *gInBamHeader;              // bam
 samFile *gOutBamFp;                   // , sambam
 sam_hdr_t *gOutBamHeader;             // header
 vector <bcf_srs_t*> gKnownSitesVcfSrs; // known sites vcf srs
@ -49,6 +106,13 @@ struct ByteBuf {
    int capacity = 0;  // 
 };

+// 读进来的这一批bam总共占了几个染色体，这个方案不行，读取太多，没必要
+// 开区间
+struct Region {
+    int64_t start;
+    int64_t end;
+};
+
 /*
 * 
 */
@ -60,26 +124,495 @@ static string getFileExtension(const string &filename) {
    return filename.substr(last_dot + 1);
 }

+// 过滤掉bqsr过程不符合要求的bam数据
+bool bqsrReadFilterOut(const bam1_t *b) {
+    // 过滤掉unmapped的read
+    if (b->core.qual == 0) // mapping quality 0
+        return true;
+    if (b->core.qual == 255) // mapping quality not available
+        return true;
+    if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
+        return true;
+    }
+    if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
+        return true;
+    }
+    if (b->core.flag & BAM_FDUP) {  // secondary alignment
+        return true;
+    }
+    if (b->core.flag & BAM_FQCFAIL) {  // Not passing quality controls
+        return true;
+    }
+    return false;
+}
+
+// 该操作符是否消耗read的碱基
+bool consumeReadBases(char cigar) {
+    return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S';
+}
+
+// 该操作符是否消耗参考基因组的碱基
+bool consumeRefBases(char cigar) {
+    return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N';
+}
+
+// 给定一个ref位置，在read内部找到对应的位置和操作符
+struct PosAndOperator {
+    int readPosAtRefCoord = -1;  // read中的位置
+    char cigarOperator = '0';     // cigar操作符
+    int cigarIndex = -1;        // cigar索引
+    int cigarLen = 0;
+    int preCigarLen = 0;  // 截止cigar之前的，消耗read base的长度
+};
+
+/**
+ * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
+ * the element containing that base.  If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
+ * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
+ *
+ * @param alignmentStart        The soft start of the read on the reference
+ * @param cigar                 The read's cigar
+ * @param refCoord              The target reference coordinate
+ * @return                      If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
+ *                              if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read coordinate;
+ *                              if the reference coordinate falls within a deletion, the first read coordinate after the deletion.  Note: if the last
+ * cigar element is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
+ */
+PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStart, int refCoord) {
+    PosAndOperator po;
+    if (refCoord < alignmentStart) {
+        return po;
+    }
+    int firstReadPosOfElement = 0;              // inclusive
+    int firstRefPosOfElement = alignmentStart;  // inclusive
+    int lastReadPosOfElement = 0;               // exclusive
+    int lastRefPosOfElement = alignmentStart;   // exclusive
+
+    // advance forward through all the cigar elements until we bracket the reference coordinate
+    const uint32_t* cigar = bam_get_cigar(bw->b);
+    const bam1_core_t& bc = bw->b->core;
+    const int idx = bc.n_cigar - 1;
+    if (idx < 0)
+        return po;
+    for (int i = 0; i < bc.n_cigar; ++i) {
+        const char c = bam_cigar_opchr(cigar[i]);
+        const int len = bam_cigar_oplen(cigar[i]);
+        firstReadPosOfElement = lastReadPosOfElement;
+        firstRefPosOfElement = lastRefPosOfElement;
+        lastReadPosOfElement += consumeReadBases(c) ? len : 0;
+        lastRefPosOfElement += (consumeRefBases(c) || c == 'S') ? len : 0;
+        if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {  // refCoord falls within this cigar element
+            int readPosAtRefCoord = firstReadPosOfElement + (consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
+            return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
+        }
+    }
+    return po;
+}
+
+// 根据adapter位置，对read进行hardclip，返回左侧或右侧减掉的base数量
+void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) {
+    int start, stop;
+    // Determine the read coordinate to start and stop hard clipping
+    if (refStart < 0) {
+        if (refStop < 0) return;
+        PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
+        // if the refStop falls in a deletion, the above method returns the position after the deletion.  Since the stop we return here
+        // is inclusive, we decrement the stop to avoid overclipping by one base.  As a result we do not clip the deletion, which is fine.
+        stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
+        ad.left_clip = stop + 1;
+        ad.cigar_start = stopPosAndOperator.cigarIndex;
+        ad.first_cigar_clip = ad.left_clip - stopPosAndOperator.preCigarLen;
+    } else {
+        if (refStop >= 0) return;
+        // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids
+        // overclipping
+        PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
+        start = startPosAndOperator.readPosAtRefCoord;
+        ad.right_clip = bw->b->core.l_qseq - start;
+        ad.cigar_end = startPosAndOperator.cigarIndex + 1;
+        ad.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
+    }
+}
+
+// 计算切掉adapter之后，ref相对原始ref的偏移量
+void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) {
+    const uint32_t* cigar = bam_get_cigar(bw->b);
+    const bam1_core_t& bc = bw->b->core;
+    int i = 0;
+    for (i = 0; i < ad.cigar_start; ++i) {
+        const char c = bam_cigar_opchr(cigar[i]);
+        int len = bam_cigar_oplen(cigar[i]);
+        if (consumeRefBases(c)) {
+            ad.ref_offset += len;
+        }
+    }
+    const char c = bam_cigar_opchr(cigar[i]);
+    if (consumeRefBases(c)) {
+        ad.ref_offset += ad.first_cigar_clip;
+    }
+}
+
+// 计算clip处理之后，剩余的碱基
+void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) {
+    ad.bases.resize(ad.read_len);
+    uint8_t* seq = bam_get_seq(bw->b);
+    for (int i = 0; i < ad.read_len; ++i) {
+        ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
+    }
+}
+
+// 计算read两端softclip的碱基数量，可能会修改ad里的clip值
+void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) {
+    const uint32_t* cigar = bam_get_cigar(bw->b);
+    const bam1_core_t& bc = bw->b->core;
+    int readIndex = ad.left_clip;
+    int cutLeft = -1;           // first position to hard clip (inclusive)
+    int cutRight = -1;          // first position to hard clip (inclusive)
+    int cigar_start = ad.cigar_start;
+    int cigar_end = ad.cigar_end;
+    bool rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail
+
+    for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
+        const char c = bam_cigar_opchr(cigar[i]);
+        int len = bam_cigar_oplen(cigar[i]);
+        if (i == ad.cigar_start) len -= ad.first_cigar_clip;
+        if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
+        if (c == 'S') {
+            if (rightTail) {
+                cutRight = readIndex;
+                cigar_end = i;
+            } else {
+                cutLeft = readIndex + len - 1;
+                cigar_start = i + 1;
+            }
+        } else if (c != 'H') {
+            rightTail = true;
+        }
+        if (consumeReadBases(c)) {
+            readIndex += len;
+        }
+    }
+    if (cutRight >= 0) {
+        ad.right_clip = bw->b->core.l_qseq - cutRight;
+        ad.cigar_end = cigar_end;
+        ad.last_cigar_clip = 0;
+    }
+    if (cutLeft >= 0) {
+        ad.left_clip = cutLeft + 1;
+        ad.cigar_start = cigar_start;
+        ad.first_cigar_clip = 0;
+    }
+}
+
+// 读取给定区间的reference
+static inline void read_ref_base(AuxVar& aux, int64_t cur_pos, Interval& interval) {
+    if (aux.ref_seq != NULL)
+        free(aux.ref_seq);
+    int tid = BamWrap::bam_tid(cur_pos);
+    const char* chr = sam_hdr_tid2name(aux.header, tid);
+    int seq_begin = BamWrap::bam_pos(interval.left);  //- aux.REFERENCE_HALF_WINDOW_LENGTH;
+    int seq_end = BamWrap::bam_pos(interval.right);   //+ aux.REFERENCE_HALF_WINDOW_LENGTH;
+    aux.ref_seq = faidx_fetch_seq(aux.faidx, chr, seq_begin, seq_end, &aux.ref_len);
+    // aux.offset = aux.REFERENCE_HALF_WINDOW_LENGTH;
+}
+
+// 设置某个位置是indel
+inline void updateIndel(vector<int> &isIndel, int index) {
+    if (index >=0 && index < isIndel.size()) {
+        isIndel[index] = 1;
+    }
+}
+
+// 计算该read的每个碱基位置是否是SNP或Indel
+int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector<int> &isSNP, vector<int> &isIns, vector<int> &isDel) {
+    // 1. 读取参考基因组，先看看串行运行性能，稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤
+    Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间
+    read_ref_base(aux, interval.left, interval);
+    string refBases(aux.ref_seq);
+    // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos());
+
+    // 2. 遍历cigar，计算每个碱基是否是SNP或Indel
+    int readPos = 0, refPos = 0, nEvents = 0;
+    const uint32_t* cigar = bam_get_cigar(bw->b);
+    const bam1_core_t& bc = bw->b->core;
+    uint8_t* seq = bam_get_seq(bw->b);
+    for (int i = ad.cigar_start; i < ad.cigar_end; ++i) {
+        const char c = bam_cigar_opchr(cigar[i]);
+        int len = bam_cigar_oplen(cigar[i]);
+        if (i == ad.cigar_start) len -= ad.first_cigar_clip;
+        if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip;
+        if (c == 'M' || c == '=' || c == 'X') {
+            for (int j = 0; j < len; ++j) {
+                // 按位置将read和ref碱基进行比较，不同则是snp，注意read起始位置要加上left_clip
+                int snpInt = cBaseToChar[bam_seqi(seq, readPos + ad.left_clip)] == refBases[refPos] ? 0 : 1;
+                isSNP[readPos] = snpInt;
+                nEvents += snpInt;
+                readPos++;
+                refPos++;
+            }
+        } else if (c == 'D') {
+            // 应该是在上一个消耗碱基的cigar的最后一个位置，标记Del
+            int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1;
+            updateIndel(isDel, index);
+            refPos += len;
+        } else if (c == 'N') {
+            refPos += len;
+        } else if (c == 'I') {
+            // 与Del不同，Ins应该是在下一个cigar开始的位置，标记Ins
+            bool forwardStrandRead = !bw->GetReadNegativeStrandFlag();
+            if (forwardStrandRead) {
+                updateIndel(isIns, readPos - 1);
+            }
+            readPos += len;
+            if (!forwardStrandRead) {
+                updateIndel(isIns, readPos);
+            }
+        } else if (c == 'S') {
+            readPos += len;
+        }
+    }
+    nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0);
+
+    //spdlog::info("SNPs: {}, Ins: {}, Del: {}, total events: {}", std::accumulate(isSNP.begin(), isSNP.end(), 0),
+    //             std::accumulate(isIns.begin(), isIns.end(), 0), std::accumulate(isDel.begin(), isDel.end(), 0), nEvents);
+    // exit(0);
+
+    return nEvents;
+}
+
+// 简单计算baq数组，就是全部赋值为'@' (64)
+bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) { 
+    baqArray.resize(ad.read_len, (int)'@'); 
+    return true;
+}
+
+// 计算真实的baq数组，耗时更多，好像enable-baq参数默认是关闭的，那就先不实现这个了
+bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector<int>& baqArray) {
+    baqArray.resize(ad.read_len, 0);
+    return true;
+}
+
+// 获取一行字符串
+static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) {
+    line->clear();
+    if (*cur >= total)
+        return;
+    char b;
+    while (*cur < total && (b = buf[(*cur)++]) != '\n') {
+        line->push_back(b);
+    }
+}
+
+// 计算与read有交叉的已知位点信息， 应该要判断一下，是按照read的范围去读取vcf，还是按照一个batch read的范围去读取
+void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector<VCFParser> &vcfs) {
+    int tid = bw->contig_id();
+    uint64_t startPos = bw->start_pos();   // 闭区间
+    uint64_t endPos = bw->end_pos(); // 闭区间
+    // spdlog::info("bam {}, {}", startPos, endPos);
+    // update vcfs
+    for(auto &vcf : vcfs) {
+        // 清理旧的interval
+        while(!vcf.knownSites.empty()) {
+            auto& intv = vcf.knownSites.front();
+            // spdlog::info("intv bam {}, {}", intv.right, startPos);
+            if (intv.right < startPos)
+                vcf.knownSites.pop_front();
+            else
+                break;
+        }
+        if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) continue;
+
+        // spdlog::info("intv {}, {}, {}", vcf.knownSites.size(), vcf.knownSites.front().right, vcf.knownSites.front().right);
+        // exit(0);
+        //spdlog::info("before intervals : {}", vcf.knownSites.size());
+        // 读取新的interval
+        int64_t fpos, flen;
+        vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
+        //spdlog::info("file index: {}, {}", fpos, flen);
+        if (flen > 0) {
+            vcf.inStm.seekg(fpos, ios::beg);
+            if (flen > vcf.bufLen) {
+                vcf.bufLen = flen;
+                vcf.buf = (char*)realloc(vcf.buf, flen);
+            }
+            char* buf = vcf.buf;
+            vcf.inStm.read(buf, flen);
+            string line;
+            int64_t cur = 0;
+            get_line_from_buf(buf, flen, &cur, &line);
+            while (line.size() > 0) {
+                stringstream ss_line(line);
+                string stid;
+                int tid, pos;
+                int64_t locus;
+                string id, ref;
+                ss_line >> stid >> pos >> id >> ref;
+                tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str());
+                if (tid >= 0 && pos > 0) {
+                    vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
+                    //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
+                }
+                get_line_from_buf(buf, flen, &cur, &line);
+            }
+        }
+        //spdlog::info("after intervals : {}", vcf.knownSites.size());
+        //for(auto &val : vcf.knownSites) {
+        //    spdlog::info("intv {}, {}", val.left, val.right);
+        //}
+    }
+    //exit(0);
+}
+
 // 串行bqsr
 int SerialBQSR() {
    int round = 0;
    BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
-    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
+    // inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
+    inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
    int64_t readNumSum = 0;
+    // 0. 初始化一些全局数据
+    // BAQ baq{BAQ::DEFAULT_GOP};
+
+    // 1. 协变量数据相关初始化
+    PerReadCovariateMatrix readCovariates;
+    CovariateUtils::InitPerReadCovMat(readCovariates);
+    ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
+    CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
+
+    // 2. 读取bam的read group
+    if (nsgv::gInBamHeader->hrecs->nrg == 0) {
+        spdlog::error("No RG tag found in the header!");
+        return 1;
+    }
+    for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
+        spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
+        ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
+        ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
+    }
+
+    int test = 0;
+
    while (1) {
        ++ round;
+
+        // 一. 读取bam数据
        size_t readNum = 0;
        if (inBamBuf.ReadStat() >= 0)
            readNum = inBamBuf.ReadBam();
        if (readNum < 1) {
            break;
        }
-        spdlog::info("{} reads processed in {} round", readNum, round);
-
        auto bams = inBamBuf.GetBamArr();
-        spdlog::info("region: {} - {}", bams[0]->softclip_start(), bams.back()->softclip_end());
-        // 1. 获取bams数组覆盖的region范围
+        spdlog::info("{} reads processed in {} round, {}", readNum, round, test);

+        // 二. 遍历每个bam（read）记录，进行处理
+
+        for (int i = 0; i < bams.size(); ++i) {
+            // 1. 对每个read，需要检查cigar是否合法，即没有两个连续的相同的cigar，而且需要将首尾的deletion处理掉，目前看好像没啥影响，我们忽略这一步
+            // 2. 对质量分数长度跟碱基长度不匹配的read，缺少的质量分数用默认值补齐，先忽略，后边有需要再处理
+            // 3. 如果bam文件之前做过bqsr，tag中包含OQ（originnal quality，原始质量分数），检查用户参数里是否指定用原始质量分数进行bqsr，如果是则将质量分数替换为OQ，否则忽略OQ，先忽略
+            // 4. 对read的两端进行检测，去除（hardclip）adapter
+            BamWrap *bw = bams[i];
+            ReadAdditionData ad;
+            ad.read_len = BamWrap::BamEffectiveLength(bw->b);
+            ad.cigar_end = bw->b->core.n_cigar;
+            if (ad.read_len <= 0) continue;
+            
+            int adapter_boundary = bw->GetAdapterBoundary();
+            if (bw->IsAdapterInRead(adapter_boundary)) {
+                // adapter在read范围内
+                if (bw->GetReadNegativeStrandFlag()) { // 反链
+                    clipByReferenceCoordinates(bw, -1, adapter_boundary, ad);
+                } else { // 正链
+                    clipByReferenceCoordinates(bw, adapter_boundary, -1, ad);
+                }
+            }
+            ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度
+            // 5. 然后再去除softclip部分
+            calculateSoftClip(bw, ad);
+            ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip;  // 更新read长度
+            if (ad.read_len <= 0) continue;
+
+            calculateRefOffset(bw, ad); // 计算ref_offset，就是相对比对的position，要将ref右移多少
+            calculateReadBases(bw, ad);  // 计算clip处理之后，剩余的碱基
+
+            //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq,
+            //             ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str());
+
+            // 6. 更新每个read的platform信息，好像没啥用，暂时忽略
+            vector<int> isSNP(ad.read_len, 0);     // 该位置是否是SNP位置，0不是，1是
+            vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置，0不是，1是
+            vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置，0不是，1是
+            const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
+
+            // 7. 计算baqArray
+            // BAQ = base alignment quality
+            // note for efficiency reasons we don't compute the BAQ array unless we actually have
+            // some error to marginalize over.  For ILMN data ~85% of reads have no error
+            vector<int> baqArray;
+            bool baqCalculated = false;
+            if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) {
+                baqCalculated = flatBAQArray(bw, ad, baqArray);
+            } else {
+                // baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, bw, ad, baqArray);
+            }
+            if (!baqCalculated) continue;
+            // 到这里，基本的数据都准备好了，后续就是进行bqsr的统计了
+
+            // 8. 计算这条read对应的协变量
+            CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
+            test = readCovariates[1][0][0] + readCovariates[2][1][3];
+            int end_pos = bw->contig_end_pos();
+            //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
+            //             bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
+        
+            // 9. 计算这条read需要跳过的位置
+            vector<bool> skip(ad.read_len, 0);
+            calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr);
+        }
+
+#if 0
+        // spdlog::info("region: {} - {}", bams[0]->global_softclip_start(), bams.back()->global_softclip_end());
+        // 1. 获取bams数组覆盖的region范围
+        // 如果读取的bam数组跨越了不同的染色体，咋搞？还是按照每个线程都有独立的vcf文件来做吧
+        int64_t region_start = bams[0]->global_softclip_start();
+        vector<Region> contig_bams;
+        int contig_id = bams[0]->contig_id();
+        int64_t start = 0, stop = 0;
+        while (true) {
+            stop = start;
+            while (stop < bams.size() && bams[stop]->contig_id() == contig_id) ++stop;
+            if (stop > start) contig_bams.push_back(Region{start, stop});
+            if (stop >= bams.size()) break;
+            contig_id = bams[stop]->contig_id();
+            start = stop;
+        }
+
+        spdlog::info("{}, {} contig regions", contig_id, contig_bams.size());
+
+        for (int i = 0; i < bams.size();)  {
+            int64_t a1 = bams[i]->contig_pos();
+            int64_t b1 = bams[i]->contig_end_pos();
+            int64_t a = bams[i]->softclip_start();
+            int64_t b = bams[i]->softclip_end();
+            spdlog::info("{}: ({}, {}), ({}, {})", bams[i]->query_name(), a1, b1, a, b);
+            ++i;
+        }
+        // 依次处理每个contig的bams
+        vector<uint32_t> bitmap(100, 0);  // 用来表示known sites覆盖情况的bitmap
+        for (const auto& cr : contig_bams) {
+            spdlog::info("  contig id: {}, bam count: {}, bitmap size: {}", contig_id, cr.end - cr.start, bitmap.size());
+            // 当前处理的contig
+            int contig_id = bams[cr.start]->contig_id();
+            int64_t region_start = bams[cr.start]->softclip_start();
+            int64_t region_end = bams[cr.end - 1]->softclip_end();
+            if ((bitmap.size() << 5)) {
+
+            }
+        
+        }
+
+#endif
        // 2. 开辟一个uint32_t的数组作为bitmap（如果上一轮的不够就重开），用来表示region的每个位点是否有known sites覆盖（每轮使用前需清零）

        // 3. 读取在region范围内的所有known sites，并为对应的bitmap设定0 or 1 (作为skip标识)
@ -87,14 +620,18 @@ int SerialBQSR() {
        // 4. 遍历bams数组中的每一条记录并进行处理

        readNumSum += readNum;
-        inBamBuf.ClearAll();     // 
+        inBamBuf.ClearAll();     //
    }
    spdlog::info("read count: {}", readNumSum);

    return 0;
 }

-// entrance of mark duplicates
+// 需要支持vcf idx，tbi，csi三种索引方式
+// vcf和idx是一对
+// vcf.gz和tbi或csi是一对
+
+// entrance of mark BQSR
 int BaseRecalibrator() {
    
    PROF_START(whole_process);
@ -107,6 +644,18 @@ int BaseRecalibrator() {
    hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp);  // header
    
+    // 初始化AuxVar
+    nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
+    for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
+        nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
+        nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
+        if (nsgv::gAuxVars[i].faidx == 0)
+            error("[%s] fail to load the fasta index.\n", __func__);
+        for (auto &vcfFileName : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
+            nsgv::gAuxVars[i].vcfArr.push_back(VCFParser(vcfFileName, nsgv::gInBamHeader));
+        }
+    }
+
    // (libraryId)
    nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0);

@ -124,21 +673,21 @@ int BaseRecalibrator() {

    return SerialBQSR();

-//    // 读取known sites vcfs
-//    for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
-//        spdlog::info("  {}", ks);
-//        bcf_srs_t* srs = bcf_sr_init();
-//        if (!bcf_sr_add_reader(srs, ks.c_str()))
-//            error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
-//        nsgv::gKnownSitesVcfSrs.push_back(srs);
-//
-//        while (bcf_sr_next_line(srs)) {
-//            bcf1_t* line = srs->readers[0].buffer[0];
-//            cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
-//        }
-//    }
-//
-//    /* 先实现串行的bqsr-phase-1 */
+    // 读取known sites vcfs
+    for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
+        spdlog::info("  {}", ks);
+        bcf_srs_t* srs = bcf_sr_init();
+        if (!bcf_sr_add_reader(srs, ks.c_str()))
+            error("Failed to read from %s: %s\n", !strcmp("-", ks.c_str()) ? "standard input" : ks.c_str(), bcf_sr_strerror(srs->errnum));
+        nsgv::gKnownSitesVcfSrs.push_back(srs);
+
+        while (bcf_sr_next_line(srs)) {
+            bcf1_t* line = srs->readers[0].buffer[0];
+            cout << line->pos << '\t' << line->rlen << '\t' << line->n_allele << '\t' << line->n_info << endl;
+        }
+    }
+
+    /* 先实现串行的bqsr-phase-1 */
    


--- a/src/bqsr/covariate.cpp
+++ b/src/bqsr/covariate.cpp
@ -0,0 +1,317 @@
+#include "covariate.h"
+
+// for EventType
+EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
+EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
+EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
+
+// static变量 for ContextCovariate
+int ContextCovariate::mismatchesContextSize;
+int ContextCovariate::indelsContextSize;
+int ContextCovariate::mismatchesKeyMask;
+int ContextCovariate::indelsKeyMask;
+uint8_t ContextCovariate::lowQualTail;
+int ContextCovariate::baseIndexMap[256];
+
+// for ReadGroupCovariate
+map<string, int> ReadGroupCovariate::RgToId;  // read group name到id的映射
+map<int, string> ReadGroupCovariate::IdToRg;  // id到read group name的映射
+
+// for cycleCovariate
+int CycleCovariate::MAXIMUM_CYCLE_VALUE;
+
+// for CovariateUtils
+// 对一条read计算协变量（该协变量被上一个read用过）
+void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
+                                            bool recordIndelValues) {
+    //ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    //BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    //ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+    //CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
+}
+
+// ReadGroupCovariate 协变量的方法
+void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+    uint8_t *rgStr = bam_aux_get(bw->b, "RG");
+    char* rgVal = nullptr;
+    if (rgStr) rgVal = bam_aux2Z(rgStr);
+    int key = 0;
+    if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
+        spdlog::error("The RG tag value for read can not be found in header!");
+    } else {
+        key = RgToId[rgVal];
+    }
+    for (int i = 0; i < ad.read_len; ++i) {
+        CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
+    }
+}
+
+// BaseQualityCovariate 协变量的方法
+void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
+                                        bool recordIndelValues) {
+    // 在前面的处理过后，quals应该和base长度一致了
+#define __bq_set_cov(ins, del)                                                                                           \
+    do {                                                                                                                 \
+        for (int i = 0; i < ad.read_len; ++i) {                                                                          \
+            CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \
+        }                                                                                                                \
+    } while (0)
+
+    const int INDEL_QUAL = 45;
+    uint8_t* quals = bam_get_qual(bw->b);
+    if (recordIndelValues) {
+        uint8_t* insQualPtr = bam_aux_get(bw->b, "BI");  // base qualities for insertions
+        uint8_t* delQualPtr = bam_aux_get(bw->b, "BD");  // base qualities for deletions
+        if (insQualPtr == nullptr && delQualPtr == nullptr) {
+            __bq_set_cov(INDEL_QUAL, INDEL_QUAL);
+        } else if (insQualPtr == nullptr) {
+            uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr);
+            __bq_set_cov(INDEL_QUAL, delQuals[i]);
+        } else {
+            uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr);
+            __bq_set_cov(insQuals[i], INDEL_QUAL);
+        }
+    } else {
+        __bq_set_cov(0, 0);
+    }
+}
+
+// ContextCovariate 协变量的方法
+
+static char SimpleComplement(const char base) {
+    switch (base) {
+    case 'A':
+    case 'a':
+        return 'T';
+    case 'C':
+    case 'c':
+        return 'G';
+    case 'G':
+    case 'g':
+        return 'C';
+    case 'T':
+    case 't':
+        return 'A';
+    default:
+        return base;
+    }
+}
+
+// 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
+void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) {
+    uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
+
+    if (bw->GetReadNegativeStrandFlag()) {  // 反向互补
+        for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
+    }
+
+    // 处理左边
+    int left = 0;
+    for (; left < ad.read_len; ++left) {
+        if (quals[left] <= lowQTail)
+            clippedBases[left] = 'N';
+        else
+            break;
+    }
+    if (left == ad.read_len) {
+        clippedBases.clear();
+        return;
+    }
+    // 处理右边
+    int right = ad.read_len - 1;
+    for (; right >= 0; --right) {
+        if (quals[right] <= lowQTail)
+            clippedBases[right] = 'N';
+        else
+            break;
+    }
+    if (right < left)
+        clippedBases.clear();
+}
+
+/**
+ * Creates a int representation of a given dna string.
+ *
+ * @param dna    the dna sequence
+ * @param start  the start position in the byte array (inclusive)
+ * @param end    the end position in the array (exclusive)
+ * @return the key representing the dna sequence
+ */
+int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
+    int key = end - start;
+    int bitOffset = LENGTH_BITS;
+    for (int i = start; i < end; i++) {
+        const int baseIndex = baseIndexMap[dna[i] & 0xff];
+        if (baseIndex == -1) {  // ignore non-ACGT bases
+            return -1;
+        }
+        key |= (baseIndex << bitOffset);
+        bitOffset += 2;
+    }
+    return key;
+}
+
+/**
+ * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
+ *
+ * For example, for the read [AGCTG], return the list
+ *   [-1, "AG", "GC", "CT", "TG" ]
+ * with each string context encoded as an integer.
+ *
+ * @param bases       the bases in the read to build the context from
+ * @param contextSize context size to use building the context
+ * @param mask        mask for pulling out just the context bits
+ *
+ * @return a list that has the same length as the read and contains the (preceding) n-base context at each position.
+ *
+ */
+void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
+    int readLength = bases.size();
+    keys.resize(readLength);
+    int keyIdx = 0;
+    // the first contextSize-1 bases will not have enough previous context
+    for (int i = 1; i < contextSize && i <= readLength; i++) {
+        keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
+    }
+    if (readLength < contextSize)
+        return;
+
+    int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;
+
+    // get (and add) the key for the context starting at the first base
+    int currentKey = KeyFromContext(bases, 0, contextSize);
+    keys[keyIdx++] = currentKey;
+
+    // if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts it affects
+    int currentNPenalty = 0;
+    if (currentKey == -1) {
+        currentKey = 0;
+        currentNPenalty = contextSize - 1;
+        int offset = newBaseOffset;
+        int baseIndex;
+        while ((baseIndex = baseIndexMap[bases[currentNPenalty]]) != -1) {
+            currentKey |= (baseIndex << offset);
+            offset -= 2;
+            currentNPenalty--;
+        }
+    }
+
+    for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
+        const int baseIndex = baseIndexMap[bases[currentIndex]];
+        if (baseIndex == -1) {  // ignore non-ACGT bases
+            currentNPenalty = contextSize;
+            currentKey = 0;  // reset the key
+        } else {
+            // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length
+            // in
+            currentKey = (currentKey >> 2) & mask;
+            currentKey |= (baseIndex << newBaseOffset);
+            currentKey |= contextSize;
+        }
+
+        if (currentNPenalty == 0) {
+            keys[keyIdx++] = currentKey;
+        } else {
+            currentNPenalty--;
+            keys[keyIdx++] = -1;
+        }
+    }
+}
+
+void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+    const int originalReadLength = ad.read_len;
+
+    // store the original bases and then write Ns over low quality ones
+    string strandedClippedBases(ad.bases);
+    GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail);
+    // spdlog::info("bases: {}", strandedClippedBases);
+    vector<int> nBasePairContextAtEachCycle;
+    GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);
+
+    const int readLengthAfterClipping = strandedClippedBases.size();
+
+    // this is necessary to ensure that we don't keep historical data in the ReadCovariates values
+    // since the context covariate may not span the entire set of values in read covariates
+    // due to the clipping of the low quality bases
+    if (readLengthAfterClipping != originalReadLength) {
+        // don't bother zeroing out if we are going to overwrite the whole array
+        for (int i = 0; i < originalReadLength; i++) {
+            // this base has been clipped off, so zero out the covariate values here
+            CovariateUtils::SetCovariate(0, 0, 0, i, ContextCovariate::index, values);
+        }
+    }
+
+    const bool negativeStrand = bw->GetReadNegativeStrandFlag();
+    // Note: duplicated the loop to avoid checking recordIndelValues on each iteration
+    if (recordIndelValues) {
+        vector<int> indelKeys;
+        GetReadContextAtEachPosition(strandedClippedBases, indelsContextSize, indelsKeyMask, indelKeys);
+        for (int i = 0; i < readLengthAfterClipping; i++) {
+            const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping);
+            const int indelKey = indelKeys[i];
+            CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], indelKey, indelKey, readOffset, ContextCovariate::index, values);
+        }
+    } else {
+        for (int i = 0; i < readLengthAfterClipping; i++) {
+            const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping);
+            CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], 0, 0, readOffset, ContextCovariate::index, values);
+        }
+    }
+}
+
+// CycleCovariate 协变量的方法
+
+/**
+ * Computes the encoded value of CycleCovariate's key for the given position at the read.
+ * Uses keyFromCycle to do the encoding.
+ * @param baseNumber index of the base to compute the key for
+ * @param read the read
+ * @param indel is this an indel key or a substitution key?
+ * @param maxCycle max value of the base to compute the key for
+ *                 (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
+ */
+int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) {
+    const bool isNegStrand = bw->GetReadNegativeStrandFlag();
+    const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
+    const int readLength = ad.read_len;
+
+    const int readOrderFactor = isSecondInPair ? -1 : 1;
+    int increment;
+    int cycle;
+    if (isNegStrand) {
+        cycle = readLength * readOrderFactor;
+        increment = -1 * readOrderFactor;
+    } else {
+        cycle = readOrderFactor;
+        increment = readOrderFactor;
+    }
+    cycle += baseNumber * increment;
+
+    if (!indel) {
+        return CycleCovariate::KeyFromCycle(cycle, maxCycle);
+    }
+    const int maxCycleForIndels = readLength - CUSHION_FOR_INDELS - 1;
+    if (baseNumber < CUSHION_FOR_INDELS || baseNumber > maxCycleForIndels) {
+        return -1;
+    } else {
+        return CycleCovariate::KeyFromCycle(cycle, maxCycle);
+    }
+}
+
+// Used to pick out the covariate's value from attributes of the read
+void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
+    const int readLength = ad.read_len;
+    // Note: duplicate the loop to void checking recordIndelValues on every iteration
+    if (recordIndelValues) {
+        for (int i = 0; i < readLength; i++) {
+            const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
+            const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE);
+            CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
+        }
+    } else {
+        for (int i = 0; i < readLength; i++) {
+            const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
+            CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
+        }
+    }
+}
--- a/src/bqsr/covariate.h
+++ b/src/bqsr/covariate.h
@ -0,0 +1,209 @@
+/*
+     Description: 在bqsr过程中，计算协变量相关的类和方法
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2025/12/08
+*/
+
+#pragma once
+
+#include <spdlog/spdlog.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "bqsr_args.h"
+#include "util/bam_wrap.h"
+
+using std::map;
+using std::string;
+using std::vector;
+
+/**
+ * This is where we store the pre-read covariates, also indexed by (event type) and (read position).
+ * Thus the array has shape { event type } x { read position (aka cycle) } x { covariate }.
+ * For instance, { covariate } is by default 4-dimensional (read group, base quality, context, cycle).
+ */
+typedef vector<vector<vector<int>>> PerReadCovariateMatrix;
+
+// 变异类型（snp, insert, deletion）
+struct EventTypeValue {
+    int index; // 在协变量数组中对应的索引
+    char representation;
+    string longRepresentation;
+};
+
+struct EventType {
+    static constexpr int EVENT_SIZE = 3;
+    static EventTypeValue BASE_SUBSTITUTION;
+    static EventTypeValue BASE_INSERTION;
+    static EventTypeValue BASE_DELETION;
+};
+
+// 协变量相关的工具类
+struct CovariateUtils {
+    static constexpr int MAX_READ_LENGTH = 300;  // 最大read长度
+    static constexpr int NUM_COVARIATES = 4;
+
+    // 初始化PerReadCovariateMatrix
+    static void InitPerReadCovMat(PerReadCovariateMatrix& matrix) {
+        matrix.resize(EventType::EVENT_SIZE);
+        for (int event_type = 0; event_type < EventType::EVENT_SIZE; ++event_type) {
+            matrix[event_type].resize(MAX_READ_LENGTH);
+            for (int pos = 0; pos < MAX_READ_LENGTH; ++pos) {
+                matrix[event_type][pos].resize(NUM_COVARIATES, 0);
+            }
+        }
+    }
+
+    // 设置协变量
+    static void SetCovariate(int mismatch, int insertion, int deletion, int readOffset, int covIndex, PerReadCovariateMatrix& matrix) {
+        matrix[EventType::BASE_SUBSTITUTION.index][readOffset][covIndex] = mismatch;
+        matrix[EventType::BASE_INSERTION.index][readOffset][covIndex] = insertion;
+        matrix[EventType::BASE_DELETION.index][readOffset][covIndex] = deletion;
+    }
+
+    // 对一条read计算协变量（该协变量被上一个read用过）
+    static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+};
+
+// Read group协变量
+struct ReadGroupCovariate {
+    static constexpr int index = 0;      // 在协变量数组中的索引位置
+    static map<string, int> RgToId;  // read group name到id的映射
+    static map<int, string> IdToRg;  // id到read group name的映射
+
+    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+};
+
+// Base quality协变量
+struct BaseQualityCovariate {
+    static constexpr int index = 1;  // 在协变量数组中的索引位置
+    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+};
+
+// Context协变量
+struct ContextCovariate {
+    static constexpr int index = 2;  // 在协变量数组中的索引位置
+    static constexpr int UNKNOWN_OR_ERROR_CONTEXT_CODE = -1;
+    static constexpr int LENGTH_BITS = 4;
+    static constexpr int LENGTH_MASK = 15;
+
+    // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are
+    // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base.
+    static constexpr int MAX_DNA_CONTEXT = 13;
+
+    static int mismatchesContextSize;
+    static int indelsContextSize;
+    static int mismatchesKeyMask;
+    static int indelsKeyMask;
+    static uint8_t lowQualTail;
+
+    static int baseIndexMap[256];
+
+    static void InitContextCovariate(BQSRArg& p) {
+        mismatchesContextSize = p.MISMATCHES_CONTEXT_SIZE;
+        indelsContextSize = p.INDELS_CONTEXT_SIZE;
+        if (mismatchesContextSize > MAX_DNA_CONTEXT) {
+            spdlog::error("mismatches_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, mismatchesContextSize);
+            exit(1);
+        }
+        if (indelsContextSize > MAX_DNA_CONTEXT) {
+            spdlog::error("indels_context_size: context size cannot be bigger than {}, but was {}", MAX_DNA_CONTEXT, indelsContextSize);
+            exit(1);
+        }
+        lowQualTail = p.LOW_QUAL_TAIL;
+        if (mismatchesContextSize <= 0 || indelsContextSize <= 0) {
+            spdlog::error("Context size must be positive. Mismatches: {} Indels: {}", mismatchesContextSize, indelsContextSize);
+            exit(1);
+        }
+        mismatchesKeyMask = CreateMask(mismatchesContextSize);
+        indelsKeyMask = CreateMask(indelsContextSize);
+
+        // init baseIndexMap
+        for (int i = 0; i < 256; ++i) {
+            baseIndexMap[i] = -1;
+        }
+        baseIndexMap['A'] = 0;
+        baseIndexMap['a'] = 0;
+        baseIndexMap['*'] = 0;
+        baseIndexMap['C'] = 1;
+        baseIndexMap['c'] = 1;
+        baseIndexMap['G'] = 2;
+        baseIndexMap['g'] = 2;
+        baseIndexMap['T'] = 3;
+        baseIndexMap['t'] = 3;
+    }
+
+    static int CreateMask(int contextSize) {
+        int mask = 0;
+        // create 2*contextSize worth of bits
+        for (int i = 0; i < contextSize; i++) {
+            mask = (mask << 2) | 3;
+        }
+        // shift 4 bits to mask out the bits used to encode the length
+        return mask << LENGTH_BITS;
+    }
+
+    /**
+     * Helper method: computes the correct offset to use in computations of covariate values.
+     * @param isNegativeStrand is the read on the negative strand
+     * @param offset 0-based index of the base in the read
+     * @param readLength length of the read
+     * @return
+     */
+    static int GetStrandedOffset(const bool isNegativeStrand, const int offset, const int readLength) {
+        return isNegativeStrand ? (readLength - offset - 1) : offset;
+    }
+
+    // 获取去除低质量分数碱基之后的read碱基序列（将低质量分数的碱基变成N）
+    static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail);
+    // Creates a int representation of a given dna string.
+    static int KeyFromContext(const string& dna, const int start, const int end);
+    // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
+    static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys);
+
+    // 设置协变量的值
+    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+};
+
+// Cycle协变量
+struct CycleCovariate {
+    static constexpr int index = 3;  // 在协变量数组中的索引位置
+    static int MAXIMUM_CYCLE_VALUE;
+    static constexpr int CUSHION_FOR_INDELS = 4;
+
+    static void InitCycleCovariate(BQSRArg& p) { MAXIMUM_CYCLE_VALUE = p.MAXIMUM_CYCLE_VALUE; }
+
+    /**
+     * Encodes the cycle number as a key.
+     */
+    static int KeyFromCycle(const int cycle, const int maxCycle) {
+        // no negative values because values must fit into the first few bits of the long
+        int result = std::abs(cycle);
+        if (result > maxCycle) {
+            spdlog::error(
+                "The maximum allowed value for the cycle is {}, but a larger cycle ({}) was detected.  Please use the --maximum-cycle-value argument "
+                "(when creating the recalibration table in "
+                "BaseRecalibrator) to increase this value (at the expense of requiring more memory to run)",
+                maxCycle, result);
+            exit(1);
+        }
+
+        result <<= 1;  // shift so we can add the "sign" bit
+        if (cycle < 0) {
+            result++;  // negative cycles get the lower-most bit set
+        }
+        return result;
+}
+
+    // Computes the encoded value of CycleCovariate's key for the given position at the read.
+    static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle);
+
+    static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues);
+};
--- a/src/main.cpp
+++ b/src/main.cpp
@ -78,6 +78,12 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
        .nargs(1)
        .metavar("<IndexFormat>");

+    program.add_argument("--enable-baq")
+        .help("Whether to do BAQ correction.")
+        .default_value(false)
+        .implicit_value(true)
+        .hidden();
+
    // add help and version args
    program.add_argument("-h", "--help")
        .action([&](const auto & /*unused*/) {
@ -113,8 +119,9 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
        nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
        nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
        nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
-
+        nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
        nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
+        nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
        // spdlog::info("known sites vcf files:");
        // for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
        //     spdlog::info("  {}", ks);
--- a/src/util/bam_buf.cpp
+++ b/src/util/bam_buf.cpp
@ -1,5 +1,5 @@
 /*
-     Description: sam/bam，buf，
+     Description: 读入sam/bam时，开辟一个大的buf，存放这些数据

     Copyright : All right reserved by ICT

@ -10,25 +10,28 @@
 #include "bam_buf.h"

 /*
- * BamBuf
+ * BamBuf类
 */
-// ，
+// 读取数据直到读完，或者缓冲区满
 int BamBuf::ReadBam() {
    int read_num = 0;
-    if (handle_last) {             // bam
-        if (has_enough_space()) {  // ，memffset
+    if (handle_last) {             // 处理上次读入的最后一个bam
+        if (has_enough_space()) {  // 必须调用，在边界处调整memffset
            ++read_num;
            append_one_bam();
        } else {
-            return read_num;  // 
+            return read_num;  // 还是没空间
        }
    }
    while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
        bw->end_pos_ = BamWrap::BamEndPos(bw->b);
-        if (has_enough_space()) {  // 
-                                   // if (true) {  // 
-            append_one_bam();
-            ++read_num;  // 
+        if (has_enough_space()) {  // 还有空间
+                                   // if (true) {  // 还有空间
+            // 加过滤器
+            if (filter_out == nullptr || !filter_out(bw->b)) {
+                append_one_bam();
+                ++read_num;  // 放进缓存才算读取到
+            }
        } else {
            break;
        }
@ -41,7 +44,7 @@ int BamBuf::ReadBam() {
    return read_num;
 }

-// 
+// 初始化缓存
 void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
    this->fp = fp;
    this->hdr = hdr;
@ -71,9 +74,9 @@ void BamBuf::ClearAll() {
    prepare_read();
 }

-// , 
+// 为下一次读取做准备, 计算一些边界条件
 inline void BamBuf::prepare_read() {
-    // bam
+    // 计算余留的下次计算可能用到的bam所占的位置
    if (bv.size() > 0) {
        BamWrap *bw = bv[0];
        legacy_start = (int64_t)bw - (int64_t)mem;
@ -81,11 +84,11 @@ inline void BamBuf::prepare_read() {
        legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
    } else {
        legacy_start = legacy_end = 0;
-        mem_offset = 0;  // ，
+        mem_offset = 0;  // 上次没剩下，那就从头存储
    }
 }

-// 
+// 检查缓存是否还有空间
 inline bool BamBuf::has_enough_space() {
    const uint32_t bam_len = bw->length();
    int64_t potential_end = mem_offset + bam_len;
@ -104,7 +107,7 @@ inline bool BamBuf::has_enough_space() {
    return potential_end < legacy_start;
 }

-// bam
+// 处理一个读取后的bam
 inline void BamBuf::append_one_bam() {
    BamWrap *bwp = (BamWrap *)(mem + mem_offset);
    *bwp = *bw;
@ -113,15 +116,15 @@ inline void BamBuf::append_one_bam() {
    *bp = *bw->b;
    bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t));
    memcpy(bp->data, bw->b->data, bw->b->l_data);
-    // 
+    // 更新下次存储的位置
    mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1));
    bv.push_back(bwp);
 }

-// read
+// 处理上次读入的最后一个read
 inline bool BamBuf::handle_last_read() {
-    if (handle_last) {             // bam
-        if (has_enough_space()) {  // ，memffset
+    if (handle_last) {             // 处理上次读入的最后一个bam
+        if (has_enough_space()) {  // 必须调用，在边界处调整memffset
            append_one_bam();
            handle_last = false;
            return true;
@ -131,9 +134,9 @@ inline bool BamBuf::handle_last_read() {
 }

 /*
- * AsyncIoBamBuf 
+ * AsyncIoBamBuf 类
 */
-// 
+// 初始化缓存
 void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
    if (use_async_io_) {
        buf1_.Init(fp, hdr, mem_size >> 1);
@ -147,7 +150,7 @@ void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
    }
 }

-// 
+// 读取数据
 int AsyncIoBamBuf::ReadBam() {
    if (use_async_io_) {
        hasThread = true;
@ -178,11 +181,11 @@ int AsyncIoBamBuf::async_read_bam() {
        first_read_ = false;
        refresh_bam_arr();
    } else {
-        // join, 
+        // join, 交换缓冲区指针
        pthread_join(*tid_, 0);
        resize_buf();

-        if (need_read_) {  // 
+        if (need_read_) {  // 需要交换指针
            BamBuf *tmp = pi_;
            pi_ = po_;
            po_ = tmp;
@ -190,14 +193,14 @@ int AsyncIoBamBuf::async_read_bam() {
        read_num = last_read_num_;
        refresh_bam_arr();
    }
-    // 
+    // 异步读
    pthread_create(tid_, 0, async_read, this);
    return read_num;
 }

 void *AsyncIoBamBuf::async_read(void *data) {
    AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
-    if (ab->need_read_ && ab->ReadStat() >= 0) {  // 
+    if (ab->need_read_ && ab->ReadStat() >= 0) {  // 需要读取
        ab->last_read_num_ = ab->po_->ReadBam();
    } else {
        ab->last_read_num_ = 0;
@ -205,23 +208,23 @@ void *AsyncIoBamBuf::async_read(void *data) {
    pthread_exit(0);
 }

-// ,
-// ，，po_buf
+// 为下一次读取做准备,
+// 计算一些边界条件，延迟操作，因为此时可能po_对应的buf正在读取
 void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; }

-// ，，po_buf
+// 清空上一次所有读入的数据，延迟操作，因为此时可能po_对应的buf正在读取
 void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }

 inline void AsyncIoBamBuf::resize_buf() {
-    if (clear_all_) {  // 
+    if (clear_all_) {  // 清理上一轮的数据
        clear_all_ = false;
        po_->ClearBeforeIdx(legacy_size_);
        pi_->ClearAll();
-        if (pi_->handle_last_read()) {  // read
+        if (pi_->handle_last_read()) {  // 上次读取有一个read没放入缓存
            last_read_num_ += 1;
-            legacy_size_ = pi_->Size();  // read
+            legacy_size_ = pi_->Size();  // 应该只有一个read
            need_read_ = true;
-        } else {  // ，，
+        } else {  // 没空间存放，则不交换指针，或者文件已经读取完毕
            legacy_size_ = 0;
            need_read_ = false;
        }
@ -229,16 +232,16 @@ inline void AsyncIoBamBuf::resize_buf() {
        if (clear_before_idx_ < legacy_size_) {
            po_->ClearBeforeIdx(clear_before_idx_);
            legacy_size_ -= clear_before_idx_;
-            // ，
+            // 不需要交换指针，不需要读取
            need_read_ = false;
        } else {
            po_->ClearBeforeIdx(legacy_size_);
            pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
-            if (pi_->handle_last_read()) {  // read
+            if (pi_->handle_last_read()) {  // 上次读取有一个read没放入缓存
                last_read_num_ += 1;
-                legacy_size_ = pi_->Size();  // read
+                legacy_size_ = pi_->Size();  // 应该只有一个read
                need_read_ = true;
-            } else {  // ，，
+            } else {  // 没空间存放，则不交换指针，或者文件已经读取完毕
                legacy_size_ = 0;
                need_read_ = false;
            }
--- a/src/util/bam_buf.h
+++ b/src/util/bam_buf.h
@ -1,5 +1,5 @@
 /*
-     Description: sam/bam，buf，
+     Description: 读入sam/bam时，开辟一个大的buf，存放这些数据

     Copyright : All right reserved by ICT

@ -26,33 +26,40 @@
 using std::vector;
 using namespace std;

+using ReadFilterOut = bool (*)(const bam1_t *b);
+
 /*
- * bam
+ * 存放读入的bam数据
 */
 struct BamBuf {
-    sam_hdr_t *hdr;            // samheader
-    samFile *fp;               // sam
-    BamWrap *bw = nullptr;     // bam
-    uint8_t *mem = nullptr;    // bam,
-                               // ，
-    int64_t mem_offset = 0;    // 
-    int64_t mem_size;          // 
-    int read_stat_ = 0;        // ，
-    vector<BamWrap *> bv;      // bam
-    int64_t legacy_start = 0;  // bammem, 
-    int64_t legacy_end = 0;    // bammem, 
-    bool handle_last = false;  // bam
+    sam_hdr_t *hdr;            // sam文件的header信息
+    samFile *fp;               // sam文件指针
+    BamWrap *bw = nullptr;     // 用来循环读入bam
+    uint8_t *mem = nullptr;    // 用来存放bam的数据,
+                               // 程序结束后自动释放，所以没在析构函数里释放
+    int64_t mem_offset = 0;    // 下一次要存放的位置
+    int64_t mem_size;          // 缓存大小
+    int read_stat_ = 0;        // 读取状态，是否读完
+    vector<BamWrap *> bv;      // 方便对bam数据的访问
+    int64_t legacy_start = 0;  // 没处理完的bam在mem中的起始位置, 闭区间
+    int64_t legacy_end = 0;    // 没处理完的bam在mem中的结束位置, 开区间
+    bool handle_last = false;  // 上次最后读入的bam是否需要处理
+    ReadFilterOut filter_out = nullptr;  // 读入过滤函数指针

-    // 
+    // 初始化缓存
    void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
-    // ，
+    void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) {
+        this->filter_out = filter;
+        Init(fp, hdr, mem_size);
+    }
+    // 读取数据直到读完，或者缓冲区满
    int ReadBam();
-    // , 
+    // 为下一次读取做准备, 计算一些边界条件
    void ClearBeforeIdx(size_t idxInBv);
-    // 
+    // 清空上一次所有读入的数据
    void ClearAll();
-    inline int64_t Size() { return bv.size(); }  // read
-    inline int ReadStat() { return read_stat_; }  // ，（）
+    inline int64_t Size() { return bv.size(); }  // 包含多少个read
+    inline int ReadStat() { return read_stat_; }  // 文件的读取状态，是否可读（读取完全）
    ~BamBuf() {
        if (this->mem != nullptr) {
            free(this->mem);
@ -62,7 +69,7 @@ struct BamBuf {
            free(bw);
        }
    }
-    void FreeMemory()  // 
+    void FreeMemory()  // 释放开辟的内存
    {
        if (this->mem != nullptr) {
            free(this->mem);
@ -75,14 +82,14 @@ struct BamBuf {
        this->bw = nullptr;
    }
    void prepare_read();
-    // 
+    // 检查缓存是否还有空间
    bool has_enough_space();
-    // bam
+    // 处理一个读取后的bam
    void append_one_bam();
-    // read
+    // 处理上次读入的最后一个read
    bool handle_last_read();

-    // bv
+    // 针对bv的操作
    inline BamWrap *operator[](int64_t pos) { return bv[pos]; }
    inline void push_back(BamWrap *val) { bv.push_back(val); }
    inline void clear() { bv.clear(); }
@ -90,53 +97,57 @@ struct BamBuf {
 };

 /*
- * io
+ * io异步缓冲区
 */
 struct AsyncIoBamBuf {
    BamBuf buf1_;
    BamBuf buf2_;
-    BamBuf *pi_;  // buf
-    BamBuf *po_;  // buf
+    BamBuf *pi_;  // 当前用的buf
+    BamBuf *po_;  // 后台在读取的buf
    pthread_t *tid_ = NULL;
    bool hasThread = false;
-    int64_t legacy_size_ = 0;  // ，read
+    int64_t legacy_size_ = 0;  // 上一轮运算之后，缓存中还剩余的上次读取的read数量
    bool first_read_ = true;
-    int last_read_num_ = 0;  // reads
+    int last_read_num_ = 0;  // 上一次读取了多少reads
    bool need_read_ = true;
    bool use_async_io_ = true;
-    int64_t clear_before_idx_ = 0;  // ，clear_before_idx_reads
-    bool clear_all_ = false;        // ，reads
+    int64_t clear_before_idx_ = 0;  // 用户异步读取，下一轮读取之前清理掉clear_before_idx_之前的所有reads
+    bool clear_all_ = false;        // 用于异步读取，下一轮读取之前清理掉之前的所有reads

-    vector<BamWrap *> bam_arr_;  // bufbam
+    vector<BamWrap *> bam_arr_;  // 用来访问buf中的bam

    AsyncIoBamBuf() {}
    AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
-    // 
+    // 析构
    ~AsyncIoBamBuf() {
        if (tid_ != NULL) {
            if (hasThread)
                pthread_join(*tid_, 0);
            free(tid_);
        }
-        // 
-        // buf
+        // 其他的内存就等程序结束自动释放
+        // buf的析构函数会自动调用
    }

-    // 
+    // 初始化缓存
    void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
-
-    // 
+    void Init(samFile* fp, sam_hdr_t* hdr, int64_t mem_size, ReadFilterOut filter) {
+        Init(fp, hdr, mem_size);
+        buf1_.filter_out = filter;
+        buf2_.filter_out = filter;
+    }
+    // 读取数据
    int ReadBam();
-    // , 
+    // 为下一次读取做准备, 计算一些边界条件
    void ClearBeforeIdx(size_t idxInBv);
-    vector<BamWrap *> &GetBamArr() { return bam_arr_; }  // bam array
-    // 
+    vector<BamWrap *> &GetBamArr() { return bam_arr_; }  // 获取bam array
+    // 清空上一次所有读入的数据
    void ClearAll();
-    // read
+    // 包含的read数量
    inline int64_t Size() { return legacy_size_ + pi_->Size(); }
    inline int ReadStat() { return pi_->read_stat_; }
    inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
-    // reads
+    // 获取某一段reads
    inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
        if (endIdx > startIdx) {
            auto begItr = bam_arr_.begin();
@ -149,11 +160,11 @@ struct AsyncIoBamBuf {
        buf2_.FreeMemory();
    }

-    // 
+    // 同步读取
    int sync_read_bam();
-    // 
+    // 异步读取
    int async_read_bam();
-    // 
+    // 异步读取线程函数
    static void *async_read(void *data);
    void resize_buf();
    inline void refresh_bam_arr() {
--- a/src/util/bam_wrap.h
+++ b/src/util/bam_wrap.h
@ -1,5 +1,5 @@
 /*
-     Description: sam/bam，buf，
+     Description: 读入sam/bam时，开辟一个大的buf，存放这些数据

     Copyright : All right reserved by ICT

@ -19,38 +19,55 @@

 using namespace std;

+// 对原始bam数据的补充，比如对两端进行hardclip等
+struct ReadAdditionData {
+    int read_len = 0;          // read长度，各种clip之后的长度
+    int cigar_start = 0;       // cigar起始位置，闭区间
+    int cigar_end = 0;         // cigar结束位置，开区间
+    int first_cigar_clip = 0;  // 第一个cigar, clip的数量，切左侧
+    int last_cigar_clip = 0;   // 最后一个cigar, clip的数量，切右侧
+    int left_clip = 0;         // 左侧被切掉的碱基长度
+    int right_clip = 0;        // 右侧被切掉的碱基长度
+    int ref_offset = 0;        // 切除adapter和softclip之后(softclip应该不影响)，相对原始ref比对位置（contig_pos）的偏移量
+    string bases;              // 处理之后的read的碱基
+};
+
 /*
-    ，，，，，
-    bam，，。
+    这里的成员函数命名有点混乱，特此说明，小写加下划线的函数命名，无论是静态函数，还是普通成员函数，更侧重说明
+    这是类似bam的一个属性，而大写加驼峰命名的函数，更侧重说明这是通过计算得出的。
 */
+
 /*
- * sam read
+ * sam read的封装
 */
 struct BamWrap {
-    // contigpos
-    const static int MAX_CONTIG_LEN_SHIFT = 40;  // id，
+    // 将contig左移后加上pos作为全局位置
+    const static int MAX_CONTIG_LEN_SHIFT = 40;  // 将染色体id左移多少位，和位点拼合在一起
    const static int READ_MAX_LENGTH = 200;
-    const static int READ_MAX_DEPTH = 1000;  // ，
+    const static int READ_MAX_DEPTH = 1000;  // 这只是用来初始化空间用的，深度大于这个值也没关系

-    // ，
+    // 成员变量尽量少，减少占用内存空间
    bam1_t *b;
-    int64_t end_pos_;  // bam, 
+    int64_t end_pos_;  // bam的全局结束位置, 相对ref, 闭区间

-    // 
+    // 全局开始位置
    inline int64_t start_pos() { return bam_global_pos(b); }
-    // 
+    // 全局结束位置
    inline int64_t end_pos() { return end_pos_; }
-    // reference
+    // 和reference对应的序列长度，不是read包含碱基的个数
    inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }

-    // contig
+    // contig id
+    inline int32_t contig_id() { return b->core.tid; }
+    // 在contig内的开始位置
    inline int32_t contig_pos() { return b->core.pos; }
-    // contig
+    // 在contig内部的结束位置
    inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
-    // （AGTC）
+    // 序列的长度（AGTC字母个数）
    inline int16_t seq_len() { return b->core.l_qseq; }

-    // softclip
+/*
+    // 算上开头的softclip
    inline int32_t softclip_start() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
@ -61,18 +78,28 @@ struct BamWrap {
        return bc.pos;
    }

-    // softclip
+    inline int64_t global_softclip_start() { 
+        return softclip_start()  + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT); 
+    }
+
+    // 算上结尾的softclip，闭区间
    inline int32_t softclip_end() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
-        const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]);
+        const int idx = bc.n_cigar - 1;
+        if (idx < 0) return bam_pos(end_pos_);
+        const char c = bam_cigar_opchr(cigar[idx]);
+        const int len = bam_cigar_oplen(cigar[idx]);
        if (c == 'S')
            return bam_pos(end_pos_) + len;
        return bam_pos(end_pos_);
    }

-    // softclip
+    inline int64_t global_softclip_end() {
+        return softclip_end() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
+    }
+
+    // 右边softclip的长度
    inline int32_t right_softclip_len() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
@ -82,8 +109,9 @@ struct BamWrap {
            return len;
        return 0;
    }
+*/

-    // 
+    // 获取序列
    inline std::string sequence() {
        ostringstream oss;
        char *seq = (char *)bam_get_seq(b);
@ -96,9 +124,9 @@ struct BamWrap {
        return std::move(oss.str());
    }

-    // 
+    // 获取名字
    inline const char *query_name() { return bam_get_qname(b); }
-    // cigar 
+    // 获取cigar 字符串
    inline string cigar_str() {
        ostringstream oss;
        const uint32_t *cigar = bam_get_cigar(b);
@ -111,10 +139,10 @@ struct BamWrap {
        return std::move(oss.str());
    }

-    // 
+    // 占用的内存大小
    inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; }

-    // cigarinsert
+    // 获取cigar中insert的总长度
    inline int32_t insert_cigar_len() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
@ -128,7 +156,7 @@ struct BamWrap {
        return ret;
    }

-    // cigardelete
+    // 获取cigar中delete的总长度
    inline int32_t del_cigar_len() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
@ -142,7 +170,7 @@ struct BamWrap {
        return ret;
    }

-    // sam read
+    // 计算sam read的终点位置，相对参考基因组
    static inline int64_t BamEndPos(const bam1_t *b) {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
@ -156,6 +184,20 @@ struct BamWrap {
        return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
    };

+    // 计算read的有效长度，即除了softclip和hardclip之外的长度
+    static inline int BamEffectiveLength(const bam1_t *b) {
+        const uint32_t *cigar = bam_get_cigar(b);
+        const bam1_core_t &bc = b->core;
+        int effective_len = 0;
+        for (int i = 0; i < bc.n_cigar; ++i) {
+            const char c = bam_cigar_opchr(cigar[i]);
+            const int len = bam_cigar_oplen(cigar[i]);
+            if (c == 'I' || c == 'N' || c == 'M' || c == '=' || c == 'X')
+                effective_len += len;
+        }
+        return effective_len;
+    };
+
    bool HasWellDefinedFragmentSize() {
        const bam1_core_t &bc = b->core;
        bool hasWellDefinedFragmentSize = true;
@ -170,20 +212,25 @@ struct BamWrap {
        return hasWellDefinedFragmentSize;
    }

-    // bamadapterBoundary
+    // 计算bam的adapterBoundary
    int GetAdapterBoundary() {
        const bam1_core_t &bc = b->core;
-        int adapterBoundary;
+        int adapterBoundary = INT_MIN;
        if (!HasWellDefinedFragmentSize())
            adapterBoundary = INT_MIN;
        else if (bc.flag & BAM_FREVERSE)
            adapterBoundary = bc.mpos - 1;
        else
-            adapterBoundary = bc.pos + abs(bc.isize);  // GATK4.0  GATK3.5，3.5+1
+            adapterBoundary = bc.pos + abs(bc.isize);  // GATK4.0 和 GATK3.5不一样，3.5的这里+1
        return adapterBoundary;
    }

-    // I
+    // 检测adapter boundary是否在read范围内
+    bool IsAdapterInRead(int adapterBoundary) {
+        return (adapterBoundary != INT_MIN && (adapterBoundary >= contig_pos() && adapterBoundary <= contig_end_pos()));
+    }
+
+    // 获取开头的I的长度
    inline int GetHeadInsertLen() {
        int insLen = 0;
        const uint32_t *cigar = bam_get_cigar(b);
@ -200,8 +247,8 @@ struct BamWrap {
        return insLen;
    }

-    // soft clip(HS，？,
-    // IS？)
+    // 获取soft clip开始位置(能处理H和S相连的情况，有这种情况么？,
+    // 注意开头的I要当做S？)
    inline int64_t GetSoftStart() {
        int64_t softStart = b->core.pos;
        const uint32_t *cigar = bam_get_cigar(b);
@ -209,7 +256,8 @@ struct BamWrap {
        for (int i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
-            if (c == 'S' || c == 'I')
+            // if (c == 'S' || c == 'I')
+            if (c == 'S')
                softStart -= len;
            else if (c != 'H')
                break;
@ -217,7 +265,36 @@ struct BamWrap {
        return softStart;
    }

-    // unclipped(hardclip)
+    /**
+     * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
+     *
+     * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
+     *
+     * @return the unclipped end of the read taking soft clips (but not hard clips) into account
+     */
+    inline int64_t GetSoftEnd() {
+        int64_t softEnd = contig_end_pos();
+        const uint32_t* cigar = bam_get_cigar(b);
+        const bam1_core_t& bc = b->core;
+        bool foundAlignedBase = false;
+        for (int i = bc.n_cigar - 1; i >= 0; --i) {
+            const char c = bam_cigar_opchr(cigar[i]);
+            const int len = bam_cigar_oplen(cigar[i]);
+            // if (c == 'S' || c == 'I')
+            if (c == 'S')
+                softEnd += len;
+            else if (c != 'H') {
+                foundAlignedBase = true;
+                break;
+            }
+        }
+        if (!foundAlignedBase) {  // for example 64H14S, the soft end is actually the same as the alignment end
+            softEnd = contig_end_pos();
+        }
+        return softEnd;
+    }
+
+    // 获取unclipped开始位置(包括hardclip)
    inline int64_t GetUnclippedStart() {
        int64_t start = b->core.pos;
        const uint32_t *cigar = bam_get_cigar(b);
@ -233,7 +310,7 @@ struct BamWrap {
        return start;
    }

-    // unclipped(hardclip)
+    // 获取unclipped结束位置(包括hardclip)
    inline int64_t GetUnclippedEnd() {
        int64_t end_pos = bam_endpos(b);
        const uint32_t *cigar = bam_get_cigar(b);
@ -249,7 +326,7 @@ struct BamWrap {
        return end_pos - 1;
    }

-    /*  */
+    /* 获取碱基质量分数的加和 */
    /** Calculates a score for the read which is the sum of scores over Q15. */
    inline int GetSumOfBaseQualities() {
        int score = 0;
@ -262,9 +339,9 @@ struct BamWrap {
        return score;
    }

-    /* flag */
+    /* 与flag相关的检测 */

-    /*  unmapped */
+    /* 没有比对上 unmapped */
    inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }

    /* Template having multiple segments in sequencing  */
@ -313,7 +390,7 @@ struct BamWrap {
     */
    bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }

-    /*  */
+    /* 其他的一些信息 */
    inline int GetReferenceLength() {
        int length = 0;
        const uint32_t *cigar = bam_get_cigar(b);
@ -336,26 +413,26 @@ struct BamWrap {
        return length;
    }

-    // bam，
+    // 计算bam的全局位置，算上染色体序号和比对位置
    static inline int64_t bam_global_pos(bam1_t *b) {
        return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
    }
    static inline int64_t bam_global_pos(int tid, int pos) {
        return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
    }
-    // bam
+    // 根据全局位置获取bam的染色体序号
    static inline int32_t bam_tid(int64_t global_pos) {
        const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
        const int64_t high_tid = global_pos & mask;
        return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
    }
-    // bam()
+    // 根据全局位置获取bam的比对位置(染色体内)
    static inline int32_t bam_pos(int64_t global_pos) {
        const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
        return (int32_t)(global_pos & mask);
    }

-    // 
+    // 设置是否冗余的标记
    void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }

    void setFlag(bool flag, int bit) {
--- a/src/util/binary_util.h
+++ b/src/util/binary_util.h
@ -0,0 +1,131 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using std::ifstream;
+using std::ofstream;
+using std::ostringstream;
+using std::string;
+using std::vector;
+
+using namespace std;
+
+// 二进制读写相关
+struct BinaryUtil {
+    static void WriteInt(ofstream& out, int val) {
+        uint32_t i = (uint32_t)val;
+        out << (char)(i & 0xFF) << (char)((i >> 8) & 0xFF) << (char)((i >> 16) & 0xFF) << (char)((i >> 24) & 0xFF);
+    };
+
+    static void WriteLong(ofstream& out, uint64_t val) {
+        uint64_t i = val;
+        out << (char)(i & 0xFF) << (char)((i >> 8) & 0xFF) << (char)((i >> 16) & 0xFF) << (char)((i >> 24) & 0xFF) << (char)((i >> 32) & 0xFF)
+            << (char)((i >> 40) & 0xFF) << (char)((i >> 48) & 0xFF) << (char)((i >> 56) & 0xFF);
+    };
+
+    static void WriteStr(ofstream& out, const string& s) {
+        for (int i = 0; i < s.size(); ++i) out << s[i];
+        out << (char)0;
+    }
+
+    static bool ReadInt(char* buf, uint64_t total, uint64_t* cur, int* res) {
+        char b1, b2, b3, b4;
+        if (*cur + 4 > total)
+            return false;
+        b1 = buf[(*cur)++];
+        b2 = buf[(*cur)++];
+        b3 = buf[(*cur)++];
+        b4 = buf[(*cur)++];
+        *res = (((uint32_t)(uint8_t)b4) << 24) + (((uint32_t)(uint8_t)b3) << 16) + (((uint32_t)(uint8_t)b2) << 8) + (((uint32_t)(uint8_t)b1));
+        return true;
+    };
+
+    static bool ReadInt(ifstream& ifs, int* res) {
+        // if(ifs.read((char*)res, sizeof(*res))) return true;
+        char b[4];
+        if (!ifs.read(&b[0], 1))
+            return false;
+        if (!ifs.read(&b[1], 1))
+            return false;
+        if (!ifs.read(&b[2], 1))
+            return false;
+        if (!ifs.read(&b[3], 1))
+            return false;
+        uint64_t cur = 0;
+        return ReadInt((char*)b, 4, &cur, res);
+    }
+
+    static bool ReadLong(char* buf, uint64_t total, uint64_t* cur, uint64_t* res) {
+        char b1, b2, b3, b4, b5, b6, b7, b8;
+        if (*cur + 8 > total)
+            return false;
+        b1 = buf[(*cur)++];
+        b2 = buf[(*cur)++];
+        b3 = buf[(*cur)++];
+        b4 = buf[(*cur)++];
+        b5 = buf[(*cur)++];
+        b6 = buf[(*cur)++];
+        b7 = buf[(*cur)++];
+        b8 = buf[(*cur)++];
+        *res = (((uint64_t)(uint8_t)b8) << 56) + (((uint64_t)(uint8_t)b7) << 48) + (((uint64_t)(uint8_t)b6) << 40) + (((uint64_t)(uint8_t)b5) << 32) +
+               (((uint64_t)(uint8_t)b4) << 24) + (((uint64_t)(uint8_t)b3) << 16) + (((uint64_t)(uint8_t)b2) << 8) + (((uint64_t)(uint8_t)b1));
+        return true;
+    };
+
+    static bool ReadLong(ifstream& ifs, uint64_t* res) {
+        // if(ifs.read((char*)res, sizeof(*res))) return true;
+        char b[8];
+        if (!ifs.read(&b[0], 1))
+            return false;
+        if (!ifs.read(&b[1], 1))
+            return false;
+        if (!ifs.read(&b[2], 1))
+            return false;
+        if (!ifs.read(&b[3], 1))
+            return false;
+        if (!ifs.read(&b[4], 1))
+            return false;
+        if (!ifs.read(&b[5], 1))
+            return false;
+        if (!ifs.read(&b[6], 1))
+            return false;
+        if (!ifs.read(&b[7], 1))
+            return false;
+        uint64_t cur = 0;
+        return ReadLong((char*)b, 8, &cur, res);
+    }
+    static bool ReadStr(ifstream& ifs, string* res) {
+        char b;
+        res->clear();
+        if (!ifs.read(&b, 1))
+            return false;
+        while ((int)b != 0) {
+            res->push_back(b);
+            if (!ifs.read(&b, 1))
+                return false;
+        }
+        return true;
+    }
+    static bool ReadStr(char* buf, uint64_t total, uint64_t* cur, string* res) {
+        char b;
+        res->clear();
+        if (*cur >= total)
+            return false;
+        b = buf[(*cur)++];
+        while ((int)b != 0) {
+            res->push_back(b);
+            if (*cur >= total)
+                return false;
+            b = buf[(*cur)++];
+        }
+        return true;
+    }
+};
--- a/src/util/interval.cpp
+++ b/src/util/interval.cpp
@ -0,0 +1,295 @@
+/* 
+     Description: 处理intervals
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2019/11/24
+*/
+
+#include "interval.h"
+
+#include <algorithm>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <iostream>
+
+#include <htslib/sam.h>
+
+#include "utils.h"
+#include "bam_wrap.h"
+
+using std::min;
+using std::max;
+using std::string;
+using std::ifstream;
+using std::stringstream;
+
+using namespace std;
+
+// 构造函数
+Interval::Interval() : Interval(0, 0) {}
+Interval::Interval(int64_t l, int64_t r) : left(l), right(r) {}
+
+// 比较函数
+bool Interval::operator<(const Interval& other) {
+    if (left == other.left) {
+        return right < other.right;
+    }
+    return left < other.left;
+}
+
+// 是否有重叠
+bool Interval::overlaps(const Interval &other) {
+    return left <= other.right && right >= other.left;
+}
+
+// 两个interval的合并
+Interval& Interval::spanWith(const Interval &other) {
+    left = min(left, other.left);
+    right = max(right, other.right);
+    return *this;
+}
+
+// 返回两个interval的交集，不改变当前interval
+Interval Interval::intersect(const Interval &that) const {
+    Interval val;
+    val.left = max(left, that.left);
+    val.right = min(right, that.right);
+    return val;
+}
+
+
+/*
+ * 合并两个interval arr，取相交区域的交集, interval arr都是排序后的
+ */
+void Interval::IntersectIntervals(const IntervalArr &a_arr, 
+                                  const IntervalArr &b_arr, 
+                                  IntervalArr *r_arr) {
+    if (a_arr.size() < 1 || b_arr.size() < 1) return;
+    int ai=0, bi=0;
+    const Interval *last, *cur;
+    if (a_arr[ai].left < b_arr[bi].left) last = &a_arr[ai++];
+    else last = &b_arr[bi++];
+    while (ai < a_arr.size() && bi < b_arr.size()) {
+        if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
+        else cur = &b_arr[bi++];
+        if (last->right < cur->left) {
+            last = cur; continue;
+        } else if (last->right > cur->right) {
+            r_arr->push_back(*cur); 
+        } else {
+            r_arr->push_back(Interval(cur->left, last->right));
+            last = cur;
+        }
+    }
+    const IntervalArr *arrp;
+    int ii;
+    if (ai < a_arr.size()) { arrp = &a_arr; ii = ai;} 
+    else { arrp = &b_arr; ii = bi; }
+    const IntervalArr &arr = *arrp;
+    while(ii < arr.size()) {
+        cur = &arr[ii++];
+        if (last->right < cur->left) {
+            break;
+        } else if (last->right > cur->right) {
+            r_arr->push_back(*cur); 
+        } else {
+            r_arr->push_back(Interval(cur->left, last->right));
+            break;
+        }
+    }
+}
+
+/*
+ * 合并两个interval arr，取并集
+ */
+void Interval::UnionIntervals(const IntervalArr &a_arr,
+                              const IntervalArr &b_arr,
+                              IntervalArr *r_arr) {
+    Interval tmp;
+    const Interval *cur;
+    Interval *last;
+    int ai=0, bi=0;
+    if (a_arr.size() < 1) { *r_arr = b_arr; return; }
+    if (b_arr.size() < 1) { *r_arr = a_arr; return; }
+    r_arr->clear();
+
+    if (a_arr[ai].left < b_arr[bi].left) tmp = a_arr[ai++];
+    else tmp = b_arr[bi++];
+    last = &tmp;
+    while(ai < a_arr.size() && bi < b_arr.size()) {
+        if (a_arr[ai].left < b_arr[bi].left) cur = &a_arr[ai++];
+        else cur = &b_arr[bi++];
+        if (last->right < cur->left) {
+            r_arr->push_back(*last);
+            *last = *cur;
+        } else {
+            last->right = max(last->right, cur->right);
+        }
+    }
+    const IntervalArr *arrp;
+    int ii;
+    if (ai < a_arr.size()) { arrp = &a_arr; ii = ai; } 
+    else { arrp = &b_arr; ii = bi; }
+    const IntervalArr &arr = *arrp;
+
+    while(ii < arr.size()) {
+        cur = &arr[ii++];
+        if (last->right < cur->left) {
+            r_arr->push_back(*last);
+            *last = *cur;
+        } else {
+            last->right = max(last->right, cur->right);
+        }
+    }
+    r_arr->push_back(*last);
+}
+
+/*
+ * 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集
+ */
+int64_t Interval::MergeIntervals(const IntervalArr &n_arr,
+                                  const IntervalArr &t_arr,
+                                  IntervalArr &in_arr,
+                                  int64_t start_loc, // 闭区间
+                                  int64_t *end_loc, // 开区间
+                                  IntervalArr *r_arr) {
+    IntervalArr tmp_arr;
+    const int64_t end_loc_val = *end_loc;
+    if (in_arr.size() < 1) { // 如果输入的interval为空，则使用tumor normal覆盖的interval
+        UnionIntervals(n_arr, t_arr, &tmp_arr);
+    } else {
+        IntervalArr mid_arr;
+        UnionIntervals(n_arr, t_arr, &mid_arr);
+        IntersectIntervals(mid_arr, in_arr, &tmp_arr);
+    }
+    for(int i=tmp_arr.size()-1; i>=0; --i) {
+        if (tmp_arr[i].left >= end_loc_val) {
+            tmp_arr.pop_back(); // 删除该元素
+            continue;
+        }
+        tmp_arr[i].right = min(tmp_arr[i].right, end_loc_val - 1); // end_loc是开区间
+        break;
+    }
+    for (int i=0; i<tmp_arr.size(); ++i) {
+        if (tmp_arr[i].right < start_loc) {
+            continue;
+        }
+        if (tmp_arr[i].left < start_loc) {
+            r_arr->push_back(Interval(start_loc, tmp_arr[i].right));
+        } else {
+            r_arr->push_back(tmp_arr[i]);
+        }
+    }
+
+    int next_i = 0;
+    while(next_i < in_arr.size() && in_arr[next_i].right < end_loc_val) ++next_i;
+    if (next_i < in_arr.size()) {
+        if (end_loc_val < in_arr[next_i].left) {
+            *end_loc = in_arr[next_i].left; // 更新本次处理的终点
+        } else {
+            in_arr[next_i].left = end_loc_val; // 更新panel
+        }
+        int i=0, j=next_i;
+        for (; j<in_arr.size(); ++i, ++j) {
+            in_arr[i] = in_arr[j];
+        }
+        in_arr.resize(i);
+    } else {
+        in_arr.clear();
+    }
+
+    int64_t locus_num = 0;
+    for (int i=0; i<r_arr->size(); ++i) {
+        locus_num += (*r_arr)[i].right - (*r_arr)[i].left + 1; 
+    }
+    return locus_num;
+}
+
+/*
+ * 读取interval文件
+ */
+void Interval::ReadInterval(const string &interval_fn, 
+                            bam_hdr_t* header, 
+                            int interval_padding,
+                            IntervalArr *r_arr) {
+    ifstream interval_fs(interval_fn);
+    string one_line;
+    IntervalArr intervals;
+    getline(interval_fs, one_line);
+    while (!interval_fs.eof()) {
+        if (one_line[0] == '@') {
+            getline(interval_fs, one_line);
+            continue;
+        }
+        stringstream ss_line(one_line);
+        string contig_name;
+        ss_line >> contig_name;
+        int itid = sam_hdr_name2tid(header, contig_name.c_str());
+        if (itid < 0) error("[%s] interval file has unknown contig name [%s]\n", __func__, contig_name.c_str());
+        int64_t tid = (int64_t)itid;
+        tid <<= CONTIG_SHIFT;
+        int64_t start, stop;
+        ss_line >> start >> stop;
+        // interval文件是1-based，所以这里要减去1
+        intervals.push_back(Interval(tid + start - 1, tid + stop -1));
+        getline(interval_fs, one_line);
+    }
+    sort(intervals.begin(), intervals.end());
+    if (intervals.size() > 0) {
+        Interval new_span(intervals[0].left-interval_padding, intervals[0].right+interval_padding);
+        for (int i=1; i<intervals.size(); ++i) {
+            if (intervals[i].left - interval_padding > new_span.right) {
+                r_arr->push_back(new_span);
+                new_span.left = intervals[i].left - interval_padding;
+                new_span.right = intervals[i].right + interval_padding;
+            } else {
+                new_span.right = max(new_span.right, intervals[i].right + interval_padding);
+            }
+        }
+        r_arr->push_back(new_span);
+    }
+    interval_fs.close();
+}
+
+/*
+ * 将interval相连的区域合并
+ */
+void Interval::ShrinkInterval(IntervalArr *ivap) {
+    if (ivap->size() < 1) return;
+    IntervalArr &iva = *ivap;
+    IntervalArr tiva = iva;
+    iva.clear();
+    Interval iv;
+    iv.left = tiva[0].left;
+    iv.right = tiva[0].right;
+    for (int i=1; i<tiva.size(); ++i) {
+        if (iv.right+1 < tiva[i].left) {
+            iva.push_back(iv);
+            iv.left = tiva[i].left;
+        }
+        iv.right = tiva[i].right;
+    }
+    iva.push_back(iv);
+}
+
+/*
+ * 根据header信息，扩展interval
+ */
+Interval Interval::ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header) {
+    Interval result;
+    result.left = start;
+    result.right = end;
+
+    int64_t ext_left = start - expandVal;
+    int64_t ext_right = end + expandVal;
+    int tid = BamWrap::bam_tid(start);
+    uint32_t contig_len = header->target_len[tid];
+    result.left = max(BamWrap::bam_global_pos(tid, 0), ext_left);
+    result.right = min(ext_right, contig_len - 1 + BamWrap::bam_global_pos(tid, 0));
+
+    return result;
+}
+
--- a/src/util/interval.h
+++ b/src/util/interval.h
@ -0,0 +1,108 @@
+/* 
+     Description: 处理intervals
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2019/11/24
+*/
+
+#ifndef INTERVAL_H_
+#define INTERVAL_H_
+
+#include <stdint.h>
+#include <vector>
+#include <string>
+#include <sstream>
+
+#include <htslib/sam.h>
+
+#include "bam_wrap.h"
+
+using namespace std;
+
+// 前向声明
+class Interval;
+typedef std::vector<Interval> IntervalArr;
+/*
+ * 闭区间
+ */
+struct Interval {
+    // const常量
+    const static int CONTIG_SHIFT = 40;
+    const static uint64_t POS_MASK = (1LL << CONTIG_SHIFT) - 1;
+
+    // 类变量
+    int64_t left;
+    int64_t right;
+
+    // 构造函数
+    Interval();
+    explicit Interval(int64_t l, int64_t r);
+    explicit Interval(int tid, int l, int r) : Interval(BamWrap::bam_global_pos(tid, l), BamWrap::bam_global_pos(tid, r)) {}
+    // 获取tid
+    int contig() { return left >> CONTIG_SHIFT; }
+    int contigLeft() { return left & POS_MASK; }
+    int contigRight() { return right & POS_MASK; }
+
+    // 比较函数
+    bool operator<(const Interval &other);
+    // 是否有重叠
+    bool overlaps(const Interval &other);
+    // 两个interval的合并, 会改变当前interval
+    Interval& spanWith(const Interval &other);
+    // 返回两个interval的交集，不改变当前interval
+    Interval intersect(const Interval &that) const;
+
+    // for debug
+    string toString() const {
+        ostringstream oss;
+        oss << BamWrap::bam_tid(left) + 1 << ":"
+            << BamWrap::bam_pos(left) + 1 << "-"
+            << BamWrap::bam_pos(right) + 1;
+
+        return oss.str();
+    }
+    /*
+     * 合并两个interval arr，取相交区域的交集, interval arr都是排序后的
+     */
+    static void IntersectIntervals(const IntervalArr &a_arr, 
+                                   const IntervalArr &b_arr, 
+                                   IntervalArr *r_arr);
+    /*
+     * 合并两个interval arr，相交的区域取并集
+     */
+    static void UnionIntervals(const IntervalArr &a_arr,
+                               const IntervalArr &b_arr,
+                               IntervalArr *r_arr);
+    /*
+     * 将有read覆盖的区域和参数提供的interval文件中的区域做一个交集
+     */
+    static int64_t MergeIntervals(const IntervalArr &n_arr,
+                                   const IntervalArr &t_arr,
+                                   IntervalArr &in_arr, // 会更改
+                                   int64_t start_loc, // 闭区间
+                                   int64_t *end_loc, // 开区间, 会更改
+                                   IntervalArr *r_arr);
+    /*
+     * 读取interval文件
+     */
+    static void ReadInterval(const std::string &interval_fn, 
+                             bam_hdr_t* header, 
+                             int interval_padding,
+                             IntervalArr *r_arr);
+    /*
+     * 将interval相连的区域合并
+     */
+    static void ShrinkInterval(IntervalArr *iva);
+
+    /*
+     * 根据header信息，扩展interval
+     */
+    static Interval ExpandInterval(int64_t start, int64_t end, int expandVal, bam_hdr_t* header);
+};
+
+
+#endif
+
+
--- a/src/util/linear_index.cpp
+++ b/src/util/linear_index.cpp
@ -0,0 +1,292 @@
+/*
+     Description: vcf index utils
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2019/11/23
+*/
+
+#include "linear_index.h"
+#include "bam_wrap.h"
+#include "binary_util.h"
+
+/*
+ * LinearIndex 类
+ */
+// 搜索区间参数对应的数据区域，闭区间
+void LinearIndex::SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len) {
+    int stid, spos, etid, epos, stid_origin, spos_origin, etid_origin, epos_origin;
+    stid = stid_origin = BamWrap::bam_tid(start);
+    spos = spos_origin = BamWrap::bam_pos(start);
+    etid = etid_origin = BamWrap::bam_tid(end);
+    epos = epos_origin = BamWrap::bam_pos(end);
+
+    while (stid < idx_.size() && idx_[stid].size() == 0) ++stid;
+    while (etid >= 0 && idx_[etid].size() == 0) --etid;
+    if (stid > etid || stid >= idx_.size()) {
+        *content_len = 0;
+        return;
+    }
+    int sb_idx, eb_idx;
+    if (stid == stid_origin) {
+        sb_idx = spos / idx_[stid].binWidth;
+        if (sb_idx >= idx_[stid].size()) {  // 开始区域没有对应的block
+            sb_idx = idx_[stid].size() - 1;
+            Block& sb = idx_[stid][sb_idx];
+            *file_pos = sb.startPosition + sb.size;
+        } else {
+            Block& sb = idx_[stid][sb_idx];
+            *file_pos = sb.startPosition;
+        }
+    } else {
+        sb_idx = 0;
+        Block& sb = idx_[stid][sb_idx];
+        *file_pos = sb.startPosition;
+    }
+    if (etid == etid_origin) {
+        eb_idx = epos / idx_[etid].binWidth;
+        if (eb_idx >= idx_[etid].size()) {
+            eb_idx = idx_[etid].size() - 1;
+        }
+    } else {
+        eb_idx = idx_[etid].size() - 1;
+    }
+    Block& eb = idx_[etid][eb_idx];
+    if (eb.startPosition + eb.size > *file_pos)
+        *content_len = eb.startPosition + eb.size - *file_pos;
+    else
+        *content_len = 0;
+}
+
+// 读入index文件信息
+bool LinearIndex::ReadIndex(const string& idx_fn) {
+    if (bam_hdr_ == NULL)
+        return false;
+    ifstream ifs(idx_fn, ios::in | ios::binary);
+    // 如果idx文件不存在，则创建 todo
+    if (!ifs.good())
+        return false;
+    // 全部读进来再处理
+    uint64_t fsize = 0;
+    uint64_t cur = 0;
+    ifs.seekg(0, ios::end);
+    fsize = (uint64_t)ifs.tellg();
+    ifs.seekg(0, ios::beg);
+    char* buf = (char*)malloc(fsize + 10);
+    if (!ifs.read(buf, fsize)) {
+        free(buf);
+        ifs.close();
+        return false;
+    }
+    ifs.close();
+    int int_val, version, flags;
+    uint64_t long_val;
+    string str_val;
+
+    // read index header
+    BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
+    // cout << "magic: " << int_val << endl;
+    BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
+    if (int_val != INDEX_TYPE) {
+        free(buf);
+        ifs.close();
+        return false;
+    }
+    // cout << "type: " << int_val << endl;
+    BinaryUtil::ReadInt(buf, fsize, &cur, &version);
+    // cout << "version: " << version << endl;
+    BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);
+    // cout << "path: " << str_val << endl;
+    BinaryUtil::ReadLong(buf, fsize, &cur, &vcf_fsize);  // vcf file size
+    // cout << "file size: " << long_val << endl;
+    BinaryUtil::ReadLong(buf, fsize, &cur, &long_val);  // test number
+    BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);    // md5
+    BinaryUtil::ReadInt(buf, fsize, &cur, &flags);
+
+    if (version < 3 && (flags & 0x8000) == 0x8000) {
+        BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
+        if (int_val < 0)
+            return false;
+        for (int i = 0; i < int_val; ++i) {
+            BinaryUtil::ReadStr(buf, fsize, &cur, &str_val);
+            BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
+        }
+    }
+    if (version >= 3) {
+        int nProperties;
+        BinaryUtil::ReadInt(buf, fsize, &cur, &nProperties);
+        string key, val;
+        while (nProperties-- > 0) {
+            BinaryUtil::ReadStr(buf, fsize, &cur, &key);
+            BinaryUtil::ReadStr(buf, fsize, &cur, &val);
+            properties_[key] = val;
+        }
+    }
+    // read index content
+    int n_chrom;
+    BinaryUtil::ReadInt(buf, fsize, &cur, &n_chrom);
+    int last_tid = 0;
+    string last_name;
+    int bam_tid = 0;  // tid从0开始，索引的染色体也要补充一个数据, 使得idx_ vector与tid一一对应
+    while (n_chrom-- > 0) {
+        ChrIndex ci;
+        BinaryUtil::ReadStr(buf, fsize, &cur, &ci.name);
+        BinaryUtil::ReadInt(buf, fsize, &cur, &ci.binWidth);
+        int nBins;
+        BinaryUtil::ReadInt(buf, fsize, &cur, &nBins);
+        BinaryUtil::ReadInt(buf, fsize, &cur, &ci.longestFeature);
+        BinaryUtil::ReadInt(buf, fsize, &cur, &int_val);
+        BinaryUtil::ReadInt(buf, fsize, &cur, &ci.nFeatures);
+        ci.blocks.reserve(nBins);
+        int tid = sam_hdr_name2tid(bam_hdr_, ci.name.c_str());
+        if (tid < 0) {
+            if (ci.name == last_name) {
+                tid = last_tid;
+            } else {
+                tid = ++last_tid;
+                last_name = ci.name;
+            }
+        }
+        while (bam_tid + 1 < tid) {
+            idx_.push_back(ChrIndex());  // 补充空染色体
+            ++bam_tid;
+        }
+        bam_tid = tid;
+        ci.tid = tid;
+        uint64_t pos;
+        uint64_t nextPos;
+        BinaryUtil::ReadLong(buf, fsize, &cur, &pos);
+        for (int i = 0; i < nBins; ++i) {
+            BinaryUtil::ReadLong(buf, fsize, &cur, &nextPos);
+            ci.blocks.push_back(Block(pos, nextPos - pos));
+            pos = nextPos;
+        }
+        idx_.push_back(ci);
+    }
+    sort(idx_.begin(), idx_.end());
+    ifs.close();
+    free(buf);
+    return true;
+}
+
+/*
+ * LinearIndex::ChrIndex 类
+ */
+void LinearIndex::ChrIndex::write(ofstream& out) const {
+    BinaryUtil::WriteStr(out, name);
+    BinaryUtil::WriteInt(out, binWidth);
+    BinaryUtil::WriteInt(out, blocks.size());
+    BinaryUtil::WriteInt(out, longestFeature);
+    BinaryUtil::WriteInt(out, 0);
+    BinaryUtil::WriteInt(out, nFeatures);
+    uint64_t pos = 0, si = 0;
+    for (int i = 0; i < blocks.size(); ++i) {
+        pos = blocks[i].startPosition;
+        si = blocks[i].size;
+        BinaryUtil::WriteLong(out, pos);
+    }
+    BinaryUtil::WriteLong(out, pos + si);
+}
+
+/*
+ * LinearIndexCreator类
+ */
+// 初始化索引文件头部信息
+void LinearIndexCreator::InitHeaderDict(bam_hdr_t* hdr) {
+    for (int i = 0; i < hdr->n_targets; ++i) {
+        v_contig_name_.push_back(hdr->target_name[i]);
+        contig_name_to_id_[hdr->target_name[i]] = i;
+        contig_len_[hdr->target_name[i]] = hdr->target_len[i];
+        ++n_properties_;
+    }
+}
+
+// 添加一条vcf记录
+void LinearIndexCreator::AddFeature(const Feature& ft, uint64_t f_pos) {  // f_pos是vcf文件当前正要写入的位置
+    if (idx_.size() == 0 || idx_.back().tid != ft.tid) {
+        if (idx_.size() > 0) {
+            for (int i = 0; i < blocks_.size() - 1; ++i) {
+                blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition;
+                idx_.back().blocks.push_back(blocks_[i]);
+            }
+            blocks_.back().size = f_pos - blocks_.back().startPosition;
+            idx_.back().blocks.push_back(blocks_.back());
+        }
+        idx_.push_back(LinearIndex::ChrIndex(v_contig_name_[ft.tid], ft.tid, bin_width_));
+        blocks_.clear();
+        blocks_.push_back(Block(f_pos, 0));
+        longest_feature_ = 0;
+    }
+    while (ft.start > blocks_.size() * bin_width_) blocks_.push_back(Block(f_pos, 0));
+    if (ft.FeatureLen() > longest_feature_) {
+        longest_feature_ = ft.FeatureLen();
+        idx_.back().longestFeature = longest_feature_;
+    }
+    ++idx_.back().nFeatures;
+    ++FEATURE_COUNT_;
+    all_feature_len += ft.FeatureLen();
+}
+
+// 添加所有记录完毕
+void LinearIndexCreator::FinalizeIndex(uint64_t f_pos) {
+    if (f_pos == 0)
+        error("[%s] Error: finalize index failed\n", __func__);
+    if (blocks_.size() > 0) {
+        for (int i = 0; i < blocks_.size() - 1; ++i) {
+            blocks_[i].size = blocks_[i + 1].startPosition - blocks_[i].startPosition;
+            idx_.back().blocks.push_back(blocks_[i]);
+        }
+        blocks_.back().size = f_pos - blocks_.back().startPosition;
+        idx_.back().blocks.push_back(blocks_.back());
+        blocks_.clear();
+    }
+    FEATURE_LENGTH_MEAN_ = (double)all_feature_len / (double)FEATURE_COUNT_;
+    index_file_size_ = f_pos;
+}
+
+// 写入index文件
+void LinearIndexCreator::WriteIndex(const string& out_fn) {
+    ofstream out;
+    out.open(out_fn, ios::out | ios::binary);
+
+    BinaryUtil::WriteInt(out, MAGIC_NUMBER);
+    BinaryUtil::WriteInt(out, LinearIndex::INDEX_TYPE);
+    BinaryUtil::WriteInt(out, LinearIndex::INDEX_VERSION);
+    BinaryUtil::WriteStr(out, out_fn);
+    BinaryUtil::WriteLong(out, index_file_size_);
+    BinaryUtil::WriteLong(out, -1);
+    BinaryUtil::WriteStr(out, "");
+    BinaryUtil::WriteInt(out, flags_);
+    n_properties_ += 4;
+    BinaryUtil::WriteInt(out, n_properties_);
+    ostringstream oss;
+    oss << FEATURE_LENGTH_MEAN_;
+    BinaryUtil::WriteStr(out, "FEATURE_LENGTH_MEAN");
+    BinaryUtil::WriteStr(out, oss.str());
+    oss.str("");
+    oss << FEATURE_LENGTH_STD_DEV_;
+    BinaryUtil::WriteStr(out, "FEATURE_LENGTH_STD_DEV");
+    BinaryUtil::WriteStr(out, oss.str());
+    oss.str("");
+    oss << MEAN_FEATURE_VARIANCE_;
+    BinaryUtil::WriteStr(out, "MEAN_FEATURE_VARIANCE");
+    BinaryUtil::WriteStr(out, oss.str());
+    oss.str("");
+    oss << FEATURE_COUNT_;
+    BinaryUtil::WriteStr(out, "FEATURE_COUNT");
+    BinaryUtil::WriteStr(out, oss.str());
+    for (int i = 0; i < v_contig_name_.size(); ++i) {
+        const string& cn = v_contig_name_[i];
+        oss.str("");
+        oss << contig_len_[cn];
+        BinaryUtil::WriteStr(out, "DICT:" + cn);
+        BinaryUtil::WriteStr(out, oss.str());
+    }
+    BinaryUtil::WriteInt(out, idx_.size());
+    for (int i = 0; i < idx_.size(); ++i) {
+        const LinearIndex::ChrIndex& ci = idx_[i];
+        ci.write(out);
+    }
+    out.close();
+}
--- a/src/util/linear_index.h
+++ b/src/util/linear_index.h
@ -0,0 +1,129 @@
+/*
+     Description: vcf index utils
+
+     Copyright : All right reserved by ICT
+
+     Author : Zhang Zhonghai
+     Date : 2019/11/23
+*/
+
+#ifndef INDEX_H_
+#define INDEX_H_
+
+#include <htslib/sam.h>
+#include <stddef.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "utils.h"
+
+using std::ifstream;
+using std::ios;
+using std::ofstream;
+using std::ostringstream;
+using std::sort;
+using std::string;
+using std::unordered_map;
+using std::vector;
+
+using std::cout;
+using std::endl;
+
+struct Block {
+    Block(uint64_t start, uint64_t si) : startPosition(start), size(si) {}
+    uint64_t startPosition;
+    uint64_t size;
+};
+
+struct Feature {
+    int tid;
+    int start;  // 闭区间
+    int end;    // 闭区间
+    Feature(int ti, int b, int e) : tid(ti), start(b), end(e) {}
+    inline int FeatureLen() const { return end - start + 1; }
+};
+
+struct LinearIndex {
+    const static int MAX_FEATURES_PER_BIN = 100;
+    const static int INDEX_TYPE = 1;
+    const static int INDEX_VERSION = 3;
+    const static int MAX_BIN_WIDTH = 1024000;
+
+    LinearIndex() {}
+    LinearIndex(bam_hdr_t* hdr) : bam_hdr_(hdr) {}
+    void SetHeader(bam_hdr_t* hdr) { bam_hdr_ = hdr; }
+
+    class ChrIndex;
+    vector<ChrIndex> idx_;
+    vector<string> vkey_;
+    vector<string> vval_;
+    unordered_map<string, string> properties_;
+    bam_hdr_t* bam_hdr_ = NULL;  // 这个应该换成bcf_hdr_t
+    uint64_t vcf_fsize = 0;
+
+    // 染色体索引信息
+    struct ChrIndex {
+        string name;
+        int tid;
+        int binWidth;
+        int longestFeature = 0;
+        int nFeatures = 0;
+        vector<Block> blocks;
+        ChrIndex() {};
+        ChrIndex(string& n, int ti, int bw) : name(n), tid(ti), binWidth(bw) {}
+        inline bool operator<(const ChrIndex& ci) const { return tid < ci.tid; };
+        inline Block& operator[](int pos) { return blocks[pos]; }
+        inline int size() { return blocks.size(); }
+        void write(ofstream& out) const;
+    };
+
+    inline ChrIndex& operator[](int tid) { return idx_[tid]; }
+
+    // 闭区间
+    void SearchInterval(int64_t start, int64_t end, int64_t* file_pos, int64_t* content_len);
+
+    // 读入index文件信息
+    bool ReadIndex(const string& idx_fn);
+};
+
+// 根据vcf数据创建index文件
+struct LinearIndexCreator {
+    const static int INDEX_VERSION = LinearIndex::INDEX_VERSION;
+    const static int MAGIC_NUMBER = 1480870228;
+    const static int DEFAULT_BIN_WIDTH = 8000;
+    int bin_width_ = DEFAULT_BIN_WIDTH;
+    string input_vcf_fn_;
+    string output_index_fn_;
+    int longest_feature_ = 0;
+    uint64_t index_file_size_ = 0;
+    int flags_ = 0;
+    int n_properties_ = 0;
+    float FEATURE_LENGTH_MEAN_ = 0.0;
+    float FEATURE_LENGTH_STD_DEV_ = 0.0;
+    float MEAN_FEATURE_VARIANCE_ = 0.0;
+    uint64_t FEATURE_COUNT_ = 0;
+    uint64_t all_feature_len = 0;
+
+    unordered_map<string, int> contig_name_to_id_;
+    unordered_map<string, int> contig_len_;
+    vector<string> v_contig_name_;
+    vector<LinearIndex::ChrIndex> idx_;
+    vector<Block> blocks_;
+
+    // 根据sam header初始化索引文件头部信息
+    void InitHeaderDict(bam_hdr_t* hdr);
+    // 添加一条记录
+    void AddFeature(const Feature& ft, uint64_t f_pos);  // f_pos是vcf文件当前正要写入的位置
+    // 添加记录完毕
+    void FinalizeIndex(uint64_t f_pos);
+    // 写入index文件
+    void WriteIndex(const string& out_fn);
+};
+
+#endif
--- a/src/util/murmur3.h
+++ b/src/util/murmur3.h
@ -1,5 +1,5 @@
 /*
-Description: Murmur
+Description: Murmur哈希

 Copyright : All right reserved by ICT

--- a/src/util/profiling.cpp
+++ b/src/util/profiling.cpp
@ -73,12 +73,12 @@ int DisplayProfiling(int nthread) {
    PRINT_GP(write);
    PRINT_GP(whole_process);
    
-    PRINT_TP(gen, nthread);
-    PRINT_TP(sort_frag, nthread);
-    PRINT_TP(sort_pair, nthread);
+    // PRINT_TP(gen, nthread);
+    // PRINT_TP(sort_frag, nthread);
+    // PRINT_TP(sort_pair, nthread);

    fprintf(stderr, "\n");
 #endif

    return 0;
-}
+}