检查optical duplication，用graph来检测，正在写这个

2024-08-22 02:28:36 +08:00 · 2024-08-22 02:28:36 +08:00 · 022be611cd
parent 38bc489004
commit 022be611cd
27 changed files with 2133 additions and 2007 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,6 @@
 # for fast-markdup
 *.sam
 *.log
 # ---> C++
 # Prerequisites
 *.d
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -13,10 +13,10 @@
            "program": "${workspaceRoot}/build/bin/picard_cpp",
            "args": [
                "MarkDuplicates",
-                "--INPUT", "/mnt/d/data/100w.bam",
+                "--INPUT", "~/data/bam/100w.bam",
                "--OUTPUT", "out.bam",
                "--METRICS_FILE", "metrics.txt",
-                "--num_threads", "12",
+                "--num_threads", "1",
                "--max_mem", "4G",
                "--verbosity", "DEBUG",
                "--asyncio", "true",
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -88,6 +88,13 @@
        "typeindex": "cpp",
        "typeinfo": "cpp",
        "valarray": "cpp",
-        "variant": "cpp"
+        "variant": "cpp",
        "__split_buffer": "cpp",
        "executor": "cpp",
        "io_context": "cpp",
        "netfwd": "cpp",
        "timer": "cpp",
        "__nullptr": "cpp",
        "__node_handle": "c"
    }
 }
--- a/build.sh
+++ b/build.sh
@ -1,8 +1,8 @@
 #!/bin/bash
-dir="/home/zzh/work/GeneKit/picard_cpp/build"
+dir="/home/zzh/work/ngs/picard_cpp/build"
 [ -d "$dir" ] && rm -rf "$dir"
 mkdir "$dir"
 cd "$dir"
 #cmake .. -DCMAKE_BUILD_TYPE=Debug
 cmake .. -DCMAKE_BUILD_TYPE=Release
-make -j 8
+make -j 16
--- a/run.sh
+++ b/run.sh
@ -1,9 +1,14 @@
-time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
+input=~/data/bam/zy_normal.bam
 #input=~/data/bam/zy_tumor.bam
 #input=~/data/bam/100w.bam
 time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
    MarkDuplicates \
-    --INPUT /mnt/d/data/zy_tumor.bam \
+    --INPUT $input \
-    --OUTPUT /mnt/d/data/out.bam \
+    --OUTPUT ~/data/bam/out.bam \
    --INDEX_FORMAT BAI \
    --num_threads 1 \
-    --max_mem 4G \
+    --max_mem 2G \
    --verbosity DEBUG \
    --asyncio true #\
    #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
--- a/src/common/hts/bam_buf.cpp
+++ b/src/common/hts/bam_buf.cpp
@ -13,116 +13,87 @@
 * BamBuf类
 */
 // 读取数据直到读完，或者缓冲区满
-int BamBuf::ReadBam()
+int BamBuf::ReadBam() {
 {
    int read_num = 0;
-    if (handle_last)
+    if (handle_last) {             // 处理上次读入的最后一个bam
-    { // 处理上次读入的最后一个bam
+        if (has_enough_space()) {  // 必须调用，在边界处调整memffset
        if (has_enough_space())
        { // 必须调用，在边界处调整memffset
            ++read_num;
            append_one_bam();
-        }
+        } else {
        else
        {
            return read_num;  // 还是没空间
        }
    }
-    while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0)
+    while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) {
    {
        bw->end_pos_ = BamWrap::BamEndPos(bw->b);
-        if (has_enough_space())
+        if (has_enough_space()) {  // 还有空间
        { // 还有空间
            append_one_bam();
            ++read_num;  // 放进缓存才算读取到
-        }
+        } else {
        else
        {
            break;
        }
    }
-    if (read_stat_ >= 0)
+    if (read_stat_ >= 0) {
    {
        handle_last = true;
-    }
+    } else {
    else
    {
        handle_last = false;
    }
    return read_num;
 }
 // 初始化缓存
-void BamBuf::Init(samFile *fp,
+void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
                  sam_hdr_t *hdr,
                  int64_t mem_size)
 {
    this->fp = fp;
    this->hdr = hdr;
    this->mem_size = mem_size;
    this->mem = (uint8_t *)malloc(mem_size);
    this->bw = (BamWrap *)malloc(sizeof(BamWrap));
    this->bw->b = bam_init1();
-    if (bw == NULL ||
+    if (bw == NULL || this->mem == NULL || this->bw->b == NULL) {
        this->mem == NULL ||
        this->bw->b == NULL)
    {
        fprintf(stderr, "allocate memory failed! Abort\n");
        exit(-1);
    }
 }
-void BamBuf::ClearBeforeIdx(size_t idxInBv)
+void BamBuf::ClearBeforeIdx(size_t idxInBv) {
 {
    if (idxInBv < 1)
        return;
    int i = 0, j = idxInBv;
-    for (; j < bv.size(); ++i, ++j)
+    for (; j < bv.size(); ++i, ++j) {
    {
        bv[i] = bv[j];
    }
    bv.resize(i);
    prepare_read();
 }
-void BamBuf::ClearAll()
+void BamBuf::ClearAll() {
 {
    bv.clear();
    prepare_read();
 }
 // 为下一次读取做准备, 计算一些边界条件
-inline void BamBuf::prepare_read()
+inline void BamBuf::prepare_read() {
 {
    // 计算余留的下次计算可能用到的bam所占的位置
-    if (bv.size() > 0)
+    if (bv.size() > 0) {
    {
        BamWrap *bw = bv[0];
        legacy_start = (int64_t)bw - (int64_t)mem;
        bw = bv.back();
        legacy_end = (int64_t)bw + bw->length() - (int64_t)mem;
-    }
+    } else {
    else
    {
        legacy_start = legacy_end = 0;
        mem_offset = 0;  // 上次没剩下，那就从头存储
    }
 }
 // 检查缓存是否还有空间
-inline bool BamBuf::has_enough_space()
+inline bool BamBuf::has_enough_space() {
 {
    const uint32_t bam_len = bw->length();
    int64_t potential_end = mem_offset + bam_len;
    if (legacy_start <= legacy_end)
        legacy_start += mem_size;
-    if (potential_end >= legacy_start)
+    if (potential_end >= legacy_start) {
    {
        return false;
    }
-    if (potential_end >= mem_size)
+    if (potential_end >= mem_size) {
    {
        mem_offset = 0;
    }
    int64_t virtual_offset = mem_offset;
@ -133,8 +104,7 @@ inline bool BamBuf::has_enough_space()
 }
 // 处理一个读取后的bam
-inline void BamBuf::append_one_bam()
+inline void BamBuf::append_one_bam() {
 {
    BamWrap *bwp = (BamWrap *)(mem + mem_offset);
    *bwp = *bw;
    bwp->b = (bam1_t *)((char *)bwp + sizeof(*bwp));
@ -148,12 +118,9 @@ inline void BamBuf::append_one_bam()
 }
 // 处理上次读入的最后一个read
-inline bool BamBuf::handle_last_read()
+inline bool BamBuf::handle_last_read() {
-{
+    if (handle_last) {             // 处理上次读入的最后一个bam
-    if (handle_last)
+        if (has_enough_space()) {  // 必须调用，在边界处调整memffset
    { // 处理上次读入的最后一个bam
        if (has_enough_space())
        { // 必须调用，在边界处调整memffset
            append_one_bam();
            handle_last = false;
            return true;
@ -166,49 +133,35 @@ inline bool BamBuf::handle_last_read()
 * AsyncIoBamBuf 类
 */
 // 初始化缓存
-void AsyncIoBamBuf::Init(samFile *fp,
+void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) {
-                         sam_hdr_t *hdr,
+    if (use_async_io_) {
                         int64_t mem_size)
 {
    if (use_async_io_)
    {
        buf1_.Init(fp, hdr, mem_size >> 1);
        buf2_.Init(fp, hdr, mem_size >> 1);
        pi_ = &buf1_;
        po_ = &buf2_;
        tid_ = (pthread_t *)malloc(sizeof(pthread_t));
-    }
+    } else {
    else
    {
        buf1_.Init(fp, hdr, mem_size);
        pi_ = &buf1_;
    }
 }
 // 读取数据
-int AsyncIoBamBuf::ReadBam()
+int AsyncIoBamBuf::ReadBam() {
-{
+    if (use_async_io_) {
    if (use_async_io_)
    {
        hasThread = true;
        return async_read_bam();
-    }
+    } else {
    else
    {
        return sync_read_bam();
    }
 }
-int AsyncIoBamBuf::sync_read_bam()
+int AsyncIoBamBuf::sync_read_bam() {
 {
    int read_num = 0;
-    if (clear_all_)
+    if (clear_all_) {
    {
        clear_all_ = false;
        pi_->ClearAll();
-    }
+    } else if (clear_before_idx_ > 0) {
    else if (clear_before_idx_ > 0) 
    {
        pi_->ClearBeforeIdx(clear_before_idx_);
        clear_before_idx_ = 0;
    }
@ -217,23 +170,18 @@ int AsyncIoBamBuf::sync_read_bam()
    return read_num;
 }
-int AsyncIoBamBuf::async_read_bam()
+int AsyncIoBamBuf::async_read_bam() {
 {
    int read_num = 0;
-    if (first_read_)
+    if (first_read_) {
    {
        read_num = pi_->ReadBam();
        first_read_ = false;
        refresh_bam_arr();
-    }
+    } else {
    else
    {
        // join, 交换缓冲区指针
        pthread_join(*tid_, 0);
        resize_buf();
-        if (need_read_)
+        if (need_read_) {  // 需要交换指针
        { // 需要交换指针
            BamBuf *tmp = pi_;
            pi_ = po_;
            po_ = tmp;
@ -246,72 +194,52 @@ int AsyncIoBamBuf::async_read_bam()
    return read_num;
 }
-void *AsyncIoBamBuf::async_read(void *data)
+void *AsyncIoBamBuf::async_read(void *data) {
 {
    AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data;
-    if (ab->need_read_ && ab->ReadStat() >= 0)
+    if (ab->need_read_ && ab->ReadStat() >= 0) {  // 需要读取
    { // 需要读取
        ab->last_read_num_ = ab->po_->ReadBam();
-    }
+    } else {
    else
    {
        ab->last_read_num_ = 0;
    }
    pthread_exit(0);
 }
-// 为下一次读取做准备, 计算一些边界条件，延迟操作，因为此时可能po_对应的buf正在读取
+// 为下一次读取做准备,
-void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv)
+// 计算一些边界条件，延迟操作，因为此时可能po_对应的buf正在读取
-{
+void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) {
    clear_before_idx_ = idxInBv;
 }
 // 清空上一次所有读入的数据，延迟操作，因为此时可能po_对应的buf正在读取
-void AsyncIoBamBuf::ClearAll()
+void AsyncIoBamBuf::ClearAll() { clear_all_ = true; }
 {
    clear_all_ = true;
 }
-inline void AsyncIoBamBuf::resize_buf()
+inline void AsyncIoBamBuf::resize_buf() {
-{
+    if (clear_all_) {  // 清理上一轮的数据
    if (clear_all_)
    { // 清理上一轮的数据
        clear_all_ = false;
        po_->ClearBeforeIdx(legacy_size_);
        pi_->ClearAll();
-        if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
+        if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存
        {
            last_read_num_ += 1;
            legacy_size_ = pi_->Size();  // 应该只有一个read
            need_read_ = true;
-        }
+        } else { // 没空间存放，则不交换指针，或者文件已经读取完毕
        else // 没空间存放，则不交换指针，或者文件已经读取完毕
        {
            legacy_size_ = 0;
            need_read_ = false;
        }
-    }
+    } else if (clear_before_idx_ > 0) {
-    else if (clear_before_idx_ > 0)
+        if (clear_before_idx_ < legacy_size_) {
    {
        if (clear_before_idx_ < legacy_size_)
        {
            po_->ClearBeforeIdx(clear_before_idx_);
            legacy_size_ -= clear_before_idx_;
            // 不需要交换指针，不需要读取
            need_read_ = false;
-        }
+        } else {
        else
        {
            po_->ClearBeforeIdx(legacy_size_);
            pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_);
-            if (pi_->handle_last_read()) // 上次读取有一个read没放入缓存
+            if (pi_->handle_last_read()) {// 上次读取有一个read没放入缓存
            {
                last_read_num_ += 1;
                legacy_size_ = pi_->Size();  // 应该只有一个read
                need_read_ = true;
-            }
+            } else { // 没空间存放，则不交换指针，或者文件已经读取完毕
            else // 没空间存放，则不交换指针，或者文件已经读取完毕
            {
                legacy_size_ = 0;
                need_read_ = false;
            }
--- a/src/common/hts/bam_buf.h
+++ b/src/common/hts/bam_buf.h
@ -10,19 +10,18 @@
 #ifndef BAM_BUF_H_
 #define BAM_BUF_H_
-#include <vector>
+#include <htslib/sam.h>
-#include <stdint.h>
+#include <pthread.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <functional>
 #include <pthread.h>
 #include <fstream>
 #include <functional>
 #include <iostream>
 #include <sstream>
-#include <fstream>
+#include <vector>
 #include <htslib/sam.h>
 #include "bam_wrap.h"
@ -32,12 +31,12 @@ using namespace std;
 /*
 * 存放读入的bam数据
 */
-struct BamBuf
+struct BamBuf {
 {
    sam_hdr_t *hdr;          // sam文件的header信息
    samFile *fp;             // sam文件指针
    BamWrap *bw = nullptr;   // 用来循环读入bam
-    uint8_t *mem = nullptr;   // 用来存放bam的数据, 程序结束后自动释放，所以没在析构函数里释放
+    uint8_t *mem = nullptr;  // 用来存放bam的数据,
                             // 程序结束后自动释放，所以没在析构函数里释放
    int64_t mem_offset = 0;  // 下一次要存放的位置
    int64_t mem_size;        // 缓存大小
    int read_stat_ = 0;      // 读取状态，是否读完
@ -47,9 +46,7 @@ struct BamBuf
    bool handle_last = false;  // 上次最后读入的bam是否需要处理
    // 初始化缓存
-    void Init(samFile *fp,
+    void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
              sam_hdr_t *hdr,
              int64_t mem_size);
    // 读取数据直到读完，或者缓冲区满
    int ReadBam();
    // 为下一次读取做准备, 计算一些边界条件
@ -57,27 +54,24 @@ struct BamBuf
    // 清空上一次所有读入的数据
    void ClearAll();
    inline int64_t Size() { return bv.size(); }  // 包含多少个read
-    inline int ReadStat() { return read_stat_; } // 文件的读取状态，是否可读（读取完全）
+    inline int ReadStat() {
-    ~BamBuf()
+        return read_stat_;
-    {
+    }  // 文件的读取状态，是否可读（读取完全）
-        if (this->mem != nullptr)
+    ~BamBuf() {
-        {
+        if (this->mem != nullptr) {
            free(this->mem);
        }
-        if (this->bw != nullptr) 
+        if (this->bw != nullptr) {
        {
            bam_destroy1(bw->b);
            free(bw);
        }
    }
    void FreeMemory()  // 释放开辟的内存
    {
-        if (this->mem != nullptr)
+        if (this->mem != nullptr) {
        {
            free(this->mem);
        }
-        if (this->bw != nullptr)
+        if (this->bw != nullptr) {
        {
            bam_destroy1(bw->b);
            free(bw);
        }
@ -102,31 +96,31 @@ struct BamBuf
 /*
 * io异步缓冲区
 */
-struct AsyncIoBamBuf
+struct AsyncIoBamBuf {
 {
    BamBuf buf1_;
    BamBuf buf2_;
    BamBuf *pi_;  // 当前用的buf
    BamBuf *po_;  // 后台在读取的buf
    pthread_t *tid_ = NULL;
    bool hasThread = false;
-    int64_t legacy_size_ = 0; // 上一轮运算之后，缓存中还剩余的上次读取的read数量
+    int64_t legacy_size_ =
        0;  // 上一轮运算之后，缓存中还剩余的上次读取的read数量
    bool first_read_ = true;
    int last_read_num_ = 0;  // 上一次读取了多少reads
    bool need_read_ = true;
    bool use_async_io_ = true;
-    int64_t clear_before_idx_ = 0; // 用户异步读取，下一轮读取之前清理掉clear_before_idx_之前的所有reads
+    int64_t clear_before_idx_ =
-    bool clear_all_ = false;       // 用于异步读取，下一轮读取之前清理掉之前的所有reads
+        0;  // 用户异步读取，下一轮读取之前清理掉clear_before_idx_之前的所有reads
    bool clear_all_ =
        false;  // 用于异步读取，下一轮读取之前清理掉之前的所有reads
    vector<BamWrap *> bam_arr_;  // 用来访问buf中的bam
    AsyncIoBamBuf() {}
    AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {}
    // 析构
-    ~AsyncIoBamBuf()
+    ~AsyncIoBamBuf() {
-    {
+        if (tid_ != NULL) {
        if (tid_ != NULL)
        {
            if (hasThread)
                pthread_join(*tid_, 0);
            free(tid_);
@ -136,9 +130,7 @@ struct AsyncIoBamBuf
    }
    // 初始化缓存
-    void Init(samFile *fp,
+    void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size);
              sam_hdr_t *hdr,
              int64_t mem_size);
    // 读取数据
    int ReadBam();
@ -150,21 +142,17 @@ struct AsyncIoBamBuf
    // 包含的read数量
    inline int64_t Size() { return legacy_size_ + pi_->Size(); }
    inline int ReadStat() { return pi_->read_stat_; }
-    inline BamWrap *operator[](int64_t pos)
+    inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; }
    {
        return bam_arr_[pos];
    }
    // 获取某一段reads
-    inline vector<BamWrap*> Slice(size_t startIdx, size_t endIdx)
+    inline vector<BamWrap *> Slice(size_t startIdx, size_t endIdx) {
    {
        if (endIdx > startIdx) {
            auto begItr = bam_arr_.begin();
-            return std::move(vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
+            return std::move(
                vector<BamWrap *>(begItr + startIdx, begItr + endIdx));
        }
        return std::move(vector<BamWrap *>());
    }
-    void FreeMemory()
+    void FreeMemory() {
    {
        buf1_.FreeMemory();
        buf2_.FreeMemory();
    }
@ -176,11 +164,9 @@ struct AsyncIoBamBuf
    // 异步读取线程函数
    static void *async_read(void *data);
    void resize_buf();
-    inline void refresh_bam_arr()
+    inline void refresh_bam_arr() {
    {
        bam_arr_.resize(this->Size());
-        for (int i = 0; i < bam_arr_.size(); ++i)
+        for (int i = 0; i < bam_arr_.size(); ++i) {
        {
            if (i < legacy_size_)
                bam_arr_[i] = (*po_)[i];
            else
--- a/src/common/hts/bam_wrap.h
+++ b/src/common/hts/bam_wrap.h
@ -9,15 +9,14 @@
 #ifndef BAM_WRAP_H_
 #define BAM_WRAP_H_
-#include <map>
+#include <htslib/sam.h>
 #include <string>
 #include <vector>
 #include <sstream>
 #include <limits.h>
 #include <math.h>
-#include <htslib/sam.h>
+#include <map>
 #include <sstream>
 #include <string>
 #include <vector>
 using namespace std;
@ -28,10 +27,9 @@ using namespace std;
 /*
 * sam read的封装
 */
-struct BamWrap
+struct BamWrap {
 {
    // 将contig左移后加上pos作为全局位置
-    const static int MAX_CONTIG_LEN_SHIFT = 30;
+    const static int MAX_CONTIG_LEN_SHIFT = 40;  // 将染色体id左移多少位，和位点拼合在一起
    const static int READ_MAX_LENGTH = 200;
    const static int READ_MAX_DEPTH = 1000;  // 这只是用来初始化空间用的，深度大于这个值也没关系
@ -40,40 +38,21 @@ struct BamWrap
    int64_t end_pos_;  // bam的全局结束位置, 闭区间
    // 全局开始位置
-    inline int64_t start_pos()
+    inline int64_t start_pos() { return bam_global_pos(b); }
    {
        return bam_global_pos(b);
    }
    // 全局结束位置
-    inline int64_t end_pos()
+    inline int64_t end_pos() { return end_pos_; }
    {
        return end_pos_;
    }
    // 和reference对应的序列长度
-    inline int16_t read_len()
+    inline int16_t read_len() { return (end_pos_ - start_pos() + 1); }
    {
        return (end_pos_ - start_pos() + 1);
    }
    // 在contig内的开始位置
-    inline int32_t contig_pos()
+    inline int32_t contig_pos() { return b->core.pos; }
    {
        return b->core.pos;
    }
    // 在contig内部的结束位置
-    inline int32_t contig_end_pos()
+    inline int32_t contig_end_pos() { return bam_pos(end_pos_); }
    {
        return bam_pos(end_pos_);
    }
    // 序列的长度（AGTC字母个数）
-    inline int16_t seq_len()
+    inline int16_t seq_len() { return b->core.l_qseq; }
    {
        return b->core.l_qseq;
    }
    // 算上开头的softclip
-    inline int32_t softclip_start()
+    inline int32_t softclip_start() {
    {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        const char c = bam_cigar_opchr(cigar[0]);
@ -84,8 +63,7 @@ struct BamWrap
    }
    // 算上结尾的softclip
-    inline int32_t softclip_end()
+    inline int32_t softclip_end() {
    {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
@ -96,8 +74,7 @@ struct BamWrap
    }
    // 算上结尾的softclip
-    inline int32_t right_softclip_len()
+    inline int32_t right_softclip_len() {
    {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
@ -108,14 +85,12 @@ struct BamWrap
    }
    // 获取序列
-    inline std::string sequence()
+    inline std::string sequence() {
    {
        ostringstream oss;
        char *seq = (char *)bam_get_seq(b);
        const bam1_core_t &bc = b->core;
        const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
-        for (int i = 0; i < bc.l_qseq; ++i)
+        for (int i = 0; i < bc.l_qseq; ++i) {
        {
            char base = base_to_char[bam_seqi(seq, i)];
            oss << base;
        }
@ -123,18 +98,13 @@ struct BamWrap
    }
    // 获取名字
-    inline const char *query_name()
+    inline const char *query_name() { return bam_get_qname(b); }
    {
        return bam_get_qname(b);
    }
    // 获取cigar 字符串
-    inline string cigar_str()
+    inline string cigar_str() {
    {
        ostringstream oss;
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            oss << len << c;
@ -143,21 +113,14 @@ struct BamWrap
    }
    // 占用的内存大小
-    inline int16_t length()
+    inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; }
    {
        return sizeof(*this) +
               sizeof(bam1_t) +
               b->l_data;
    }
    // 获取cigar中insert的总长度
-    inline int32_t insert_cigar_len()
+    inline int32_t insert_cigar_len() {
    {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int ret = 0;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            if (c == 'I')
@ -167,13 +130,11 @@ struct BamWrap
    }
    // 获取cigar中delete的总长度
-    inline int32_t del_cigar_len()
+    inline int32_t del_cigar_len() {
    {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int ret = 0;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            if (c == 'D')
@ -183,13 +144,11 @@ struct BamWrap
    }
    // 计算sam read的终点位置
-    static inline int64_t BamEndPos(const bam1_t *b)
+    static inline int64_t BamEndPos(const bam1_t *b) {
    {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int start_offset = -1;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X')
@ -198,31 +157,22 @@ struct BamWrap
        return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset));
    };
-    bool HasWellDefinedFragmentSize()
+    bool HasWellDefinedFragmentSize() {
    {
        const bam1_core_t &bc = b->core;
        bool hasWellDefinedFragmentSize = true;
-        if (bc.isize == 0 ||
+        if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
-            !(bc.flag & BAM_FPAIRED) ||
+            ((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) {
            ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) ||
            ((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE)))
        {
            hasWellDefinedFragmentSize = false;
-        }
+        } else if (bc.flag & BAM_FREVERSE) {
        else if (bc.flag & BAM_FREVERSE)
        {
            hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false;
-        }
+        } else {
        else
        {
            hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false;
        }
        return hasWellDefinedFragmentSize;
    }
    // 计算bam的adapterBoundary
-    int GetAdapterBoundary()
+    int GetAdapterBoundary() {
    {
        const bam1_core_t &bc = b->core;
        int adapterBoundary;
        if (!HasWellDefinedFragmentSize())
@ -235,34 +185,29 @@ struct BamWrap
    }
    // 获取开头的I的长度
-    inline int GetHeadInsertLen()
+    inline int GetHeadInsertLen() {
    {
        int insLen = 0;
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
-            if (c == 'I')
+            if (c == 'I') {
            {
                insLen = len;
                break;
-            }
+            } else if (c != 'H' && c != 'S')
            else if (c != 'H' && c != 'S')
                break;
        }
        return insLen;
    }
-    // 获取soft clip开始位置(能处理H和S相连的情况，有这种情况么？, 注意开头的I要当做S？)
+    // 获取soft clip开始位置(能处理H和S相连的情况，有这种情况么？,
-    inline int64_t GetSoftStart()
+    // 注意开头的I要当做S？)
-    {
+    inline int64_t GetSoftStart() {
        int64_t softStart = b->core.pos;
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            if (c == 'S' || c == 'I')
@ -274,13 +219,11 @@ struct BamWrap
    }
    // 获取unclipped开始位置(包括hardclip)
-    inline int64_t GetUnclippedStart()
+    inline int64_t GetUnclippedStart() {
    {
        int64_t start = b->core.pos;
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            if (c == 'S' || c == 'H')
@ -292,13 +235,11 @@ struct BamWrap
    }
    // 获取unclipped结束位置(包括hardclip)
-    inline int64_t GetUnclippedEnd()
+    inline int64_t GetUnclippedEnd() {
    {
        int64_t end_pos = bam_endpos(b);
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        for (int i = bc.n_cigar - 1; i >= 0; --i)
+        for (int i = bc.n_cigar - 1; i >= 0; --i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            if (c == 'S' || c == 'H')
@ -311,12 +252,10 @@ struct BamWrap
    /* 获取碱基质量分数的加和 */
    /** Calculates a score for the read which is the sum of scores over Q15. */
-    inline int GetSumOfBaseQualities()
+    inline int GetSumOfBaseQualities() {
    {
        int score = 0;
        uint8_t *qual = bam_get_qual(b);
-        for (int i = 0; i < b->core.l_qseq; ++i)
+        for (int i = 0; i < b->core.l_qseq; ++i) {
        {
            if (qual[i] >= 15)
                score += qual[i];
        }
@ -327,93 +266,63 @@ struct BamWrap
    /* 与flag相关的检测 */
    /* 没有比对上 unmapped */
-    inline bool GetReadUnmappedFlag() 
+    inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; }
    {
        return b->core.flag & BAM_FUNMAP;
    }
    /* Template having multiple segments in sequencing  */
-    inline bool GetReadPairedFlag()
+    inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; }
    {
        return b->core.flag & BAM_FPAIRED;
    }
    /**
     * the read fails platform/vendor quality checks.
     */
-    inline bool GetReadFailsVendorQualityCheckFlag()
+    inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; }
    {
        return b->core.flag & BAM_FQCFAIL;
    }
    /**
     * the mate is unmapped.
     */
-    bool GetMateUnmappedFlag()
+    bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; }
    {
        return b->core.flag & BAM_FMUNMAP;
    }
    /**
-     * @return whether the alignment is secondary (an alternative alignment of the read).
+     * @return whether the alignment is secondary (an alternative alignment of
     * the read).
     */
-    bool IsSecondaryAlignment()
+    bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; }
    {
        return b->core.flag & BAM_FSECONDARY;
    }
    /**
-     * @return whether the alignment is supplementary (a split alignment such as a chimeric alignment).
+     * @return whether the alignment is supplementary (a split alignment such as
     * a chimeric alignment).
     */
-    bool GetSupplementaryAlignmentFlag()
+    bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; }
    {
        return b->core.flag & BAM_FSUPPLEMENTARY;
    }
    /*
-     * Tests if this record is either a secondary and/or supplementary alignment;
+     * Tests if this record is either a secondary and/or supplementary
     * alignment;
     */
-    bool IsSecondaryOrSupplementary()
+    bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); }
    {
        return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag();
    }
    /**
     * the read is the first read in a pair.
     */
-    bool GetFirstOfPairFlag()
+    bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; }
    {
        return b->core.flag & BAM_FREAD1;
    }
    /**
     * strand of the query (false for forward; true for reverse strand).
     */
-    bool GetReadNegativeStrandFlag()
+    bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; }
    {
        return b->core.flag & BAM_FREVERSE;
    }
    /**
     * strand of the mate (false for forward; true for reverse strand).
     */
-    bool GetMateNegativeStrandFlag()
+    bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; }
    {
        return b->core.flag & BAM_FMREVERSE;
    }
    /* 其他的一些信息 */
-    inline int GetReferenceLength()
+    inline int GetReferenceLength() {
    {
        int length = 0;
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
-        for (int i = 0; i < bc.n_cigar; ++i)
+        for (int i = 0; i < bc.n_cigar; ++i) {
        {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
-            switch (c)
+            switch (c) {
            {
            case 'M':
            case 'D':
            case 'N':
@ -429,24 +338,20 @@ struct BamWrap
    }
    // 计算bam的全局位置，算上染色体序号和比对位置
-    static inline int64_t bam_global_pos(bam1_t *b)
+    static inline int64_t bam_global_pos(bam1_t *b) {
    {
        return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos);
    }
-    static inline int64_t bam_global_pos(int tid, int pos)
+    static inline int64_t bam_global_pos(int tid, int pos) {
    {
        return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos);
    }
    // 根据全局位置获取bam的染色体序号
-    static inline int32_t bam_tid(int64_t global_pos)
+    static inline int32_t bam_tid(int64_t global_pos) {
    {
        const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1);
        const int64_t high_tid = global_pos & mask;
        return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT);
    }
    // 根据全局位置获取bam的比对位置(染色体内)
-    static inline int32_t bam_pos(int64_t global_pos)
+    static inline int32_t bam_pos(int64_t global_pos) {
    {
        const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1;
        return (int32_t)(global_pos & mask);
    }
--- a/src/common/utils/debug.cpp
+++ b/src/common/utils/debug.cpp
--- a/src/common/utils/debug.h
+++ b/src/common/utils/debug.h
--- a/src/common/utils/global_arg.cpp
+++ b/src/common/utils/global_arg.cpp
@ -23,8 +23,7 @@ using std::vector;
 struct option *GlobalArg::GLOBAL_OPT = nullptr;
 // 初始化参数
-void GlobalArg::initGlobalOptions()
+void GlobalArg::initGlobalOptions() {
 {
    vector<struct option> v;
    v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
    v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
@ -51,11 +50,9 @@ void GlobalArg::initGlobalOptions()
 }
 // 解析参数
-void GlobalArg::parseArgument(int argNum)
+void GlobalArg::parseArgument(int argNum) {
 {
    using namespace ns_ga;
-    switch (argNum)
+    switch (argNum) {
    {
    case OPT_INPUT:
        in_fn = optarg;
        break;
@ -65,8 +62,7 @@ void GlobalArg::parseArgument(int argNum)
    case OPT_NUM_THREADS:
        num_threads = std::stoi(optarg);
        break;
-        case OPT_MAX_MEM:
+    case OPT_MAX_MEM: {
        {
        char *q;
        size_t mem_arg = strtol(optarg, &q, 0);
        if (*q == 'k' || *q == 'K')
@ -77,14 +73,12 @@ void GlobalArg::parseArgument(int argNum)
            mem_arg <<= 30;
        if (mem_arg >= max_mem)
            max_mem = mem_arg;
-            else
+        else {
            {
            std::cerr << "[Warn] Too small mem size, use default" << std::endl;
        }
        break;
    }
-        case OPT_LOG_LEVEL:
+    case OPT_LOG_LEVEL: {
        {
        if (strcmp("ERROR", optarg) == 0)
            verbosity = ns_ga::ERROR;
        else if (strcmp("WARNING", optarg) == 0)
@ -95,8 +89,7 @@ void GlobalArg::parseArgument(int argNum)
            verbosity = ns_ga::DEBUG;
        break;
    }
-        case OPT_ASYNCIO:
+    case OPT_ASYNCIO: {
        {
        if (strcmp("true", optarg) == 0)
            use_asyncio = true;
        else if (strcmp("false", optarg) == 0)
--- a/src/common/utils/global_arg.h
+++ b/src/common/utils/global_arg.h
@ -9,19 +9,19 @@ Date : 2023/10/23
 #ifndef GLOBAL_ARG_H_
 #define GLOBAL_ARG_H_
 #include <string>
 #include <map>
 #include <vector>
 #include <getopt.h>
 #include <stdio.h>
 #include <map>
 #include <string>
 #include <vector>
 using std::map;
 using std::string;
 using std::vector;
 namespace ns_ga {
-    enum GlobalOptEnum
+enum GlobalOptEnum {
    {
    _START_NUM = 1,
    OPT_INPUT,
    OPT_OUTPUT,
@ -32,28 +32,23 @@ namespace ns_ga {
    OPT_VERSION,
    OPT_HELP,
    _END_NUM
-    };
+};
-    // log level
+// log level
-    enum LogLevelEnum
+enum LogLevelEnum { ERROR, WARNING, INFO, DEBUG };
-    {
+}  // namespace ns_ga
        ERROR,
        WARNING,
        INFO,
        DEBUG
    };
 }
 /* 全局共享的一些参数 */
-struct GlobalArg
+struct GlobalArg {
-{
+    const static int GLOBAL_ARG_CNT =
-    const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
+        ns_ga::GlobalOptEnum::_END_NUM -
        ns_ga::GlobalOptEnum::_START_NUM;  // 这里不需要减1
    static struct option *GLOBAL_OPT;
    string in_fn;                                 // input bam filename
    string out_fn;                                // output bam filename
    int num_threads = 1;                          // 线程个数
-    size_t max_mem = ((size_t)2) << 30; // 最小2G
+    size_t max_mem = ((size_t)1) << 30;           // 最小1G
    ns_ga::LogLevelEnum verbosity = ns_ga::INFO;  // 打印信息级别
    bool use_asyncio = true;                      // 是否使用异步io
@ -64,8 +59,7 @@ struct GlobalArg
    GlobalArg &operator=(const GlobalArg &) = delete;
    // 获取单例
-    static GlobalArg &Instance()
+    static GlobalArg &Instance() {
    {
        static GlobalArg instance;
        return instance;
    }
@ -76,8 +70,7 @@ struct GlobalArg
    void parseArgument(int argNum);
    // 获取对应参数在数组（option和help info）中的索引
-    int getArgIndx(ns_ga::GlobalOptEnum opt)
+    int getArgIndx(ns_ga::GlobalOptEnum opt) {
    {
        return opt - ns_ga::GlobalOptEnum::OPT_INPUT;
    }
@ -95,11 +88,9 @@ struct GlobalArg
        printf("--verbosity   = %d\n", verbosity);
        printf("--asyncio     = %d\n", use_asyncio);
    }
-private : 
+
-    GlobalArg()
+private:
-    {
+    GlobalArg() { initGlobalOptions(); };
        initGlobalOptions();
    };
 };
 #endif
--- a/src/common/utils/murmur3.h
+++ b/src/common/utils/murmur3.h
@ -7,36 +7,34 @@ Author : Zhang Zhonghai
 Date : 2023/11/6
 */
 #include <string>
 #include <random>
 #include <string>
 using std::string;
 /**
- * Provides an implementation of the Murmur3_32 hash algorithm that has desirable properties in terms of randomness
+ * Provides an implementation of the Murmur3_32 hash algorithm that has
- * and uniformity of the distribution of output values that make it a useful hashing algorithm for downsampling.
+ * desirable properties in terms of randomness and uniformity of the
 * distribution of output values that make it a useful hashing algorithm for
 * downsampling.
 */
-struct Murmur3
+struct Murmur3 {
 {
    int seed_ = 0;
    /** Hashes a character stream to an int using Murmur3. */
-    int HashUnencodedChars(const string &input)
+    int HashUnencodedChars(const string &input) {
    {
        int h1 = this->seed_;
        // step through the CharSequence 2 chars at a time
        const int length = input.size();
-        for (int i = 1; i < length; i += 2)
+        for (int i = 1; i < length; i += 2) {
        {
            int k1 = input.at(i - 1) | (input.at(i) << 16);
            k1 = mixK1(k1);
            h1 = mixH1(h1, k1);
        }
        // deal with any remaining characters
-        if ((length & 1) == 1)
+        if ((length & 1) == 1) {
        {
            int k1 = input.at(length - 1);
            k1 = mixK1(k1);
            h1 ^= k1;
@ -45,14 +43,12 @@ struct Murmur3
        return fmix(h1, 2 * length);
    }
-    static Murmur3 &Instance()
+    static Murmur3 &Instance() {
    {
        static Murmur3 instance;
        return instance;
    }
-    static int mixK1(int k1)
+    static int mixK1(int k1) {
    {
        const int c1 = 0xcc9e2d51;
        const int c2 = 0x1b873593;
        k1 *= c1;
@ -61,8 +57,7 @@ struct Murmur3
        return k1;
    }
-    static int mixH1(int h1, int k1)
+    static int mixH1(int h1, int k1) {
    {
        h1 ^= k1;
        h1 = h1 << 13;
        h1 = h1 * 5 + 0xe6546b64;
@ -70,8 +65,7 @@ struct Murmur3
    }
    // Finalization mix - force all bits of a hash block to avalanche
-    static int fmix(int h1, int length)
+    static int fmix(int h1, int length) {
    {
        h1 ^= length;
        h1 ^= (unsigned int)h1 >> 16;
        h1 *= 0x85ebca6b;
@ -82,8 +76,7 @@ struct Murmur3
    }
 private:
-    Murmur3() 
+    Murmur3() {
    {
        auto &&rd = std::random_device{};
        seed_ = rd();
    }
--- a/src/common/utils/profiling.cpp
+++ b/src/common/utils/profiling.cpp
--- a/src/common/utils/profiling.h
+++ b/src/common/utils/profiling.h
--- a/src/sam/markdups/dup_metrics.h
+++ b/src/sam/markdups/dup_metrics.h
@ -0,0 +1,70 @@
 #pragma once
 #include <string>
 #include <stdint.h>
 using std::string;
 /*
 * 标记冗余过程中的一些数据统计
 */
 struct DuplicationMetrics {
    /**
     * The library on which the duplicate marking was performed.
     */
    string LIBRARY = "";
    /**
     * The number of mapped reads examined which did not have a mapped mate pair,
     * either because the read is unpaired, or the read is paired to an unmapped mate.
     */
    uint64_t UNPAIRED_READS_EXAMINED = 0;
    /**
     * The number of mapped read pairs examined. (Primary, non-supplemental)
     */
    uint64_t READ_PAIRS_EXAMINED = 0;
    /**
     * The number of reads that were either secondary or supplementary
     */
    uint64_t SECONDARY_OR_SUPPLEMENTARY_RDS = 0;
    /**
     * The total number of unmapped reads examined. (Primary, non-supplemental)
     */
    uint64_t UNMAPPED_READS = 0;
    /**
     * The number of fragments that were marked as duplicates.
     */
    uint64_t UNPAIRED_READ_DUPLICATES = 0;
    /**
     * The number of read pairs that were marked as duplicates.
     */
    uint64_t READ_PAIR_DUPLICATES = 0;
    /**
     * The number of read pairs duplicates that were caused by optical duplication.
     * Value is always < READ_PAIR_DUPLICATES, which counts all duplicates regardless of source.
     */
    uint64_t READ_PAIR_OPTICAL_DUPLICATES = 0;
    /**
     * The fraction of mapped sequence that is marked as duplicate.
     */
    double PERCENT_DUPLICATION = 0.0;
    /**
     * The estimated number of unique molecules in the library based on PE duplication.
     */
    uint64_t ESTIMATED_LIBRARY_SIZE = 0;
    // 其他的统计数据
    // addSingletonToCount需要记录的数据
    uint64_t DuplicateCountHist = 0;
    uint64_t NonOpticalDuplicateCountHist = 0;
    // track optical duplicates 需要记录的数据
    uint64_t OpticalDuplicatesCountHist = 0;
    uint64_t OpticalDuplicatesByLibraryId = 0;
    // 统计相关的函数
    void AddSingletonToCount() { 
        ++this->DuplicateCountHist;
        ++this->NonOpticalDuplicateCountHist;
    }
 };
--- a/src/sam/markdups/markdups.cpp
+++ b/src/sam/markdups/markdups.cpp
@ -1,75 +1,74 @@
 /*
-Description: 标记bam文件中的冗余信息，只处理按照坐标排序后的bam，且bam为单一样本数据
+Description:
 标记bam文件中的冗余信息，只处理按照坐标排序后的bam，且bam为单一样本数据
 Copyright : All right reserved by ICT
 Author : Zhang Zhonghai
 Date : 2023/10/23
 */
 #include "markdups_arg.h"
 // 有太多define冲突，放到最后include
 #include <common/hts/bam_buf.h>
 #include <common/utils/global_arg.h>
 #include <common/utils/murmur3.h>
 #include <common/utils/thpool.h>
 #include <common/utils/timer.h>
 #include <common/utils/util.h>
 #include <common/utils/murmur3.h>
 #include <common/utils/yarn.h>
 #include <htslib/sam.h>
 #include <htslib/thread_pool.h>
 #include <sam/utils/read_ends.h>
 #include <sam/utils/read_name_parser.h>
 #include <htslib/sam.h>
 #include <htslib/thread_pool.h>
 #include <iostream>
 #include <vector>
 #include <set>
 #include <queue>
 #include <set>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "markdups_arg.h"
 #include "md_funcs.h"
 #include "parallel_md.h"
 #include "serial_md.h"
 #include "shared_args.h"
 #include "dup_metrics.h"
 using namespace std;
 using std::cout;
 #define SMA_TAG_PG "PG"
-#define BAM_BLOCK_SIZE 2 * 1024 * 1024
+#define BAM_BLOCK_SIZE 16L * 1024 * 1024
 #define NO_SUCH_INDEX INT64_MAX
-static Timer tm_arr[20]; // 用来测试性能
+Timer tm_arr[20];  // 用来测试性能
 /* 全局本地变量 */
-static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
+vector<ReadNameParser> g_vRnParser;  // 每个线程一个read name parser
-static samFile *g_inBamFp;                 // 输入的bam文件
+samFile *g_inBamFp;                  // 输入的bam文件
-static sam_hdr_t *g_inBamHeader;           // 输入的bam文件头信息
+sam_hdr_t *g_inBamHeader;            // 输入的bam文件头信息
-static samFile *g_outBamFp = nullptr;      // 输出文件, sam或者bam格式
+samFile *g_outBamFp = nullptr;       // 输出文件, sam或者bam格式
-static sam_hdr_t *g_outBamHeader;          // 输出文件的header
+sam_hdr_t *g_outBamHeader;           // 输出文件的header
 /* 参数对象作为全局对象，免得多次作为参数传入函数中 */
-static GlobalArg &g_gArg = GlobalArg::Instance();
+GlobalArg &g_gArg = GlobalArg::Instance();
-static MarkDupsArg g_mdArg;
+static MarkDupsArg g_mdArg_;
-
+MarkDupsArg &g_mdArg = g_mdArg_;
-
+static GlobalDataArg gData_;
-#include "md_funcs.h"
+GlobalDataArg &gData = gData_;
-#include "serial_md.h"
+DuplicationMetrics gMetrics_;
-#include "parallel_md.h"
+DuplicationMetrics &gMetrics = gMetrics_;
 /*
- * mark duplicate 入口，假定bam是按照比对后的坐标排序的，同一个样本的话不需要考虑barcode的问题
+ * mark duplicate
 * 入口，假定bam是按照比对后的坐标排序的，同一个样本的话不需要考虑barcode的问题
 */
-int MarkDuplicates(int argc, char *argv[])
+int MarkDuplicates(int argc, char *argv[]) {
 {
    Timer::log_time("程序开始");
    Timer time_all;
    /* 读取命令行参数 */
    g_mdArg.parseArgument(argc, argv, &g_gArg);  // 解析命令行参数
-    if (g_gArg.num_threads < 1) // 线程数不能小于1
+    if (g_gArg.num_threads < 1)
-        g_gArg.num_threads = 1;
+        g_gArg.num_threads = 1;  // 线程数不能小于1
    /* 初始化一些参数和变量*/
    g_vRnParser.resize(g_gArg.num_threads);
@ -78,23 +77,23 @@ int MarkDuplicates(int argc, char *argv[])
    /* 打开输入bam文件 */
    g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
-    if (!g_inBamFp)
+    if (!g_inBamFp) {
    {
        Error("[%s] load sam/bam file failed.\n", __func__);
        return -1;
    }
    hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    g_inBamHeader = sam_hdr_read(g_inBamFp);  // 读取header
    // 获取样本名称(libraryId)
    gMetrics.LIBRARY = sam_hdr_line_name(g_inBamHeader, "RG", 0);
    /* 利用线程池对输入输出文件进行读写 */
    htsThreadPool htsPoolRead = {NULL, 0};   // 多线程读取，创建线程池
    htsThreadPool htsPoolWrite = {NULL, 0};  // 读写用不同的线程池
-    // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
+    //htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
-    htsPoolRead.pool = hts_tpool_init(16);
+    //htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
-    // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
+    htsPoolRead.pool = hts_tpool_init(32);
-    htsPoolWrite.pool = hts_tpool_init(16);
+    htsPoolWrite.pool = hts_tpool_init(32);
-    if (!htsPoolRead.pool || !htsPoolWrite.pool)
+    if (!htsPoolRead.pool || !htsPoolWrite.pool) {
    {
        Error("[%d] failed to set up thread pool", __LINE__);
        sam_close(g_inBamFp);
        return -1;
@ -106,86 +105,94 @@ int MarkDuplicates(int argc, char *argv[])
    sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
    g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
    g_outBamHeader = sam_hdr_dup(g_inBamHeader);
    // 用同样的线程池处理输出文件
    hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
-    hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
+    hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
    /* 冗余检查和标记 */
-    if (g_gArg.num_threads == 1)
+    if (g_gArg.num_threads == 1) {
    {
        serialMarkDups();  // 串行运行
-    }
+    } else {
    else
    {
        parallelMarkDups();  // 并行运行
    }
    /* 标记冗余, 将处理后的结果写入文件 */
    sam_close(g_inBamFp);  // 重新打开bam文件
    g_inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
-    if (!g_inBamFp)
+    if (!g_inBamFp) {
    {
        Error("[%s] load sam/bam file failed.\n", __func__);
        return -1;
    }
    hts_set_opt(g_inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    hts_set_opt(g_inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
    g_inBamHeader = sam_hdr_read(g_inBamFp);  // 读取header
-    if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
+    if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0) {
    {
        Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
        sam_close(g_outBamFp);
        sam_close(g_inBamFp);
        return -1;
    }
    // 输出index文件
-    string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的
+    // string indexFn = g_gArg.out_fn + ".csi";  // 现在索引都是csi格式的
-    if (sam_idx_init(g_outBamFp, g_outBamHeader, 14 /*csi*/, indexFn.c_str()) < 0)
+    string indexFn = g_gArg.out_fn + ".bai";  // min_shift = 0 是bai格式
-    {
+    int index_min_shift = 0;
    if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) {
        indexFn = g_gArg.out_fn + ".csi";
        index_min_shift = 14;
    }
    if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) {
        Error("failed to open index \"%s\" for writing", indexFn.c_str());
        sam_close(g_outBamFp);
        sam_close(g_inBamFp);
        return -1;
    }
    // 读取输入文件
-    // BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
+    // BamBufType inBuf(false);
    BamBufType inBuf(g_gArg.use_asyncio);
    inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
    DupIdxQueue idxQue;
    idxQue.Init(&gData.dupIdxArr);
    Timer tw;
-    while (inBuf.ReadStat() >= 0)
+    cout << "dupsize: " << idxQue.Size() << endl;
-    {
+    uint64_t bamIdx = 0;
    uint64_t dupIdx = idxQue.Pop();
    cout << "dup arr size: " << gData.dupIdxArr.size() << endl;
    cout << "first dup: " << dupIdx << endl;
    while (inBuf.ReadStat() >= 0) {
        Timer tw1;
        size_t readNum = inBuf.ReadBam();
        cout << "read: " << readNum << endl;
-        for (size_t i = 0; i < inBuf.Size(); ++i)
+        for (size_t i = 0; i < inBuf.Size(); ++i) {
        {
            /* 判断是否冗余 */
-            if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
+            if (bamIdx == dupIdx) {
-            {
+                // cout << "冗余" << bamIdx << endl;
                dupIdx = idxQue.Pop();
            }
            if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
                Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
                sam_close(g_outBamFp);
                sam_close(g_inBamFp);
                return -1;
            }
            ++bamIdx;
        }
        inBuf.ClearAll();
        cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
    }
-    if (sam_idx_save(g_outBamFp) < 0)
+    cout << "dupsize: " << idxQue.Size() << endl;
-    {
+    if (sam_idx_save(g_outBamFp) < 0) {
        Error("writing index failed");
        sam_close(g_outBamFp);
        sam_close(g_inBamFp);
        return -1;
    }
    cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
    /* 关闭文件，收尾清理 */
    sam_close(g_outBamFp);
    sam_close(g_inBamFp);
    cout << "      总时间: " << time_all.seconds_elapsed() << endl;
-//    cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
+    //    cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
    Timer::log_time("程序结束");
    return 0;
-}
+    }
--- a/src/sam/markdups/markdups_arg.cpp
+++ b/src/sam/markdups/markdups_arg.cpp
@ -64,6 +64,7 @@ const static struct option kMdOpts[] = {
    {"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
    {"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
    {"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
    {"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT},
    {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
 // 判断bool类型的参数
@ -220,6 +221,11 @@ void MarkDupsArg::parseArgument(int argc,
        case ns_md::CREATE_INDEX:
            setBoolArg(&CREATE_INDEX);
            break;
        case ns_md::INDEX_FORMAT:
            if (strcmp("CSI", optarg) == 0)
                INDEX_FORMAT = ns_md::IndexFormat::CSI;
            else
                INDEX_FORMAT = ns_md::IndexFormat::BAI;
        case ns_md::CREATE_MD5_FILE:
            setBoolArg(&CREATE_MD5_FILE);
            break;
@ -236,6 +242,7 @@ void MarkDupsArg::parseArgument(int argc,
 void MarkDupsArg::printArgValue()
 {
    printf("--READ_NAME_REGEX = %s\n", this->READ_NAME_REGEX.c_str());
    printf("--INDEX_FORMAT    = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi");
 }
 // 打印版本信息
@ -272,6 +279,8 @@ void MarkDupsArg::PrintHelp()
            "\n"
            "Optional Arguments:\n"
            "\n"
            "--INDEX_FORMAT <FORMAT>       Format for bam index file. Possible values: {BAI, CSI}\n"
            "\n"
            "--ADD_PG_TAG_TO_READS <Boolean>\n"
            "                              Add PG tag to each read in a SAM or BAM  Default value: true. Possible values: {true,\n"
            "                              false}\n"
--- a/src/sam/markdups/markdups_arg.h
+++ b/src/sam/markdups/markdups_arg.h
@ -21,9 +21,8 @@ using std::vector;
 class GlobalArg;
 namespace ns_md {
-    /* 用于markduplicate模块的参数，这个枚举用于getoption */
+/* 用于markduplicate模块的参数，这个枚举用于getoption */
-    enum MarkDupsArgEnum
+enum MarkDupsArgEnum {
    {
    _START_NUM = 100,
    MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
    MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
@ -55,13 +54,14 @@ namespace ns_md {
    COMPRESSION_LEVEL,
    MAX_RECORDS_IN_RAM,
    CREATE_INDEX,
    INDEX_FORMAT,
    CREATE_MD5_FILE,
    _END_NUM
-    };
+};
-    /* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */
+/* How strict to be when reading a SAM or BAM, beyond bare minimum validation.
-    enum ValidationStringency
+ */
-    {
+enum ValidationStringency {
    /**
     * Do the right thing, throw an exception if something looks wrong.
     */
@ -76,36 +76,37 @@ namespace ns_md {
    SILENT,
    DEFAULT_STRINGENCY = SILENT
-    };
+};
-    /**
+/**
-     * Enum used to control how duplicates are flagged in the DT optional tag on each read.
+ * Enum used to control how duplicates are flagged in the DT optional tag on
 * each read.
 */
-    enum DuplicateTaggingPolicy
+enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All };
    {
        DontTag,
        OpticalOnly,
        All
    };
-    /* 排序的方式 */
+/* 排序的方式 */
-    enum SortOrder
+enum SortOrder {
    {
    unsorted,
    queryname,
    coordinate,
    duplicate,  // NB: this is not in the SAM spec!
    unknown
-    };
+};
-    /* 计算reads分数的方式（比那个read得分更高） */
+/* 计算reads分数的方式（比那个read得分更高） */
-    enum ScoringStrategy
+enum ScoringStrategy {
    {
    SUM_OF_BASE_QUALITIES,
    TOTAL_MAPPED_REFERENCE_LENGTH,
    RANDOM
-    };
+};
-}
+
 /* 索引文件的格式 （bai或者csi） */
 enum IndexFormat {
    BAI,
    CSI
 };
 }  // namespace ns_md
 /* markduplicate 需要的参数*/
 struct MarkDupsArg
@ -287,6 +288,8 @@ struct MarkDupsArg
    /* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
    bool CREATE_INDEX = false;
    ns_md::IndexFormat INDEX_FORMAT = ns_md::IndexFormat::BAI;
    /* "Whether to create an MD5 digest for any BAM or FASTQ files created.  ", common = true */
    bool CREATE_MD5_FILE = false;
--- a/src/sam/markdups/md_funcs.cpp
+++ b/src/sam/markdups/md_funcs.cpp
@ -0,0 +1,456 @@
 #include "md_funcs.h"
 #include <common/hts/bam_buf.h>
 #include <common/utils/debug.h>
 #include <common/utils/murmur3.h>
 #include <common/utils/profiling.h>
 #include <common/utils/timer.h>
 #include <common/utils/util.h>
 #include <sam/utils/read_ends.h>
 #include <stdint.h>
 #include <iostream>
 #include <map>
 #include <set>
 #include <vector>
 #include <math.h>
 #include <unordered_map>
 #include "dup_metrics.h"
 #include "markdups_arg.h"
 #include "shared_args.h"
 using std::cerr;
 using std::endl;
 using std::map;
 using std::set;
 using std::unordered_map;
 using std::vector;
 /* 清除key位置的数据 */
 void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    auto &msIdx = *pmsIdx;
    if (msIdx.find(key) != msIdx.end())
        msIdx[key].clear();  // 清除该位点的冗余结果
 }
 /* 删除key位置的数据 */
 void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    auto &msIdx = *pmsIdx;
    if (msIdx.find(key) != msIdx.end())
        msIdx.erase(key);
 }
 /*
 * 计算read的分数
 */
 int16_t computeDuplicateScore(BamWrap &bw) {
    int16_t score = 0;
    switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
    case ns_md::SUM_OF_BASE_QUALITIES:
        // two (very) long reads worth of high-quality bases can go over
        // Short.MAX_VALUE/2 and risk overflow.
        score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
        break;
    case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
        if (!bw.GetReadUnmappedFlag())
            // no need to remember the score since this scoring mechanism is
            // symmetric
            score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
        break;
    case ns_md::RANDOM:
        // The RANDOM score gives the same score to both reads so that they get
        // filtered together. it's not critical do use the readName since the
        // scores from both ends get added, but it seem to be clearer this way.
        score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
        // subtract Short.MIN_VALUE/4 from it to end up with a number between
        // 0 and Short.MAX_VALUE/2. This number can be then discounted in case
        // the read is not passing filters. We need to stay far from overflow so
        // that when we add the two scores from the two read mates we do not
        // overflow since that could cause us to chose a failing read-pair
        // instead of a passing one.
        score -= INT16_MIN / 4;
    default:
        break;
    }
    // make sure that filter-failing records are heavily discounted. (the
    // discount can happen twice, once for each mate, so need to make sure we do
    // not subtract more than Short.MIN_VALUE overall.)
    score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
    return score;
 }
 /*
 * Builds a read ends object that represents a single read.
 * 用来表示一个read的特征结构
 */
 void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
    auto &k = *pKey;
    auto &bc = bw.b->core;
    k.read1FirstOfPair = bw.GetFirstOfPairFlag();
    k.read1ReferenceIndex = bc.tid;
    k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
    k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
    k.read1IndexInFile = index;
    k.score = computeDuplicateScore(bw);
    // Doing this lets the ends object know that it's part of a pair
    if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
        k.read2ReferenceIndex = bc.mtid;
    }
    // Fill in the location information for optical duplicates
    rnParser.AddLocationInformation(bw.query_name(), pKey);
    // cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
    // 计算位置key
    k.posKey =
        BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate);  // << 1 | k.orientation;
 }
 /**
 * Takes a list of ReadEndsForMarkDuplicates objects and identify the
 * representative read based on quality score. For all members of the duplicate
 * set, add the read1 index-in-file of the representative read to the records of
 * the first and second in a pair. This value becomes is used for the 'DI' tag.
 */
 void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
 /* 处理一组pairend的readends，标记冗余 */
 void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
                        DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
    if (vpRe.size() < 2) {
        if (vpRe.size() == 1) {
            // addSingletonToCount(libraryIdGenerator);
        }
        return;
    }
    // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
    auto &vDupIdx = dupIdx->AtPos(posKey);
    auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
    int maxScore = 0;
    const ReadEnds *pBest = nullptr;
    /** All read ends should have orientation FF, FR, RF, or RR **/
    for (auto pe : vpRe)  // 找分数最高的readend
    {
        if (pe->score > maxScore || pBest == nullptr) {
            maxScore = pe->score;
            pBest = pe;
        }
    }
    if (!g_mdArg.READ_NAME_REGEX.empty())  // 检查光学冗余
    {
        // trackOpticalDuplicates
    }
    for (auto pe : vpRe)  // 对非best read标记冗余
    {
        if (pe != pBest)  // 非best
        {
            vDupIdx.push_back(pe->read1IndexInFile);  // 添加read1
            if (pe->read2IndexInFile != pe->read1IndexInFile)
                vDupIdx.push_back(pe->read2IndexInFile);  // 添加read2
        }
    }
    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
        addRepresentativeReadIndex(vpRe);
    }
 }
 /* 处理一组非paired的readends，标记冗余 */
 void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
                            DupContainer<int64_t> *dupIdx) {
    auto &vDupIdx = dupIdx->AtPos(posKey);
    if (containsPairs) {
        for (auto pe : vpRe) {
            if (!pe->IsPaired()) {
                vDupIdx.push_back(pe->read1IndexInFile);
            }
        }
    } else {
        int maxScore = 0;
        const ReadEnds *pBest = nullptr;
        for (auto pe : vpRe) {
            if (pe->score > maxScore || pBest == nullptr) {
                maxScore = pe->score;
                pBest = pe;
            }
        }
        for (auto pe : vpRe) {
            if (pe != pBest) {
                vDupIdx.push_back(pe->read1IndexInFile);
            }
        }
    }
 }
 /* 处理位于某个坐标的pairend reads */
 void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
                        vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
                        DupContainer<int64_t> *opticalDupIdx) {
    if (readEnds.size() > 1) { // 有潜在的冗余
        vpCache.clear();
        //        std::sort(readEnds.begin(), readEnds.end());
        const ReadEnds *pReadEnd = nullptr;
        for (auto &re : readEnds) {
            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))  // 跟前一个一样
                vpCache.push_back(&re);  // 处理一个潜在的冗余组
            else {
                markDuplicatePairs(posKey, vpCache, dupIdx,
                                   opticalDupIdx);  // 不一样
                vpCache.clear();
                vpCache.push_back(&re);
                pReadEnd = &re;
            }
        }
        markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
    }
 }
 /* 处理位于某个坐标的 reads */
 void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
                        vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
    if (readEnds.size() > 1)  // 有潜在的冗余
    {
        vpCache.clear();
        //        std::sort(readEnds.begin(), readEnds.end());
        const ReadEnds *pReadEnd = nullptr;
        bool containsPairs = false;
        bool containsFrags = false;
        for (auto &re : readEnds) {
            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
                vpCache.push_back(&re);
                containsPairs = containsPairs || re.IsPaired();
                containsFrags = containsFrags || !re.IsPaired();
            } else {
                if (vpCache.size() > 1 && containsFrags) {
                    markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
                }
                vpCache.clear();
                vpCache.push_back(&re);
                pReadEnd = &re;
                containsPairs = re.IsPaired();
                containsFrags = !re.IsPaired();
            }
        }
        if (vpCache.size() > 1 && containsFrags) {
            markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
        }
    }
 }
 /* 对找到的pairend read end添加一些信息 */
 void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
    auto &pairedEnds = *pPairedEnds;
    int64_t bamIdx = fragEnd.read1IndexInFile;
    const int matesRefIndex = fragEnd.read1ReferenceIndex;
    const int matesCoordinate = fragEnd.read1Coordinate;
    // Set orientationForOpticalDuplicates, which always goes by the first then
    // the second end for the strands.  NB: must do this before updating the
    // orientation later.
    if (fragEnd.read1FirstOfPair) {
        pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
            fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
    } else {
        pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
            pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
    }
    // If the other read is actually later, simply add the other read's data as
    // read2, else flip the reads
    if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
        (matesRefIndex == pairedEnds.read1ReferenceIndex &&
         matesCoordinate >= pairedEnds.read1Coordinate)) {
        pairedEnds.read2ReferenceIndex = matesRefIndex;
        pairedEnds.read2Coordinate = matesCoordinate;
        pairedEnds.read2IndexInFile = bamIdx;
        pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
                                                              fragEnd.IsNegativeStrand());
        // if the two read ends are in the same position, pointing in opposite
        // directions, the orientation is undefined and the procedure above will
        // depend on the order of the reads in the file. To avoid this, we set
        // it explicitly (to FR):
        if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
            pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
            pairedEnds.orientation == ReadEnds::RF) {
            pairedEnds.orientation = ReadEnds::FR;
        }
    } else {
        pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
        pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
        pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
        pairedEnds.read1ReferenceIndex = matesRefIndex;
        pairedEnds.read1Coordinate = matesCoordinate;
        pairedEnds.read1IndexInFile = bamIdx;
        pairedEnds.orientation = ReadEnds::GetOrientationByte(
            fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
        pairedEnds.posKey = fragEnd.posKey;
    }
    pairedEnds.score += fragEnd.score;
 }
 static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
    return lhs != rhs &&  // no comparing an object to itself (checked using object identity)!
           (lhs->tile != ReadEnds::NO_VALUE) &&
           (rhs->tile != ReadEnds::NO_VALUE) &&  // no comparing objects without locations
           lhs->tile == rhs->tile &&             // and the same tile
           abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
 }
 /**
 * Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
 * one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
 * optical duplication and the rest are flagged as optical duplicates.  The set of reads that are considered
 * optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
 * read appeared in the input list of physical locations.
 *
 * @param list   a list of reads that are determined to be duplicates of one another
 * @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
 *               annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
 *               contained within the list! (always not be null!)
 * @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
 */
 static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
                           vector<bool> *pOpticalDuplicateFlags) {
    const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
    const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;
    vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
    opticalDuplicateFlags.push_back(true);
    int len = readEndsArr.size();
    // If there is only one or zero reads passed in (so there are obviously no optical duplicates),
    // or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
    // then just return an array of all false
    if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
        return;
    }
    if (len >= 4) {
        /**
         * Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
         * optical duplicates
         * getOpticalDuplicatesFlagWithGraph
         */
        // Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
        // we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
        Graph<int> opticalDistanceRelationGraph;
        unordered_map<int, vector<int>> tileRGmap;
        int keeperIndex = -1;
        for (int i = 0; i < readEndsArr.size(); ++i) {
            const ReadEnds *currentLoc = readEndsArr[i];
            if (currentLoc == pBestRe)
                keeperIndex = i;
            if (currentLoc->tile != ReadEnds::NO_VALUE) {
                int key = currentLoc->tile; // 只处理一个样本，所以只有一个read group
                tileRGmap[key].push_back(i);
            }
        }
    } else {
        /**
         * Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
         * distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
         * readgroup or 3x with the keeper present
         * getOpticalDuplicatesFlagFast
         */
        // First go through and compare all the reads to the keeper
        for (int i = 0; i < len; ++i) {
            const ReadEnds *other = readEndsArr[i];
            opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
        }
        // Now go through and do each pairwise comparison not involving the actualKeeper
        for (int i = 0; i < len; ++i) {
            const ReadEnds *lhs = readEndsArr[i];
            if (lhs == pBestRe)  // no comparisons to actualKeeper since those are all handled above
                continue;
            for (int j = i + 1; j < len; ++j) {
                const ReadEnds *rhs = readEndsArr[j];
                if (rhs == pBestRe)  // no comparisons to actualKeeper since those are all handled above
                    continue;
                if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
                    continue;  // both already marked, no need to check
                if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
                    // At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
                    // as a duplicate of the keeper (but not both - that's checked above), so be careful about which
                    // one to now mark as a duplicate.
                    int index = opticalDuplicateFlags[j] ? i : j;
                    opticalDuplicateFlags[index] = true;
                }
            }
        }
    }
 }
 /**
 * Looks through the set of reads and identifies how many of the duplicates are
 * in fact optical duplicates, and stores the data in the instance level histogram.
 *
 * We expect only reads with FR or RF orientations, not a mixture of both.
 *
 * In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
 * of the mate.  In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
 * when we order orientation by the first mate sequenced (read #1 of the pair).
 */
 static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
    vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
    // find OpticalDuplicates
    findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
    int opticalDuplicates = 0;
    for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
        if (opticalDuplicateFlags[i]) {
            ++opticalDuplicates;
            ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
            pRe->isOpticalDuplicate = true;
        }
    }
    if (opticalDuplicates > 0)
        gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
    return opticalDuplicates;
 }
 /**
 * 记录光学原因造成的冗余
 */
 void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
    bool hasFR = false, hasRF = false;
    // Check to see if we have a mixture of FR/RF
    for (auto pRe : readEndsArr) {
        if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
            hasFR = true;
        else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
            hasRF = true;
    }
    // Check if we need to partition since the orientations could have changed
    int nOpticalDup;
    if (hasFR && hasRF) { // need to track them independently
        vector<const ReadEnds *> trackOpticalDuplicatesF;
        vector<const ReadEnds *> trackOpticalDuplicatesR;
        // Split into two lists: first of pairs and second of pairs, 
        // since they must have orientation and same starting end
        for (auto pRe: readEndsArr) {
            if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
                trackOpticalDuplicatesF.push_back(pRe);
            else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
                trackOpticalDuplicatesR.push_back(pRe);
            else
                cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
        }
        // track the duplicates
        int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
        int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
        nOpticalDup = nOpticalDupF + nOpticalDupR;
    } else {  // No need to partition
        nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
    }
    // trackDuplicateCounts
    gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
    if (readEndsArr.size() > nOpticalDup)
        gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
    if (nOpticalDup)
        gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
 }
--- a/src/sam/markdups/md_funcs.h
+++ b/src/sam/markdups/md_funcs.h
@ -1,25 +1,38 @@
 #pragma once
 #include <robin-map/include/tsl/robin_map.h>
 #include <vector>
 #include <queue>
 #include <unordered_set>
 #include <unordered_map>
 using std::priority_queue;
 using std::unordered_map;
 using std::unordered_set;
 using std::vector;
 /* 前向声明 */
 class BamWrap;
 class ReadEnds;
 class ReadNameParser;
 /* 存放readend或者冗余idx，避免频繁的开辟和释放内存 */
 template <class T>
-struct DupContainer
+struct DupContainer {
 {
    vector<vector<T>> arr;  // 类似map<int64_t, set<ReadEnds>> 或 map<int64_t, set<int64_t>>
    vector<int64_t> pos;    // arr中每个元素对应的position
    // unordered_map<int64_t, int64_t> idx; // 某个位点对应在vector中的坐标
    tsl::robin_map<int64_t, int64_t> idx;  // 某个位点对应在vector中的坐标
    int64_t size = 0;                      // 实际使用的空间
    int64_t capacity = 0;                  // 内存容量
-    inline void Init()
+    inline void Init() {
    {
        idx.clear();
        size = 0;
    }
    inline void SortAtPos(int64_t p)  // 这里的pos表示位点
    {
-        if (idx.find(p) != idx.end())
+        if (idx.find(p) != idx.end()) {
        {
            const int64_t i = idx.at(p);
            std::sort(arr[i].begin(), arr[i].end());
        }
@ -28,10 +41,8 @@ struct DupContainer
    {
        std::sort(arr[i].begin(), arr[i].end());
    }
-    inline void RemoveAtPos(int64_t p)
+    inline void RemoveAtPos(int64_t p) {
-    {
+        if (idx.find(p) != idx.end()) {
        if (idx.find(p) != idx.end())
        {
            const int64_t i = idx.at(p);
            arr[i].clear();
        }
@ -40,21 +51,14 @@ struct DupContainer
    {
        arr[i].clear();
    }
-    inline void AddAtPos(int64_t p, T &val)
+    inline void AddAtPos(int64_t p, T &val) { AtPos(p).push_back(val); }
    {
        AtPos(p).push_back(val);
    }
-    inline vector<T> &AtPos(int64_t p)
+    inline vector<T> &AtPos(int64_t p) {
-    {
+        if (idx.find(p) != idx.end()) {
        if (idx.find(p) != idx.end())
        {
            const int64_t i = idx.at(p);
            return arr[i];
        }
-
+        if (size >= capacity) {
        if (size >= capacity)
        {
            capacity += 1;
            arr.push_back(vector<T>());
            pos.push_back(0);
@ -67,308 +71,131 @@ struct DupContainer
    }
 };
 /* 清除key位置的数据 */
 static inline void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
 {
    auto &msIdx = *pmsIdx;
    if (msIdx.find(key) != msIdx.end())
        msIdx[key].clear(); // 清除该位点的冗余结果
 }
-/* 删除key位置的数据 */
+/*
-static inline void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx)
+ * 优先队列，用最小堆来实现对所有冗余索引的排序
-{
+ */
-    auto &msIdx = *pmsIdx;
+struct PairArrIdIdx {
-    if (msIdx.find(key) != msIdx.end())
+    int arrId = 0;
-        msIdx.erase(key);
+    uint64_t arrIdx = 0;
-}
+    int64_t dupIdx = 0;
 };
 struct IdxGreaterThan {
    bool operator()(const PairArrIdIdx &a, const PairArrIdIdx &b) { return a.dupIdx > b.dupIdx; }
 };
 struct DupIdxQueue {
    // 将冗余索引和他对应的task vector对应起来
    // 由于是多个task来查找冗余，所以每次找到的冗余index都放在一个独立的vector中，vector之间可能有重叠，所以需要用一个最小堆来维护
    vector<vector<int64_t>> *dupIdx2DArr;
    priority_queue<PairArrIdIdx, vector<PairArrIdIdx>, IdxGreaterThan> minHeap;
    uint64_t popNum = 0;
    int Init(vector<vector<int64_t>> *_dupIdx2DArr) { 
        dupIdx2DArr = _dupIdx2DArr;
        if (dupIdx2DArr == nullptr) {
            return -1;
        }
        for (int i = 0; i < dupIdx2DArr->size(); ++i) {
            auto &v = (*dupIdx2DArr)[i];
            if (!v.empty()) {
                minHeap.push({i, 1, v[0]});
            }
        }
        return 0;
    }
    int64_t Pop() {
        int64_t ret = -1;
        if (!minHeap.empty()) {
            auto idx = minHeap.top();
            minHeap.pop();
            ++popNum;
            ret = idx.dupIdx;
            auto &v = (*dupIdx2DArr)[idx.arrId];
            if (v.size() > idx.arrIdx) {
                minHeap.push({idx.arrId, idx.arrIdx + 1, v[idx.arrIdx]});
            }
        }
        return ret;
    }
    uint64_t Size() {
        uint64_t len = 0;
        if (dupIdx2DArr != nullptr) {
            for (auto &v : *dupIdx2DArr) {
                len += v.size();
            }
        }
        return len - popNum;
    }
 };
 /*
 * 用来检测optical duplication的graph
 */
 template <class Node>
 struct Graph {         // 用set？
    unordered_set<Node> nodes; // 图中的结点
    unordered_map<Node*, unordered_set<Node*>> neighbors;  // 邻接列表
    Node *addNode(const Node &singleton) {
        if (nodes.find(singleton) == nodes.end()) {
            Node *n = const_cast<Node *>(&(*nodes.insert(singleton).first));
            neighbors[n].clear();
            return n;
        }
        return const_cast<Node *>(&(*nodes.find(singleton)));
    }
 };
 /*
 * 计算read的分数
 */
-static int16_t computeDuplicateScore(BamWrap &bw)
+int16_t computeDuplicateScore(BamWrap &bw);
 {
    int16_t score = 0;
    switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
    {
    case ns_md::SUM_OF_BASE_QUALITIES:
        // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
        // and risk overflow.
        score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
        break;
    case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
        if (!bw.GetReadUnmappedFlag())
            // no need to remember the score since this scoring mechanism is symmetric
            score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
        break;
    case ns_md::RANDOM:
        // The RANDOM score gives the same score to both reads so that they get filtered together.
        // it's not critical do use the readName since the scores from both ends get added, but it seem
        // to be clearer this way.
        score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
        // subtract Short.MIN_VALUE/4 from it to end up with a number between
        // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
        // not passing filters. We need to stay far from overflow so that when we add the two
        // scores from the two read mates we do not overflow since that could cause us to chose a
        // failing read-pair instead of a passing one.
        score -= INT16_MIN / 4;
    default:
        break;
    }
    // make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
    // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
    score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
    return score;
 }
 /*
- * Builds a read ends object that represents a single read. 用来表示一个read的特征结构
+ * Builds a read ends object that represents a single read.
 * 用来表示一个read的特征结构
 */
-static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
+void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
 {
    auto &k = *pKey;
    auto &bc = bw.b->core;
    k.read1FirstOfPair = bw.GetFirstOfPairFlag();
    k.read1ReferenceIndex = bc.tid;
    k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
    k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
-    k.read1IndexInFile = index;
+/*
-    k.score = computeDuplicateScore(bw);
+ * 处理一组pairend的readends，标记冗余
-    // Doing this lets the ends object know that it's part of a pair
+ */
-    if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
+void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
-    {
+                        DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
-        k.read2ReferenceIndex = bc.mtid;
+
-    }
+/*
-    // Fill in the location information for optical duplicates
+ * 处理一组非paired的readends，标记冗余
-    rnParser.AddLocationInformation(bw.query_name(), pKey);
+ */
-    // cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
+void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
-    // 计算位置key
+                            DupContainer<int64_t> *dupIdx);
-    k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
+
-}
+/*
 * 处理位于某个坐标的pairend reads
 */
 void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
                 DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
 /*
 * 处理位于某个坐标的非配对的frag reads
 */
 void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
                 DupContainer<int64_t> *dupIdx);
 /*
 * 对找到的pairend read end添加一些信息
 */
 void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds);
 /**
- * Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
+ * Looks through the set of reads and identifies how many of the duplicates are
- * quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
+ * in fact optical duplicates, and stores the data in the instance level histogram.
- * read to the records of the first and second in a pair. This value becomes is used for
+ * Additionally sets the transient isOpticalDuplicate flag on each read end that is
- * the 'DI' tag.
+ * identified as an optical duplicate.
 * 记录光学原因造成的冗余
 */
-static void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe)
+void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe);
 {
 }
 /* 处理一组pairend的readends，标记冗余 */
 static void markDuplicatePairs(int64_t posKey,
                               vector<const ReadEnds *> &vpRe,
                               DupContainer<int64_t> *dupIdx,
                               DupContainer<int64_t> *opticalDupIdx)
 {
    if (vpRe.size() < 2)
    {
        if (vpRe.size() == 1)
        {
            // addSingletonToCount(libraryIdGenerator);
        }
        return;
    }
    // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
    auto &vDupIdx = dupIdx->AtPos(posKey);
    auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
    int maxScore = 0;
    const ReadEnds *pBest = nullptr;
    /** All read ends should have orientation FF, FR, RF, or RR **/
    for (auto pe : vpRe) // 找分数最高的readend
    {
        if (pe->score > maxScore || pBest == nullptr)
        {
            maxScore = pe->score;
            pBest = pe;
        }
    }
    if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
    {
        // trackOpticalDuplicates
    }
    for (auto pe : vpRe) // 对非best read标记冗余
    {
        if (pe != pBest) // 非best
        {
            vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
            if (pe->read2IndexInFile != pe->read1IndexInFile)
                vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
        }
    }
    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
    {
        addRepresentativeReadIndex(vpRe);
    }
 }
 /* 处理一组非paired的readends，标记冗余 */
 static void markDuplicateFragments(int64_t posKey,
                                   vector<const ReadEnds *> &vpRe,
                                   bool containsPairs,
                                   DupContainer<int64_t> *dupIdx)
 {
    auto &vDupIdx = dupIdx->AtPos(posKey);
    if (containsPairs)
    {
        for (auto pe : vpRe)
        {
            if (!pe->IsPaired())
            {
                vDupIdx.push_back(pe->read1IndexInFile);
            }
        }
    }
    else
    {
        int maxScore = 0;
        const ReadEnds *pBest = nullptr;
        for (auto pe : vpRe)
        {
            if (pe->score > maxScore || pBest == nullptr)
            {
                maxScore = pe->score;
                pBest = pe;
            }
        }
        for (auto pe : vpRe)
        {
            if (pe != pBest)
            {
                vDupIdx.push_back(pe->read1IndexInFile);
            }
        }
    }
 }
 /* 处理位于某个坐标的pairend reads */
 static inline void handlePairs(int64_t posKey,
                               vector<ReadEnds> &readEnds,
                               vector<const ReadEnds *> &vpCache,
                               DupContainer<int64_t> *dupIdx,
                               DupContainer<int64_t> *opticalDupIdx)
 {
    if (readEnds.size() > 1) // 有潜在的冗余
    {
        vpCache.clear();
 //        std::sort(readEnds.begin(), readEnds.end());
        const ReadEnds *pReadEnd = nullptr;
        for (auto &re : readEnds)
        {
            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
                vpCache.push_back(&re);                                                           // 处理一个潜在的冗余组
            else
            {
                markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); // 不一样
                vpCache.clear();
                vpCache.push_back(&re);
                pReadEnd = &re;
            }
        }
        markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
    }
 }
 /* 处理位于某个坐标的 reads */
 static inline void handleFrags(
    int64_t posKey,
    vector<ReadEnds> &readEnds,
    vector<const ReadEnds *> &vpCache,
    DupContainer<int64_t> *dupIdx)
 {
    if (readEnds.size() > 1) // 有潜在的冗余
    {
        vpCache.clear();
 //        std::sort(readEnds.begin(), readEnds.end());
        const ReadEnds *pReadEnd = nullptr;
        bool containsPairs = false;
        bool containsFrags = false;
        for (auto &re : readEnds)
        {
            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
            {
                vpCache.push_back(&re);
                containsPairs = containsPairs || re.IsPaired();
                containsFrags = containsFrags || !re.IsPaired();
            }
            else
            {
                if (vpCache.size() > 1 && containsFrags)
                {
                    markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
                }
                vpCache.clear();
                vpCache.push_back(&re);
                pReadEnd = &re;
                containsPairs = re.IsPaired();
                containsFrags = !re.IsPaired();
            }
        }
        if (vpCache.size() > 1 && containsFrags)
        {
            markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
        }
    }
 }
 /* 对找到的pairend read end添加一些信息 */
 static inline void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds)
 {
    auto &pairedEnds = *pPairedEnds;
    int64_t bamIdx = fragEnd.read1IndexInFile;
    const int matesRefIndex = fragEnd.read1ReferenceIndex;
    const int matesCoordinate = fragEnd.read1Coordinate;
    // Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands.  NB: must do this
    // before updating the orientation later.
    if (fragEnd.read1FirstOfPair)
    {
        pairedEnds.orientationForOpticalDuplicates =
            ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
    }
    else
    {
        pairedEnds.orientationForOpticalDuplicates =
            ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
    }
    // If the other read is actually later, simply add the other read's data as read2, else flip the reads
    if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
        (matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
    {
        pairedEnds.read2ReferenceIndex = matesRefIndex;
        pairedEnds.read2Coordinate = matesCoordinate;
        pairedEnds.read2IndexInFile = bamIdx;
        pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
                                                              fragEnd.IsNegativeStrand());
        // if the two read ends are in the same position, pointing in opposite directions,
        // the orientation is undefined and the procedure above
        // will depend on the order of the reads in the file.
        // To avoid this, we set it explicitly (to FR):
        if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
            pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
            pairedEnds.orientation == ReadEnds::RF)
        {
            pairedEnds.orientation = ReadEnds::FR;
        }
    }
    else
    {
        pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
        pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
        pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
        pairedEnds.read1ReferenceIndex = matesRefIndex;
        pairedEnds.read1Coordinate = matesCoordinate;
        pairedEnds.read1IndexInFile = bamIdx;
        pairedEnds.orientation = ReadEnds::GetOrientationByte(fragEnd.IsNegativeStrand(),
                                                              pairedEnds.orientation == ReadEnds::R);
        pairedEnds.posKey = fragEnd.posKey;
    }
    pairedEnds.score += fragEnd.score;
 }
--- a/src/sam/markdups/parallel_md.cpp
+++ b/src/sam/markdups/parallel_md.cpp
--- a/src/sam/markdups/serial_md.cpp
+++ b/src/sam/markdups/serial_md.cpp
@ -0,0 +1,783 @@
 #include "serial_md.h"
 #include <common/hts/bam_buf.h>
 #include <common/utils/debug.h>
 #include <common/utils/global_arg.h>
 #include <common/utils/profiling.h>
 #include <common/utils/timer.h>
 #include <common/utils/util.h>
 #include <sam/utils/read_ends.h>
 #include <algorithm>
 #include <iostream>
 #include <set>
 #include <vector>
 #include "markdups_arg.h"
 #include "md_funcs.h"
 #include "shared_args.h"
 #include "dup_metrics.h"
 using std::cout;
 using std::set;
 using std::vector;
 /* 查找 */
 // template<class Itr, class T>
 // static inline Itr binaryFind(Itr first, Itr last, const T &val)
 // {
 //     first = std::lower_bound(first, last, val);
 //     return (first != last && *first == val) ? first : last;
 // }
 /* 排序 */
 static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
    size_t blockSize = 64 * 1024;
    if (arr.size() < blockSize) {
        std::sort(arr.begin(), arr.end());
        return;
    }
    size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
    size_t crossNum = 1024;
    size_t start, end, i, left, right;
    std::sort(arr.begin(), arr.begin() + blockSize);
    for (i = 1; i < blockNum; ++i) {
        start = i * blockSize;
        end = min(start + blockSize, arr.size());
        std::sort(arr.begin() + start, arr.begin() + end);
        left = crossNum;
        while (!(arr[start - left] < arr[start])) {
            left = left << 1;
            if (left >= blockSize) {
                std::sort(arr.begin(), arr.end());  // 退化到普通排序
                return;
            }
        }
        right = min(crossNum, end - start - 1);
        while (!(arr[start - 1] < arr[start + right])) {
            right = min(right << 1, end - start - 1);
            if (right == end - start - 1)
                break;
        }
        std::sort(arr.begin() + start - left, arr.begin() + start + right);
    }
 }
 /* 处理一组pairend的readends，标记冗余, 这个函数需要串行运行，因为需要做一些统计*/
 static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
                             set<int64_t> *notDupIdx = nullptr) {
    if (vpRe.size() < 2) {
        if (vpRe.size() == 1) {
            // addSingletonToCount(libraryIdGenerator);
            // 这个统计可能会有误差，因为当前位点可能还有没匹配上的read，导致当前位点的read（paired）数量为1
            // 可以通过后续的补充计算来解决这个问题，有必要么？
            gMetrics.AddSingletonToCount();
        }
        return;
    }
    int maxScore = 0;
    const ReadEnds *pBest = nullptr;
    /** All read ends should have orientation FF, FR, RF, or RR **/
    for (auto pe : vpRe) { // 找分数最高的readend
        if (pe->score > maxScore || pBest == nullptr) {
            maxScore = pe->score;
            pBest = pe;
        }
    }
    if (notDupIdx != nullptr) {
        notDupIdx->insert(pBest->read1IndexInFile);
        notDupIdx->insert(pBest->read2IndexInFile);
    }
    if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
        // trackOpticalDuplicates
        trackOpticalDuplicates(vpRe, pBest);
    }
    for (auto pe : vpRe)  // 对非best read标记冗余
    {
        if (pe != pBest)  // 非best
        {
            dupIdx->insert(pe->read1IndexInFile);  // 添加read1
            if (pe->read2IndexInFile != pe->read1IndexInFile)
                dupIdx->insert(pe->read2IndexInFile);  // 添加read2
        }
    }
    // 在输出的bam文件中添加tag
    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
        // addRepresentativeReadIndex(vpRe); // 每次都更新就行，用最新的覆盖之前的（如果之前有）
    }
 }
 /* 处理一组非paired的readends，标记冗余 */
 static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs, set<int64_t> *dupIdx,
                             set<int64_t> *notDupIdx = nullptr) {
    if (containsPairs) {
        for (auto pe : vpRe) {
            if (!pe->IsPaired()) {
                dupIdx->insert(pe->read1IndexInFile);
            }
        }
    } else {
        int maxScore = 0;
        const ReadEnds *pBest = nullptr;
        for (auto pe : vpRe) {
            if (pe->score > maxScore || pBest == nullptr) {
                maxScore = pe->score;
                pBest = pe;
            }
        }
        if (notDupIdx != nullptr) {
            notDupIdx->insert(pBest->read1IndexInFile);
        }
        for (auto pe : vpRe) {
            if (pe != pBest) {
                dupIdx->insert(pe->read1IndexInFile);
            }
        }
    }
 }
 /* 找到与readend pos相等的所有readend */
 static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
    auto range = std::equal_range(src.begin(), src.end(), re,
                                  ReadEnds::PairsLittleThan);  // 只比对位点
    dst->insert(dst->end(), range.first, range.second);
 }
 /* 单线程生成readends (第一步)*/
 static void generateReadEnds(SerailMarkDupArg *arg) {
    auto &p = *arg;
    auto &rnParser = g_vRnParser[0];
    p.pairs.clear();
    p.frags.clear();
    p.unpairedDic.clear();
    p.unpairedPosArr.clear();
    /* 处理每个read，创建ReadEnd，并放入frag和pair中 */
    set<ReadEnds> reSet;
    ReadEnds lastRe;
    for (int i = 0; i < p.bams.size(); ++i)  // 循环处理每个read
    {
        BamWrap *bw = p.bams[i];
        const int64_t bamIdx = p.bamStartIdx + i;
        if (bw->GetReadUnmappedFlag()) {
            if (bw->b->core.tid == -1)
                // When we hit the unmapped reads with no coordinate, no reason
                // to continue (only in coordinate sort).
                break;
        } else if (!bw->IsSecondaryOrSupplementary())  // 是主要比对
        {
            ReadEnds fragEnd;
            tm_arr[8].acc_start();
            buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
            tm_arr[8].acc_end();
            p.frags.push_back(fragEnd);                                 // 添加进frag集合
            if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag())  // 是pairend而且互补的read也比对上了
            {
                string key = bw->query_name();
                if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
                    p.unpairedDic[key] = {p.taskSeq, fragEnd};
                } else  // 找到了pairend
                {
                    auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
                    modifyPairedEnds(fragEnd, &pairedEnds);
                    p.pairs.push_back(pairedEnds);
                    p.unpairedDic.erase(key);  // 删除找到的pairend
                }
            }
        }
    }
    tm_arr[9].acc_start();
    sortReadEndsArr(p.frags);
    // sort(p.frags.begin(), p.frags.end());
    tm_arr[9].acc_end();
    // cout << "sort pairs" << endl;
    tm_arr[10].acc_start();
    sort(p.pairs.begin(), p.pairs.end());
    tm_arr[10].acc_end();
    // 记录位点上的未匹配的read个数
    for (auto &e : p.unpairedDic) {
        auto posKey = e.second.unpairedRE.posKey;
        auto &unpairArrInfo = p.unpairedPosArr[posKey];
        unpairArrInfo.unpairedNum++;
        unpairArrInfo.taskSeq = p.taskSeq;
        unpairArrInfo.readNameSet.insert(e.first);
    }
    // cout << "依赖比例：" <<  (float)p.unpairedDic.size() / p.frags.size() <<
    // endl;
 }
 /* 处理pairs */
 static void processPairs(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *opticalDupIdx,
                         set<int64_t> *notDupIdx = nullptr) {
    vector<const ReadEnds *> vpCache;  // 有可能是冗余的reads
    const ReadEnds *pReadEnd = nullptr;
    for (auto &re : readEnds) {
        if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))  // 跟前一个一样
            vpCache.push_back(&re);  // 处理一个潜在的冗余组
        else {
            markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);  // 不一样
            vpCache.clear();
            vpCache.push_back(&re);
            pReadEnd = &re;
        }
    }
    markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
 }
 /* 处理frags */
 static void processFrags(vector<ReadEnds> &readEnds, set<int64_t> *dupIdx, set<int64_t> *notDupIdx = nullptr) {
    bool containsPairs = false;
    bool containsFrags = false;
    vector<const ReadEnds *> vpCache;  // 有可能是冗余的reads
    const ReadEnds *pReadEnd = nullptr;
    for (auto &re : readEnds) {
        if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
            vpCache.push_back(&re);
            containsPairs = containsPairs || re.IsPaired();
            containsFrags = containsFrags || !re.IsPaired();
        } else {
            if (vpCache.size() > 1 && containsFrags) {
                markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
            }
            vpCache.clear();
            vpCache.push_back(&re);
            pReadEnd = &re;
            containsPairs = re.IsPaired();
            containsFrags = !re.IsPaired();
        }
    }
    if (vpCache.size() > 1 && containsFrags) {
        markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
    }
 }
 /* 单线程markdup (第二步)*/
 static void markdups(SerailMarkDupArg *arg) {
    auto &p = *arg;
    p.pairDupIdx.clear();
    p.pairOpticalDupIdx.clear();
    p.fragDupIdx.clear();
    /* generateDuplicateIndexes，计算冗余read在所有read中的位置索引 */
    // 先处理 pair
    processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
    // 再处理frag
    processFrags(p.frags, &p.fragDupIdx);
 }
 /* 获取交叉部分的数据 */
 static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
                                    bool isPairCmp = false) {
    if (leftArr.empty() || rightArr.empty()) {
        cout << "bad size: " << leftArr.size() << '\t' << rightArr.size() << endl;
        return;
    }
    const size_t leftEndIdx = leftArr.size() - 1;
    const size_t rightStartIdx = 0;
    size_t leftSpan = 0;
    size_t rightSpan = 0;
    while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
        leftSpan += 1;
        if (leftSpan > leftEndIdx) {
            leftSpan = leftArr.size() - 1;
            break;
        }
    }
    while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
        rightSpan += 1;
        if (rightSpan == rightArr.size() - 1)
            break;
    }
    dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
    dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
    std::sort(dst->begin(), dst->end());
 }
 /* 将frags重叠部分的dup idx放进数据中 */
 static inline void refreshFragDupIdx(set<int64_t> &dupIdx, set<int64_t> &notDupIdx, SerailMarkDupArg *lastArg,
                                     SerailMarkDupArg *curArg) {
    auto &lp = *lastArg;
    auto &p = *curArg;
    for (auto idx : dupIdx) {
        lp.fragDupIdx.insert(idx);
        p.fragDupIdx.erase(idx);
    }
    for (auto idx : notDupIdx) {
        lp.fragDupIdx.erase(idx);
        p.fragDupIdx.erase(idx);
    }
 }
 /* 将pairs重叠部分的dup idx放进数据中 */
 static inline void refreshPairDupIdx(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> &notDupIdx,
                                     SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
    auto &lp = *lastArg;
    auto &p = *curArg;
    for (auto idx : dupIdx) {
        lp.pairDupIdx.insert(idx);
        p.pairDupIdx.erase(idx);
    }
    for (auto idx : opticalDupIdx) {
        lp.pairOpticalDupIdx.insert(idx);
        p.pairOpticalDupIdx.erase(idx);
    }
    for (auto idx : notDupIdx) {
        lp.pairDupIdx.erase(idx);
        lp.pairOpticalDupIdx.erase(idx);
        p.pairDupIdx.erase(idx);
        p.pairOpticalDupIdx.erase(idx);
    }
 }
 // 用来分别处理dup和optical dup
 static void refeshTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &opticalDupIdx, set<int64_t> &notDupIdx,
                              set<int64_t> &latterDupIdx, set<int64_t> &latterOpticalDupIdx,
                              set<int64_t> &latterNotDupIdx) {
    for (auto idx : dupIdx) latterDupIdx.insert(idx);
    for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx);
    for (auto idx : notDupIdx) latterNotDupIdx.insert(idx);
 }
 /* 最后合并数据并排序 */
 static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx, set<int64_t> &notDupIdx, vector<int64_t> &dupArr) {
    vector<int64_t> midArr;
    auto ai = dupArr.begin();
    auto bi = dupIdx.begin();
    auto ae = dupArr.end();
    auto be = dupIdx.end();
    int64_t val = 0;
    while (ai != ae && bi != be) {
        if (*ai < *bi) {
            val = *ai++;
        } else if (*bi < *ai) {
            val = *bi++;
        } else {
            val = *ai++;
            bi++;
        }
        if (notDupIdx.find(val) == notDupIdx.end()) {
            midArr.push_back(val);
        }
    }
    while (ai != ae) {
        val = *ai++;
        if (notDupIdx.find(val) == notDupIdx.end()) {
            midArr.push_back(val);
        }
    }
    while (bi != be) {
        val = *bi++;
        if (notDupIdx.find(val) == notDupIdx.end()) {
            midArr.push_back(val);
        }
    }
    dupArr = midArr;
 }
 /* 处理相邻的两个任务，有相交叉的数据  */
 static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) {
    auto &lp = *lastArg;
    auto &p = *curArg;
    auto &g = *gDataArg;
    vector<ReadEnds> reArr;
    set<int64_t> dupIdx;
    set<int64_t> notDupIdx;
    // 先处理重叠的frags
    getIntersectData(lp.frags, p.frags, &reArr);
    processFrags(reArr, &dupIdx, &notDupIdx);
    refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
    // 再处理重叠的pairs
    reArr.clear();
    dupIdx.clear();
    notDupIdx.clear();
    set<int64_t> opticalDupIdx;
    getIntersectData(lp.pairs, p.pairs, &reArr, true);
    processPairs(reArr, &dupIdx, &opticalDupIdx, &notDupIdx);
    refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
    // 处理之前未匹配的部分
    map<CalcKey, int64_t> recalcPos;
    set<CalcKey> alreadyAdd;  // 与该位点相同的pair都添加到数组里了
    set<int64_t> addToGlobal;
    int64_t prevLastPos = 0, nextFirstPos = 0;
    if (lp.frags.size() > 0)
        prevLastPos = lp.frags.back().posKey;
    if (p.frags.size() > 0)
        nextFirstPos = p.frags[0].posKey;
    // cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
    for (auto &prevUnpair : lp.unpairedDic) {  // 遍历上一个任务中的每个未匹配的read
        auto &readName = prevUnpair.first;
        auto &prevPosInfo = prevUnpair.second;
        auto prevFragEnd = prevPosInfo.unpairedRE;  // 未匹配的read end
        if (p.unpairedDic.find(readName) != p.unpairedDic.end()) {  // 在当前这个任务里找到了这个未匹配的read
            auto &nextPosInfo = p.unpairedDic[readName];
            auto &nextFragEnd = nextPosInfo.unpairedRE;
            int64_t prevPosKey = prevFragEnd.posKey;
            modifyPairedEnds(nextFragEnd, &prevFragEnd);  // 在某些clip情况下，poskey可能是后面的read
            int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
            CalcKey ck = {prevPosKey, nextPosKey};
            UnpairedPosInfo *prevUnpairInfoP = nullptr;
            UnpairedPosInfo *nextUnpairInfoP = nullptr;
            if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
                prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
            if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
                nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
            // pos分为两种情况，根据poskey(pair中两个read分别的pos)的位置确定
            //  1.
            //  prevpos在交叉部分之前，nextpos在交叉部分之后，这种情况不需要获取pairarr中的数据;
            //  2.
            //  prevpos在交叉部分之前，nextpos在交叉部分，需要获取lp中的相等read
            //  pair进行重新计算
            //     复杂情况1.
            //     g中包含prevPosKey对应的unpair，p中有对应的pair，此时应该把这些pair考虑进去
            //  3.
            //  prevpos在交叉部分，nextpos在交叉部分之后，需要获取p中的相等read
            //  pair进行重新计算
            //     复杂情况2. p中是否包含prevPosKey对应的unpair
            //  4.
            //  prevpos在交叉部分，nextpos在交叉部分，需要获取lp和p中的相等read
            //  pair进行重新计算
            bool addDataToPos = true;
            if (alreadyAdd.find(ck) != alreadyAdd.end()) {
                addDataToPos = false;  // 之前已经添加过了，后面就不用再添加数据了
            } else
                alreadyAdd.insert(ck);
            if (prevPosKey < nextFirstPos) {                   // prevpos在交叉部分之前
                auto &prevPairArr = prevUnpairInfoP->pairArr;  // prevUnpairInfoP肯定不是nullptr
                prevPairArr.push_back(prevFragEnd);
                if (nextPosKey <= prevLastPos && addDataToPos) {  // 第二种情况
                    getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
                }
                // 第一种情况，第二种情况下都会出现，复杂情况一
                auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
                if (gPosInfo != g.unpairedPosArr.end()) {  // 可能g和p有匹配的，刚好和该位点一致
                    auto &gUnpairInfo = gPosInfo->second;
                    auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
                    if (pPosInfo != p.unpairedPosArr.end()) {
                        auto &pUnpairInfo = pPosInfo->second;
                        for (auto &rn : gUnpairInfo.readNameSet) {  // 遍历每一个readname，看是否有匹配的
                            if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end()) {
                                auto pe = g.unpairedDic[rn].unpairedRE;
                                auto fe = p.unpairedDic[rn].unpairedRE;
                                modifyPairedEnds(fe, &pe);
                                prevPairArr.push_back(pe);
                                g.unpairedDic.erase(rn);
                                p.unpairedDic.erase(rn);
                                // cout << "找到了！" << rn << endl;
                            }
                        }
                    }
                }
                recalcPos[ck] = prevPosInfo.taskSeq;
                std::sort(prevPairArr.begin(), prevPairArr.end());
            } else { // prevpos在交叉部分
                if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
                    if (nextUnpairInfoP != nullptr) { // 且在pos点，next task有unpair，这样才把这些数据放到next task里
                        auto &nextPairArr = nextUnpairInfoP->pairArr;
                        nextPairArr.push_back(prevFragEnd);
                        auto &prevPairArr = prevUnpairInfoP->pairArr;
                        prevPairArr.push_back(prevFragEnd);
                        if (addDataToPos) {
                            getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
                        }
                        // 将数据放到next task里,（这个位点以后会可能还会计算到，目前方案是都计算，只是把冗余剔除）
                        recalcPos[ck] = nextPosInfo.taskSeq;
                        std::sort(prevPairArr.begin(), prevPairArr.end());
                    } else { // next task在该位点没有unpair，那就把数据放到prev task里
                        auto &prevPairArr = prevUnpairInfoP->pairArr;  // prevUnpairInfoP肯定不是nullptr
                        prevPairArr.push_back(prevFragEnd);
                        if (addDataToPos)  // 第二种情况
                            getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
                        recalcPos[ck] = prevPosInfo.taskSeq;
                        std::sort(prevPairArr.begin(), prevPairArr.end());
                    }
                } else {  // 第四种情况
                    if (prevUnpairInfoP == nullptr) {
                        prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
                        prevUnpairInfoP->taskSeq = lp.taskSeq;
                    }
                    auto &prevPairArr = prevUnpairInfoP->pairArr;
                    prevPairArr.push_back(prevFragEnd);
                    if (addDataToPos) {
                        getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
                        getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
                    }
                    recalcPos[ck] = prevPosInfo.taskSeq;
                    std::sort(prevPairArr.begin(), prevPairArr.end());
                }
            }
            p.unpairedDic.erase(readName); // 在next task里删除该read
        } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
            auto &remainPosInfo = g.unpairedDic[readName];
            auto remainFragEnd = remainPosInfo.unpairedRE;
            int64_t remainPosKey = remainFragEnd.posKey;
            modifyPairedEnds(prevFragEnd, &remainFragEnd);  // 在某些clip情况下，poskey可能是后面的read
            auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
            auto &remainPairArr = remainUnpairInfo.pairArr;
            remainPairArr.push_back(remainFragEnd);
            CalcKey ck = {remainPosKey, prevFragEnd.posKey};
            recalcPos[ck] = remainPosInfo.taskSeq;
            std::sort(remainPairArr.begin(), remainPairArr.end());
            g.unpairedDic.erase(readName);
        } else { // 都没找到，那就保存到遗留数据里
            int64_t prevPosKey = prevFragEnd.posKey;
            g.unpairedDic.insert(prevUnpair);
            addToGlobal.insert(prevPosKey);
        }
    }
    // 最后再添加，以防开始赋值，后来这个位置要是又添加了新的数据
    for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
    map<int64_t, TaskSeqDupInfo> taskChanged;
    set<int64_t> posProcessed;
    for (auto &e : recalcPos) {
        auto posKey = e.first.read1Pos;
        if (posProcessed.find(posKey) != posProcessed.end())
            continue;
        posProcessed.insert(posKey);
        auto taskSeq = e.second;
        auto &t = taskChanged[taskSeq];
        // 在对应的任务包含的dup idx里修改结果数据
        vector<ReadEnds> *pairArrP = nullptr;
        if (taskSeq < lp.taskSeq)
            pairArrP = &g.unpairedPosArr[posKey].pairArr;
        else
            pairArrP = &lp.unpairedPosArr[posKey].pairArr;
        processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
        if (taskSeq < lp.taskSeq)
            g.unpairedPosArr.erase(posKey);
    }
    // 更新结果
    for (auto &e : taskChanged) {
        auto taskSeq = e.first;
        auto &t = e.second;
        if (taskSeq < lp.taskSeq) {
            refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
                              g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
        } else if (taskSeq == lp.taskSeq) {
            refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
        } else {
            refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp);  // 把结果放到p中
        }
    }
    // cout << "remain unpaired: " << g.unpairedDic.size() << '\t' <<
    // g.unpairedPosArr.size() << endl; cout << "calc g time: " <<
    // t.seconds_elapsed() << " s" << endl; 将dupidx放进全局数据
    g.latterDupIdxArr.push_back(set<int64_t>());
    g.latterOpticalDupIdxArr.push_back(set<int64_t>());
    g.latterNotDupIdxArr.push_back(set<int64_t>());
    g.dupIdxArr.push_back(vector<int64_t>());
    auto &vIdx = g.dupIdxArr.back();
    lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
    vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
    g.opticalDupIdxArr.push_back(vector<int64_t>());
    auto &vOpticalIdx = g.opticalDupIdxArr.back();
    vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
 }
 /* 当所有任务结束后，global data里还有未处理的数据 */
 static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
    auto &lp = *task;
    auto &g = *gDataArg;
    // 遗留的未匹配的pair
    for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
        auto &readName = prevUnpair.first;
        auto &prevPosInfo = prevUnpair.second;
        auto prevFragEnd = prevPosInfo.unpairedRE;  // 未匹配的read end
        if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
            auto &remainPosInfo = g.unpairedDic[readName];
            auto remainFragEnd = remainPosInfo.unpairedRE;
            int64_t remainPosKey = remainFragEnd.posKey;
            modifyPairedEnds(prevFragEnd, &remainFragEnd);  // 在某些clip情况下，poskey可能是后面的read
            auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
            remainUnpairInfo.pairArr.push_back(remainFragEnd);
            g.unpairedDic.erase(readName);
        }
    }
    map<int64_t, TaskSeqDupInfo> taskChanged;
    for (auto &e : g.unpairedPosArr) {
        auto posKey = e.first;
        auto taskSeq = e.second.taskSeq;
        auto &t = taskChanged[taskSeq];
        auto &arr = g.unpairedPosArr[posKey].pairArr;
        if (arr.size() > 1) {
            std::sort(arr.begin(), arr.end());
            processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
        }
    }
    // 更新结果
    vector<int64_t> addDup;
    map<int64_t, int64_t> ndPosVal;
    for (auto &e : taskChanged) {
        auto taskSeq = e.first;
        auto &t = e.second;
        refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx, g.latterDupIdxArr[taskSeq],
                          g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
    }
    cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
    g.unpairedPosArr.clear();
    g.unpairedDic.clear();
    // 将dupidx放进全局数据
    for (int i = 0; i < (int)g.dupIdxArr.size() - 1; ++i)
        refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
    for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i)
        refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
    g.dupIdxArr.push_back(vector<int64_t>());
    auto &vIdx = g.dupIdxArr.back();
    lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
    vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
    g.opticalDupIdxArr.push_back(vector<int64_t>());
    auto &vOpticalIdx = g.opticalDupIdxArr.back();
    vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
 }
 /* 串行处理数据，标记冗余 */
 void serialMarkDups() {
    tm_arr[5].acc_start();
    Timer::log_time("serial start");
    // 读取缓存初始化
    BamBufType inBamBuf(g_gArg.use_asyncio);
    inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
    // BamBufType inBamBuf(false);
    // inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
    int64_t processedBamNum = 0;
    SerailMarkDupArg smdArg1, smdArg2;
    SerailMarkDupArg *lastArgP = &smdArg1;
    SerailMarkDupArg *curArgP = &smdArg2;
    bool isFirstRound = true;
    int roundNum = 0;
    int64_t readNumSum = 0;
    while (inBamBuf.ReadStat() >= 0) {
        Timer t_round;
        // 读取bam文件中的read
        tm_arr[4].acc_start();
        size_t readNum = inBamBuf.ReadBam();
        readNumSum += readNum;
        tm_arr[4].acc_end();
        cout << "read num: " << readNum << '\t' << roundNum << endl;
        // lastArgP = curArgP;
        tm_arr[6].acc_start();
        curArgP->taskSeq = roundNum;
        curArgP->bamStartIdx = processedBamNum;
        curArgP->bams = inBamBuf.GetBamArr();
        tm_arr[6].acc_end();
        tm_arr[0].acc_start();
        Timer t1;
        generateReadEnds(curArgP);
        // cout << "calc read end time: " << t1.seconds_elapsed() << " s" <<
        // endl;
        tm_arr[0].acc_end();
        tm_arr[1].acc_start();
        t1.reinit();
        markdups(curArgP);
        // cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
        tm_arr[1].acc_end();
        if (!isFirstRound) {
            tm_arr[2].acc_start();
            t1.reinit();
            handleIntersectData(lastArgP, curArgP, &gData);
            // cout << "intersect time: " << t1.seconds_elapsed() << " s" <<
            // endl;
            //  addTaskIdxToSet(lastArgP, &gData);
            tm_arr[2].acc_end();
        } else {
            isFirstRound = false;
        }
        inBamBuf.ClearAll();  // 清理上一轮读入的数据
        processedBamNum += readNum;
        // 交换
        auto tmp = lastArgP;
        lastArgP = curArgP;
        curArgP = tmp;
        // cout << "round time: " << t_round.seconds_elapsed() << endl;
        roundNum++;
        if (roundNum % 100 == 0) {
            cout << "read sum: " << readNumSum << endl;
            cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
        }
    }
    // cout << "here" << endl;
    tm_arr[3].acc_start();
    // 处理剩下的全局数据
    handleLastTask(lastArgP, &gData);
    // cout << "here 2" << endl;
    tm_arr[3].acc_end();
    tm_arr[5].acc_end();
    // 统计所有冗余index数量
    int64_t dupNum = 0;
    map<int64_t, int> dup;
    int taskSeq = 0;
    for (auto &arr : gData.dupIdxArr) {
        for (auto idx : arr) {
            if (dup.find(idx) != dup.end()) {
                // cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
                // << idx << endl;
            }
            dup[idx] = taskSeq;
        }
        taskSeq++;
    }
    // #include <fstream>
    // ofstream out("tumor_dup.txt");
    // for (auto idx : dup)
    // {
    //     out << idx << endl;
    // }
    // out.close();
    for (auto &arr : gData.dupIdxArr) dupNum += arr.size();
    cout << "dup num     : " << dupNum << '\t' << dup.size() << endl;
    cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
    cout << "markdup     : " << tm_arr[1].acc_seconds_elapsed() << endl;
    cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
    cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
    cout << "read bam    : " << tm_arr[4].acc_seconds_elapsed() << endl;
    cout << "new arg     : " << tm_arr[6].acc_seconds_elapsed() << endl;
    cout << "del arg     : " << tm_arr[7].acc_seconds_elapsed() << endl;
    cout << "build ends  : " << tm_arr[8].acc_seconds_elapsed() << endl;
    cout << "sort frags  : " << tm_arr[9].acc_seconds_elapsed() << endl;
    cout << "sort pairs  : " << tm_arr[10].acc_seconds_elapsed() << endl;
    cout << "all         : " << tm_arr[5].acc_seconds_elapsed() << endl;
    Timer::log_time("serial end  ");
    // for (auto i : gData.dupArr)
    //     cout << i << endl;
 }
--- a/src/sam/markdups/serial_md.h
+++ b/src/sam/markdups/serial_md.h
@ -1,20 +1,28 @@
-#include <algorithm>
+#pragma once
 #include <common/hts/bam_buf.h>
 #include <robin-map/include/tsl/robin_map.h>
 #include <sam/utils/read_ends.h>
 #include <set>
 #include <string>
 #include <vector>
 using std::set;
 using std::string;
 using std::vector;
 /* 存放未匹配readend相同位点的所有readend */
-struct UnpairedREInfo
+struct UnpairedREInfo {
 {
    int64_t taskSeq;
    ReadEnds unpairedRE;
 };
 /* 对于一个pair数据，一个完整的计算点，包含read1的比对位置和read2的比对位置 */
-struct CalcKey
+struct CalcKey {
 {
    int64_t read1Pos;
    int64_t read2Pos;
-    bool operator<(const CalcKey &o) const
+    bool operator<(const CalcKey &o) const {
    {
        int comp = (int)(read1Pos - o.read1Pos);
        if (comp == 0)
            comp = (int)(read2Pos - o.read2Pos);
@ -23,16 +31,14 @@ struct CalcKey
 };
 /* 当遗留数据在当前任务找到了pair read后，进行冗余计算时候存放结果的数据结构 */
-struct TaskSeqDupInfo
+struct TaskSeqDupInfo {
 {
    set<int64_t> dupIdx;
    set<int64_t> opticalDupIdx;
    set<int64_t> notDupIdx;
 };
 /* 保存有未匹配pair位点的信息，包括read end数组和有几个未匹配的read end */
-struct UnpairedPosInfo
+struct UnpairedPosInfo {
 {
    int unpairedNum = 0;
    int64_t taskSeq;
    vector<ReadEnds> pairArr;
@ -45,13 +51,12 @@ typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name
 typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;  // 以位点为索引，保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
 /* 单线程处理冗余参数结构体 */
-struct SerailMarkDupArg
+struct SerailMarkDupArg {
-{
+    int64_t taskSeq; // 任务序号
    int64_t taskSeq;
    int64_t bamStartIdx;  // 当前vBam数组中第一个bam记录在整体bam中所处的位置
    vector<BamWrap *> bams;  // 存放待处理的bam read
-    vector<ReadEnds> pairs;
+    vector<ReadEnds> pairs; // 成对的reads
-    vector<ReadEnds> frags;
+    vector<ReadEnds> frags; // 暂未找到配对的reads
    set<int64_t> pairDupIdx;         // pair的冗余read的索引
    set<int64_t> pairOpticalDupIdx;  // optical冗余read的索引
    set<int64_t> fragDupIdx;         // frag的冗余read的索引
@ -60,8 +65,7 @@ struct SerailMarkDupArg
 };
 /* 全局保留的数据，因为有些paired数据比对到了不同的染色体，相距甚远 */
-struct GlobalDataArg
+struct GlobalDataArg {
 {
    UnpairedNameMap unpairedDic;  // 用来寻找pair end
    UnpairedPositionMap unpairedPosArr;
@ -75,869 +79,5 @@ struct GlobalDataArg
    vector<set<int64_t>> latterNotDupIdxArr;
 };
-static GlobalDataArg gData;
+// 串行运行mark duplicate
-
+void serialMarkDups();
 /* 查找 */
 // template<class Itr, class T>
 // static inline Itr binaryFind(Itr first, Itr last, const T &val)
 // {
 //     first = std::lower_bound(first, last, val);
 //     return (first != last && *first == val) ? first : last;
 // }
 /* 排序 */
 static inline void sortReadEndsArr(vector<ReadEnds> &arr)
 {
    size_t blockSize = 64 * 1024;
    blockSize = min(blockSize, arr.size());
    size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
    size_t crossNum = 1024;
    size_t start, end, i, left, right;
    std::sort(arr.begin(), arr.begin() + blockSize);
    for (i = 1; i < blockNum; ++i)
    {
        start = i * blockSize;
        end = min(start + blockSize, arr.size());
        std::sort(arr.begin() + start, arr.begin() + end);
        left = crossNum;
        while (!(arr[start - left] < arr[start]))
        {
            left = left << 1;
            if (left >= blockSize)
            {
                std::sort(arr.begin(), arr.end()); // 退化到普通排序
                return;
            }
        }
        right = min(crossNum, end - start - 1);
        while (!(arr[start - 1] < arr[start + right]))
        {
            right = min(right << 1, end - start - 1);
            if (right == end - start - 1)
                break;
        }
        std::sort(arr.begin() + start - left, arr.begin() + start + right);
    }
 }
 /* 处理一组pairend的readends，标记冗余 */
 static void markDupsForPairs(vector<const ReadEnds *> &vpRe,
                                  set<int64_t> *dupIdx,
                                  set<int64_t> *opticalDupIdx,
                                  set<int64_t> *notDupIdx = nullptr)
 {
    if (vpRe.size() < 2)
    {
        if (vpRe.size() == 1)
        {
            // addSingletonToCount(libraryIdGenerator);
        }
        return;
    }
    int maxScore = 0;
    const ReadEnds *pBest = nullptr;
    /** All read ends should have orientation FF, FR, RF, or RR **/
    for (auto pe : vpRe) // 找分数最高的readend
    {
        if (pe->score > maxScore || pBest == nullptr)
        {
            maxScore = pe->score;
            pBest = pe;
        }
    }
    if (notDupIdx != nullptr)
    {
        notDupIdx->insert(pBest->read1IndexInFile);
        notDupIdx->insert(pBest->read2IndexInFile);
    }
    if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
    {
        // trackOpticalDuplicates
    }
    for (auto pe : vpRe) // 对非best read标记冗余
    {
        if (pe != pBest) // 非best
        {
            dupIdx->insert(pe->read1IndexInFile); // 添加read1
            if (pe->read2IndexInFile != pe->read1IndexInFile)
                dupIdx->insert(pe->read2IndexInFile); // 添加read2
        }
    }
    // if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
    // {
    //     addRepresentativeReadIndex(vpRe);
    // }
 }
 /* 处理一组非paired的readends，标记冗余 */
 static void markDupsForFrags(vector<const ReadEnds *> &vpRe,
                             bool containsPairs,
                             set<int64_t> *dupIdx,
                             set<int64_t> *notDupIdx = nullptr)
 {
    if (containsPairs)
    {
        for (auto pe : vpRe)
        {
            if (!pe->IsPaired())
            {
                dupIdx->insert(pe->read1IndexInFile);
            }
        }
    }
    else
    {
        int maxScore = 0;
        const ReadEnds *pBest = nullptr;
        for (auto pe : vpRe)
        {
            if (pe->score > maxScore || pBest == nullptr)
            {
                maxScore = pe->score;
                pBest = pe;
            }
        }
        if (notDupIdx != nullptr)
        {
            notDupIdx->insert(pBest->read1IndexInFile);
        }
        for (auto pe : vpRe)
        {
            if (pe != pBest)
            {
                dupIdx->insert(pe->read1IndexInFile);
            }
        }
    }
 }
 /* 找到与readend pos相等的所有readend */
 static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
 {
    auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsLittleThan); // 只比对位点
    dst->insert(dst->end(), range.first, range.second);
 }
 /* 单线程生成readends (第一步)*/
 static void generateReadEnds(SerailMarkDupArg *arg)
 {
    auto &p = *arg;
    auto &rnParser = g_vRnParser[0];
    p.pairs.clear();
    p.frags.clear();
    p.unpairedDic.clear();
    p.unpairedPosArr.clear();
    /* 处理每个read，创建ReadEnd，并放入frag和pair中 */
    set<ReadEnds> reSet;
    ReadEnds lastRe;
    for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
    {
        BamWrap *bw = p.bams[i];
        const int64_t bamIdx = p.bamStartIdx + i;
        if (bw->GetReadUnmappedFlag())
        {
            if (bw->b->core.tid == -1)
                // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
                break;
        }
        else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
        {
            ReadEnds fragEnd;
            tm_arr[8].acc_start();
            buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
            tm_arr[8].acc_end();
            p.frags.push_back(fragEnd);                                // 添加进frag集合
            if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
            {
                string key = bw->query_name();
                if (p.unpairedDic.find(key) == p.unpairedDic.end())
                {
                    p.unpairedDic[key] = {p.taskSeq, fragEnd};
                }
                else // 找到了pairend
                {
                    auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
                    modifyPairedEnds(fragEnd, &pairedEnds);
                    p.pairs.push_back(pairedEnds);
                    p.unpairedDic.erase(key); // 删除找到的pairend
                }
            }
        }
    }
    tm_arr[9].acc_start();
    sortReadEndsArr(p.frags);
    // sort(p.frags.begin(), p.frags.end());
    tm_arr[9].acc_end();
    // cout << "sort pairs" << endl;
    tm_arr[10].acc_start();
    sort(p.pairs.begin(), p.pairs.end());
    tm_arr[10].acc_end();
    // 记录位点上的未匹配的read个数
    for (auto &e : p.unpairedDic) {
        auto posKey = e.second.unpairedRE.posKey;
        auto &unpairArrInfo = p.unpairedPosArr[posKey];
        unpairArrInfo.unpairedNum++;
        unpairArrInfo.taskSeq = p.taskSeq;
        unpairArrInfo.readNameSet.insert(e.first);
    }
    cout << "依赖比例：" <<  (float)p.unpairedDic.size() / p.frags.size() << endl;
 }
 /* 处理pairs */
 static void processPairs(vector<ReadEnds> &readEnds,
                         set<int64_t> *dupIdx,
                         set<int64_t> *opticalDupIdx,
                         set<int64_t> *notDupIdx = nullptr)
 {
    vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
    const ReadEnds *pReadEnd = nullptr;
    for (auto &re : readEnds)
    {
        if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
            vpCache.push_back(&re);                                                           // 处理一个潜在的冗余组
        else
        {
            markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
            vpCache.clear();
            vpCache.push_back(&re);
            pReadEnd = &re;
        }
    }
    markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
 }
 /* 处理frags */
 static void processFrags(vector<ReadEnds> &readEnds,
                         set<int64_t> *dupIdx,
                         set<int64_t> *notDupIdx = nullptr)
 {
    bool containsPairs = false;
    bool containsFrags = false;
    vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
    const ReadEnds *pReadEnd = nullptr;
    for (auto &re : readEnds)
    {
        if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
        {
            vpCache.push_back(&re);
            containsPairs = containsPairs || re.IsPaired();
            containsFrags = containsFrags || !re.IsPaired();
        }
        else
        {
            if (vpCache.size() > 1 && containsFrags)
            {
                markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
            }
            vpCache.clear();
            vpCache.push_back(&re);
            pReadEnd = &re;
            containsPairs = re.IsPaired();
            containsFrags = !re.IsPaired();
        }
    }
    if (vpCache.size() > 1 && containsFrags)
    {
        markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
    }
 }
 /* 单线程markdup (第二步)*/
 static void markdups(SerailMarkDupArg *arg)
 {
    auto &p = *arg;
    p.pairDupIdx.clear();
    p.pairOpticalDupIdx.clear();
    p.fragDupIdx.clear();
    /* generateDuplicateIndexes，计算冗余read在所有read中的位置索引 */
    // 先处理 pair
    processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
    // 再处理frag
    processFrags(p.frags, &p.fragDupIdx);
 }
 /* 获取交叉部分的数据 */
 static inline void getIntersectData(vector<ReadEnds> &leftArr,
                                    vector<ReadEnds> &rightArr,
                                    vector<ReadEnds> *dst,
                                    bool isPairCmp = false)
 {
    const size_t leftEndIdx = leftArr.size() - 1;
    const size_t rightStartIdx = 0;
    size_t leftSpan = 0;
    size_t rightSpan = 0;
    while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp))
    {
        leftSpan += 1;
        if (leftSpan > leftEndIdx)
        {
            leftSpan = leftArr.size() - 1;
            break;
        }
    }
    while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp))
    {
        rightSpan += 1;
        if (rightSpan == rightArr.size() - 1)
            break;
    }
    dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
    dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
    std::sort(dst->begin(), dst->end());
 }
 /* 将frags重叠部分的dup idx放进数据中 */
 static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
                              set<int64_t> &notDupIdx, 
                              SerailMarkDupArg * lastArg, 
                              SerailMarkDupArg *curArg)
 {
    auto &lp = *lastArg;
    auto &p = *curArg;
    for (auto idx : dupIdx)
    {
        lp.fragDupIdx.insert(idx);
        p.fragDupIdx.erase(idx);
    }
    for (auto idx : notDupIdx)
    {
        lp.fragDupIdx.erase(idx);
        p.fragDupIdx.erase(idx);
    }
 }
 /* 将pairs重叠部分的dup idx放进数据中 */
 static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
                              set<int64_t> &opticalDupIdx,
                              set<int64_t> &notDupIdx,
                              SerailMarkDupArg *lastArg,
                              SerailMarkDupArg *curArg)
 {
    auto &lp = *lastArg;
    auto &p = *curArg;
    for (auto idx : dupIdx)
    {
        lp.pairDupIdx.insert(idx);
        p.pairDupIdx.erase(idx);
    }
    for (auto idx : opticalDupIdx)
    {
        lp.pairOpticalDupIdx.insert(idx);
        p.pairOpticalDupIdx.erase(idx);
    }
    for (auto idx : notDupIdx)
    {
        lp.pairDupIdx.erase(idx);
        lp.pairOpticalDupIdx.erase(idx);
        p.pairDupIdx.erase(idx);
        p.pairOpticalDupIdx.erase(idx);
    }
 }
 // 用来分别处理dup和optical dup
 static void refeshTaskDupInfo(set<int64_t> &dupIdx,
                              set<int64_t> &opticalDupIdx,
                              set<int64_t> &notDupIdx,
                              set<int64_t> &latterDupIdx,
                              set<int64_t> &latterOpticalDupIdx,
                              set<int64_t> &latterNotDupIdx)
 {
    for (auto idx : dupIdx)
        latterDupIdx.insert(idx);
    for (auto idx : opticalDupIdx)
        latterOpticalDupIdx.insert(idx);
    for (auto idx : notDupIdx)
        latterNotDupIdx.insert(idx);
 }
 /* 最后合并数据并排序 */
 static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx,
                                   set<int64_t> &notDupIdx,
                                   vector<int64_t> &dupArr)
 {
    vector<int64_t> midArr;
    auto ai = dupArr.begin();
    auto bi = dupIdx.begin();
    auto ae = dupArr.end();
    auto be = dupIdx.end();
    int64_t val = 0;
    while (ai != ae && bi != be)
    {
        if (*ai < *bi)
        {
            val = *ai++;
        }
        else if (*bi < *ai)
        {
            val = *bi++;
        }
        else
        {
            val = *ai++;
            bi++;
        }
        if (notDupIdx.find(val) == notDupIdx.end())
        {
            midArr.push_back(val);
        }
    }
    while (ai != ae)
    {
        val = *ai++;
        if (notDupIdx.find(val) == notDupIdx.end())
        {
            midArr.push_back(val);
        }
    }
    while (bi != be)
    {
        val = *bi++;
        if (notDupIdx.find(val) == notDupIdx.end())
        {
            midArr.push_back(val);
        }
    }
    dupArr = midArr;
 }
 /* 处理相邻的两个任务，有相交叉的数据  */
 static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg)
 {
    auto &lp = *lastArg;
    auto &p = *curArg;
    auto &g = *gDataArg;
    vector<ReadEnds> reArr;
    set<int64_t> dupIdx;
    set<int64_t> notDupIdx;
    // 先处理重叠的frags
    getIntersectData(lp.frags, p.frags, &reArr);
    processFrags(reArr, &dupIdx, &notDupIdx);
    refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
    // 再处理重叠的pairs
    reArr.clear();
    dupIdx.clear();
    notDupIdx.clear();
    set<int64_t> opticalDupIdx;
    getIntersectData(lp.pairs, p.pairs, &reArr, true);
    processPairs(reArr, &dupIdx, &opticalDupIdx, &notDupIdx);
    refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
    // 处理之前未匹配的部分
    map<CalcKey, int64_t> recalcPos;
    set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
    set<int64_t> addToGlobal;
    int64_t prevLastPos = 0, nextFirstPos = 0;
    if (lp.frags.size() > 0)
        prevLastPos = lp.frags.back().posKey;
    if (p.frags.size() > 0)
        nextFirstPos = p.frags[0].posKey;
    // cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
    for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
    {
        auto &readName = prevUnpair.first;
        auto &prevPosInfo = prevUnpair.second;
        auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
        if (p.unpairedDic.find(readName) != p.unpairedDic.end())           // 在当前这个任务里找到了这个未匹配的read
        {
            auto &nextPosInfo = p.unpairedDic[readName];
            auto &nextFragEnd = nextPosInfo.unpairedRE;
            int64_t prevPosKey = prevFragEnd.posKey;
            modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下，poskey可能是后面的read
            int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
            CalcKey ck = {prevPosKey, nextPosKey};
            UnpairedPosInfo *prevUnpairInfoP = nullptr;
            UnpairedPosInfo *nextUnpairInfoP = nullptr;
            if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
                prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
            if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
                nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
            // pos分为两种情况，根据poskey(pair中两个read分别的pos)的位置确定
            //  1. prevpos在交叉部分之前，nextpos在交叉部分之后，这种情况不需要获取pairarr中的数据;
            //  2. prevpos在交叉部分之前，nextpos在交叉部分，需要获取lp中的相等read pair进行重新计算
            //     复杂情况1. g中包含prevPosKey对应的unpair，p中有对应的pair，此时应该把这些pair考虑进去
            //  3. prevpos在交叉部分，nextpos在交叉部分之后，需要获取p中的相等read pair进行重新计算
            //     复杂情况2. p中是否包含prevPosKey对应的unpair
            //  4. prevpos在交叉部分，nextpos在交叉部分，需要获取lp和p中的相等read pair进行重新计算
            bool addDataToPos = true;
            if (alreadyAdd.find(ck) != alreadyAdd.end())
            {
                addDataToPos = false; // 之前已经添加过了，后面就不用再添加数据了
            }
            else
                alreadyAdd.insert(ck);
            if (prevPosKey < nextFirstPos) // prevpos在交叉部分之前
            {
                auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
                prevPairArr.push_back(prevFragEnd);
                if (nextPosKey <= prevLastPos && addDataToPos) // 第二种情况
                {
                    getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
                }
                // 第一种情况，第二种情况下都会出现，复杂情况一
                auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
                if (gPosInfo != g.unpairedPosArr.end()) // 可能g和p有匹配的，刚好和该位点一致
                {
                    auto &gUnpairInfo = gPosInfo->second;
                    auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
                    if (pPosInfo != p.unpairedPosArr.end())
                    {
                        auto &pUnpairInfo = pPosInfo->second;
                        for (auto &rn : gUnpairInfo.readNameSet) // 遍历每一个readname，看是否有匹配的
                        {
                            if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end())
                            {
                                auto pe = g.unpairedDic[rn].unpairedRE;
                                auto fe = p.unpairedDic[rn].unpairedRE;
                                modifyPairedEnds(fe, &pe);
                                prevPairArr.push_back(pe);
                                g.unpairedDic.erase(rn);
                                p.unpairedDic.erase(rn);
                                // cout << "找到了！" << rn << endl;
                            }
                        }
                    }
                }
                recalcPos[ck] = prevPosInfo.taskSeq;
                std::sort(prevPairArr.begin(), prevPairArr.end());
            }
            else // prevpos在交叉部分
            {
                if (nextPosKey > prevLastPos) // nextpos在交叉部分之后
                { // 第三种情况
                    if (nextUnpairInfoP != nullptr) // 且在pos点，next task有unpair，这样才把这些数据放到next task里
                    {
                        auto &nextPairArr = nextUnpairInfoP->pairArr;
                        nextPairArr.push_back(prevFragEnd);
                        auto &prevPairArr = prevUnpairInfoP->pairArr;
                        prevPairArr.push_back(prevFragEnd);
                        if (addDataToPos)
                        {
                            getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
                        }
                        recalcPos[ck] = nextPosInfo.taskSeq; // 将数据放到next task里, （这个位点以后会可能还会计算到，目前方案是都计算，只是把冗余剔除）
                        std::sort(prevPairArr.begin(), prevPairArr.end());
                    }
                    else // next task在该位点没有unpair，那就把数据放到prev task里
                    {
                        auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
                        prevPairArr.push_back(prevFragEnd);
                        if (addDataToPos) // 第二种情况
                            getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
                        recalcPos[ck] = prevPosInfo.taskSeq;
                        std::sort(prevPairArr.begin(), prevPairArr.end());
                    }
                }
                else
                { // 第四种情况
                    if (prevUnpairInfoP == nullptr) {
                        prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
                        prevUnpairInfoP->taskSeq = lp.taskSeq;
                    }
                    auto &prevPairArr = prevUnpairInfoP->pairArr;
                    prevPairArr.push_back(prevFragEnd);
                    if (addDataToPos)
                    {
                        getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
                        getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
                    }
                    recalcPos[ck] = prevPosInfo.taskSeq;
                    std::sort(prevPairArr.begin(), prevPairArr.end());
                }
            }
            p.unpairedDic.erase(readName); // 在next task里删除该read
        }
        else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
        {
            auto &remainPosInfo = g.unpairedDic[readName];
            auto remainFragEnd = remainPosInfo.unpairedRE;
            int64_t remainPosKey = remainFragEnd.posKey;
            modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下，poskey可能是后面的read
            auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
            auto &remainPairArr = remainUnpairInfo.pairArr;
            remainPairArr.push_back(remainFragEnd);
            CalcKey ck = {remainPosKey, prevFragEnd.posKey};
            recalcPos[ck] = remainPosInfo.taskSeq;
            std::sort(remainPairArr.begin(), remainPairArr.end());
            g.unpairedDic.erase(readName);
        }
        else // 都没找到，那就保存到遗留数据里
        {
            int64_t prevPosKey = prevFragEnd.posKey;
            g.unpairedDic.insert(prevUnpair);
            addToGlobal.insert(prevPosKey);
        }
    }
    for (auto posKey : addToGlobal) // 最后再添加，以防开始赋值，后来这个位置要是又添加了新的数据
        g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
    map<int64_t, TaskSeqDupInfo> taskChanged;
    set<int64_t> posProcessed;
    for (auto &e : recalcPos)
    {
        auto posKey = e.first.read1Pos;
        if (posProcessed.find(posKey) != posProcessed.end())
            continue;
        posProcessed.insert(posKey);
        auto taskSeq = e.second;
        auto &t = taskChanged[taskSeq];
        // 在对应的任务包含的dup idx里修改结果数据
        vector<ReadEnds> *pairArrP = nullptr;
        if (taskSeq < lp.taskSeq)
            pairArrP = &g.unpairedPosArr[posKey].pairArr;
        else
            pairArrP = &lp.unpairedPosArr[posKey].pairArr;
        processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
        if (taskSeq < lp.taskSeq)
            g.unpairedPosArr.erase(posKey);
    }
    // 更新结果
    for (auto &e : taskChanged)
    {
        auto taskSeq = e.first;
        auto &t = e.second;
        if (taskSeq < lp.taskSeq)
        {
            refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
                              g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
        }
        else if (taskSeq == lp.taskSeq)
        {
            refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
        }
        else
        {
            refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
        }
    }
    // cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
    // cout << "calc g time: " << t.seconds_elapsed() << " s" << endl;
    // 将dupidx放进全局数据
    g.latterDupIdxArr.push_back(set<int64_t>());
    g.latterOpticalDupIdxArr.push_back(set<int64_t>());
    g.latterNotDupIdxArr.push_back(set<int64_t>());
    g.dupIdxArr.push_back(vector<int64_t>());
    auto &vIdx = g.dupIdxArr.back();
    lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
    vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
    g.opticalDupIdxArr.push_back(vector<int64_t>());
    auto &vOpticalIdx = g.opticalDupIdxArr.back();
    vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
 }
 /* 当所有任务结束后，global data里还有未处理的数据 */
 static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg)
 {
    auto &lp = *task;
    auto &g = *gDataArg;
    // 遗留的未匹配的pair
    for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
    {
        auto &readName = prevUnpair.first;
        auto &prevPosInfo = prevUnpair.second;
        auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
        if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
        {
            auto &remainPosInfo = g.unpairedDic[readName];
            auto remainFragEnd = remainPosInfo.unpairedRE;
            int64_t remainPosKey = remainFragEnd.posKey;
            modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下，poskey可能是后面的read
            auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
            remainUnpairInfo.pairArr.push_back(remainFragEnd);
            g.unpairedDic.erase(readName);
        }
    }
    map<int64_t, TaskSeqDupInfo> taskChanged;
    for (auto &e : g.unpairedPosArr)
    {
        auto posKey = e.first;
        auto taskSeq = e.second.taskSeq;
        auto &t = taskChanged[taskSeq];
        auto &arr = g.unpairedPosArr[posKey].pairArr;
        if (arr.size() > 1)
        {
            std::sort(arr.begin(), arr.end());
            processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
        }
    }
    // 更新结果
    vector<int64_t> addDup;
    map<int64_t, int64_t> ndPosVal;
    for (auto &e : taskChanged)
    {
        auto taskSeq = e.first;
        auto &t = e.second;
        refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
                          g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
    }
    cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
    g.unpairedPosArr.clear();
    g.unpairedDic.clear();
    // 将dupidx放进全局数据
    for (int i = 0; i < g.dupIdxArr.size() - 1; ++i)
        refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
    for (int i = 0; i < g.opticalDupIdxArr.size() - 1; ++i)
        refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
    g.dupIdxArr.push_back(vector<int64_t>());
    auto &vIdx = g.dupIdxArr.back();
    lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
    vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
    g.opticalDupIdxArr.push_back(vector<int64_t>());
    auto &vOpticalIdx = g.opticalDupIdxArr.back();
    vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
 }
 /* 串行处理数据，标记冗余 */
 static void serialMarkDups()
 {
    tm_arr[5].acc_start();
    Timer::log_time("serial start");
    // 读取缓存初始化
    BamBufType inBamBuf(g_gArg.use_asyncio);
    inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
    // BamBufType inBamBuf(false);
    // inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
    int64_t processedBamNum = 0;
    SerailMarkDupArg smdArg1, smdArg2;
    SerailMarkDupArg *lastArgP = &smdArg1;
    SerailMarkDupArg *curArgP = &smdArg2;
    bool isFirstRound = true;
    int roundNum = 0;
    while (inBamBuf.ReadStat() >= 0)
    {
        Timer t_round;
        // 读取bam文件中的read
        tm_arr[4].acc_start();
        size_t readNum = inBamBuf.ReadBam();
        tm_arr[4].acc_end();
        cout << "read num: " << readNum << endl;
        // lastArgP = curArgP;
        tm_arr[6].acc_start();
        curArgP->taskSeq = roundNum;
        curArgP->bamStartIdx = processedBamNum;
        curArgP->bams = inBamBuf.GetBamArr();
        tm_arr[6].acc_end();
        tm_arr[0].acc_start();
        Timer t1;
        generateReadEnds(curArgP);
        //cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
        tm_arr[0].acc_end();
        tm_arr[1].acc_start();
        t1.reinit();
        markdups(curArgP);
        //cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
        tm_arr[1].acc_end();
        if (!isFirstRound)
        {
            tm_arr[2].acc_start();
            t1.reinit();
            handleIntersectData(lastArgP, curArgP, &gData);
            //cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
            // addTaskIdxToSet(lastArgP, &gData);
            tm_arr[2].acc_end();
        }
        else
        {
            isFirstRound = false;
        }
        inBamBuf.ClearAll(); // 清理上一轮读入的数据
        processedBamNum += readNum;
        // 交换
        auto tmp = lastArgP;
        lastArgP = curArgP;
        curArgP = tmp;
        cout << "round time: " << t_round.seconds_elapsed() << endl;
        roundNum++;
        if (roundNum > 9){
 //            break;
        }
    }
    // cout << "here" << endl;
    tm_arr[3].acc_start();
    // 处理剩下的全局数据
    handleLastTask(lastArgP, &gData);
    // cout << "here 2" << endl;
    tm_arr[3].acc_end();
    tm_arr[5].acc_end();
    // 统计所有冗余index数量
    int64_t dupNum = 0;
    set<int64_t> dup;
    // int taskSeq = 0;
    // for (auto &arr : gData.dupIdxArr)
    // {
    //     for (auto idx : arr) {
    //         if (dup.find(idx) != dup.end())
    //         {
    //             cout << "dup index: " << taskSeq << '\t' << idx << endl;
    //         }
    //         dup.insert(idx);
    //     }
    //     taskSeq++;
    // }
    // #include <fstream>
    // ofstream out("tumor_dup.txt");
    // for (auto idx : dup)
    // {
    //     out << idx << endl;
    // }
    // out.close();
    for (auto &arr : gData.dupIdxArr)
        dupNum += arr.size();
    cout << "dup num     : " << dupNum << '\t' << dup.size() << endl;
    cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
    cout << "markdup     : " << tm_arr[1].acc_seconds_elapsed() << endl;
    cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
    cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
    cout << "read bam    : " << tm_arr[4].acc_seconds_elapsed() << endl;
    cout << "new arg     : " << tm_arr[6].acc_seconds_elapsed() << endl;
    cout << "del arg     : " << tm_arr[7].acc_seconds_elapsed() << endl;
    cout << "build ends  : " << tm_arr[8].acc_seconds_elapsed() << endl;
    cout << "sort frags  : " << tm_arr[9].acc_seconds_elapsed() << endl;
    cout << "sort pairs  : " << tm_arr[10].acc_seconds_elapsed() << endl;
    cout << "all         : " << tm_arr[5].acc_seconds_elapsed() << endl;
    Timer::log_time("serial end  ");
    //for (auto i : gData.dupArr)
    //    cout << i << endl;
 }
--- a/src/sam/markdups/shared_args.h
+++ b/src/sam/markdups/shared_args.h
@ -0,0 +1,26 @@
 #pragma once
 #include <common/utils/timer.h>
 #include <common/utils/util.h>
 #include <htslib/sam.h>
 #include <htslib/thread_pool.h>
 #include <sam/utils/read_ends.h>
 #include <sam/utils/read_name_parser.h>
 extern Timer tm_arr[20];  // 用来测试性能
 /* 全局本地变量 */
 extern vector<ReadNameParser> g_vRnParser;  // 每个线程一个read name parser
 extern samFile *g_inBamFp;                  // 输入的bam文件
 extern sam_hdr_t *g_inBamHeader;            // 输入的bam文件头信息
 extern samFile *g_outBamFp;       // 输出文件, sam或者bam格式
 extern sam_hdr_t *g_outBamHeader;           // 输出文件的header
 /* 参数对象作为全局对象，免得多次作为参数传入函数中 */
 class GlobalArg;
 extern GlobalArg &g_gArg;
 class MarkDupsArg;
 extern MarkDupsArg &g_mdArg;
 class GlobalDataArg;
 extern GlobalDataArg &gData;
 class DuplicationMetrics;
 extern DuplicationMetrics &gMetrics;
--- a/src/sam/utils/read_ends.h
+++ b/src/sam/utils/read_ends.h
@ -1,5 +1,6 @@
 /*
-Description: read ends结构体主要用来标记冗余，包含一些序列的测序过程中的物理信息等
+Description: read
 ends结构体主要用来标记冗余，包含一些序列的测序过程中的物理信息等
 Copyright : All right reserved by ICT
@ -13,18 +14,20 @@ Date : 2023/11/3
 #include <stdint.h>
 /**
- * Small interface that provides access to the physical location information about a cluster.
+ * Small interface that provides access to the physical location information
- * All values should be defaulted to -1 if unavailable.  ReadGroup and Tile should only allow
+ * about a cluster. All values should be defaulted to -1 if unavailable.
- * non-zero positive integers, x and y coordinates may be negative.
+ * ReadGroup and Tile should only allow non-zero positive integers, x and y
 * coordinates may be negative.
 */
-struct PhysicalLocation
+struct PhysicalLocation {
-{
+    static const int NO_VALUE = -1;
    /**
-     * Small class that provides access to the physical location information about a cluster.
+     * Small class that provides access to the physical location information
-     * All values should be defaulted to -1 if unavailable.  Tile should only allow
+     * about a cluster. All values should be defaulted to -1 if unavailable.
-     * non-zero positive integers, x and y coordinates must be non-negative.
+     * Tile should only allow non-zero positive integers, x and y coordinates
-     * This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts
+     * must be non-negative. This is different from PhysicalLocationShort in
-     * thus, they do not overflow within a HiSeqX tile.
+     * that the x and y positions are ints, not shorts thus, they do not
     * overflow within a HiSeqX tile.
     */
    int16_t tile = -1;
    int32_t x = -1;
@ -32,28 +35,33 @@ struct PhysicalLocation
 };
 /* 包含了所有read ends信息，如picard里边的 ReadEndsForMarkDuplicates*/
-struct ReadEnds : PhysicalLocation
+struct ReadEnds : PhysicalLocation {
 {
    static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;
    /* 保留一些bam记录中的数据 */
    bool read1FirstOfPair = true;
    /* ReadEnds中的成员变量 */
-    /** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */
+    /** Little struct-like class to hold read pair (and fragment) end data for
     * duplicate marking. */
    // int16_t libraryId; // 没用，不考虑多样本
    int8_t orientation = -1;
    int32_t read1ReferenceIndex = -1;
    int32_t read1Coordinate = -1;
    int32_t read2ReferenceIndex = -1;
-    int32_t read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
+    // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
    int32_t read2Coordinate = -1;
    /* Additional information used to detect optical dupes */
-    // int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read group，normal或者tumor
+    // int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read
-    /** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */
+    // group，normal或者tumor
    /** For optical duplicate detection the orientation matters regard to 1st or
     * 2nd end of a mate */
    int8_t orientationForOpticalDuplicates = -1;
-    /** A *transient* flag marking this read end as being an optical duplicate. */
+    /** A *transient* flag marking this read end as being an optical duplicate.
     */
    bool isOpticalDuplicate = false;
    /* ReadEndsForMarkDuplicates中的成员变量 */
-    /** Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar **/
+    /** Little struct-like class to hold read pair (and fragment) end data for
     * MarkDuplicatesWithMateCigar **/
    int16_t score = 0;
    int64_t read1IndexInFile = -1;
    int64_t read2IndexInFile = -1;
@ -62,23 +70,22 @@ struct ReadEnds : PhysicalLocation
    /* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */
    // int32_t barcode = 0; // primary barcode for this read (and pair)
    // int32_t readOneBarcode = 0; // read one barcode, 0 if not present
-    // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not paired
+    // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not
    // paired
    /* zzh增加的成员变量 */
-    int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
+    int64_t posKey = -1;  // 根据位置信息生成的关键字 return (int64_t)tid <<
                          // MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
    /* 根据pairend read的比对方向，来确定整体的比对方向 */
-    static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand)
+    static int8_t GetOrientationByte(bool read1NegativeStrand,
-    {
+                                     bool read2NegativeStrand) {
-        if (read1NegativeStrand)
+        if (read1NegativeStrand) {
        {
            if (read2NegativeStrand)
                return RR;
            else
                return RF;
-        }
+        } else {
        else
        {
            if (read2NegativeStrand)
                return FR;
            else
@ -87,47 +94,38 @@ struct ReadEnds : PhysicalLocation
    }
    /* 比较两个readends是否一样（有个冗余） */
-    static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2)
+    static bool AreComparableForDuplicates(const ReadEnds &lhs,
-    {
+                                           const ReadEnds &rhs,
                                           bool compareRead2) {
        bool areComparable = true;
        areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
                        lhs.read1Coordinate == rhs.read1Coordinate &&
                        lhs.orientation == rhs.orientation;
-        if (areComparable && compareRead2)
+        if (areComparable && compareRead2) {
-        {
+            areComparable =
-            areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
+                lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
                lhs.read2Coordinate == rhs.read2Coordinate;
        }
        return areComparable;
    }
    /* 比对方向是否正向 */
-    bool IsPositiveStrand() const
+    bool IsPositiveStrand() const { return orientation == F; }
    {
        return orientation == F;
    }
    /* pairend是否合适的比对上了 */
-    bool IsPaired() const
+    bool IsPaired() const { return read2ReferenceIndex != -1; }
    {
        return read2ReferenceIndex != -1;
    }
-    bool IsNegativeStrand() const
+    bool IsNegativeStrand() const { return orientation == R; }
    {
        return orientation == R;
    }
    // 对于相交的数据进行比对，a是否小于b，根据AreComparableForDuplicates函数得来
-    static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false)
+    static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b,
-    {
+                                      bool compareRead2 = false) {
        int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
        if (comp == 0)
            comp = a.read1Coordinate - b.read1Coordinate;
        if (comp == 0)
            comp = a.orientation - b.orientation;
-        if (compareRead2)
+        if (compareRead2) {
        {
            if (comp == 0)
                comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
            if (comp == 0)
@ -137,14 +135,12 @@ struct ReadEnds : PhysicalLocation
    }
    // 找某一个位置的所有readend时需要
-    static bool pairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs)
+    static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) {
    {
        return ReadLittleThan(lhs, rhs, true);
    }
    // 比较函数
-    bool operator < (const ReadEnds &o) const
+    bool operator<(const ReadEnds &o) const {
    {
        int comp = read1ReferenceIndex - o.read1ReferenceIndex;
        if (comp == 0)
            comp = read1Coordinate - o.read1Coordinate;
--- a/src/sam/utils/read_name_parser.h
+++ b/src/sam/utils/read_name_parser.h
@ -9,11 +9,12 @@ Date : 2023/11/6
 #ifndef READ_NAME_PARSER_H_
 #define READ_NAME_PARSER_H_
 #include "read_ends.h"
 #include <common/utils/util.h>
 #include <stdint.h>
 #include <string>
 #include "read_ends.h"
 // #include <regex>
 #include <boost/regex.hpp>
@ -24,25 +25,30 @@ using std::string;
 /**
 * Provides access to the physical location information about a cluster.
- * All values should be defaulted to -1 if unavailable.  ReadGroup and Tile should only allow
+ * All values should be defaulted to -1 if unavailable.  ReadGroup and Tile
- * non-zero positive integers, x and y coordinates may be negative.
+ * should only allow non-zero positive integers, x and y coordinates may be
- * 非线程安全
+ * negative. 非线程安全
 */
-struct ReadNameParser
+struct ReadNameParser {
 {
    /**
-     * The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
+     * The read name regular expression (regex) is used to extract three pieces
-     * and y location.  Any read name regex should parse the read name to produce these and only these values.  An example regex is:
+     * of information from the read name: tile, x location, and y location.  Any
     * read name regex should parse the read name to produce these and only
     * these values.  An example regex is:
     *  (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
-     * which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
+     * which assumes that fields in the read name are delimited by ':' and the
-     * ignoring any trailing non-digit characters.
+     * last three fields correspond to the tile, x and y locations, ignoring any
     * trailing non-digit characters.
     *
-     * The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
+     * The default regex is optimized for fast parsing (see {@link
-     * three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'.  This should consider correctly read names
+     * #getLastThreeFields(String, char, int[])}) by searching for the last
-     * where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
+     * three fields, ignoring any trailing non-digit characters, assuming the
-     * Illumina technology.
+     * delimiter ':'.  This should consider correctly read names where we have 5
     * or 7 field with the last three fields being tile/x/y, as is the case for
     * the majority of read names produced by Illumina technology.
     */
-    const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
+    const string DEFAULT_READ_NAME_REGEX =
        "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
    string readNameStored = "";
    PhysicalLocation physicalLocationStored;
@ -53,74 +59,73 @@ struct ReadNameParser
    bool warnedAboutRegexNotMatching = true;
    ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
-    ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
+    ReadNameParser(const string &strReadNameRegex)
-    ReadNameParser(const string &strReadNameRegex, bool isWarn)
+        : ReadNameParser(strReadNameRegex, true) {}
-    {
+    ReadNameParser(const string &strReadNameRegex, bool isWarn) {
        readNameRegex = strReadNameRegex;
        if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
            useOptimizedDefaultParsing = true;
        else
            useOptimizedDefaultParsing = false;
-        readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
+        readNamePattern =
            boost::regex(strReadNameRegex, boost::regex_constants::optimize);
        warnedAboutRegexNotMatching = isWarn;
    }
    /* 重新设置readNameRegex */
-    void SetReadNameRegex(const string &strReadNameRegex)
+    void SetReadNameRegex(const string &strReadNameRegex) {
    {
        readNameRegex = strReadNameRegex;
        if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
            useOptimizedDefaultParsing = true;
        else
            useOptimizedDefaultParsing = false;
-        readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
+        readNamePattern =
            boost::regex(strReadNameRegex, boost::regex_constants::optimize);
        // readNamePattern = strReadNameRegex;
    }
    /* 添加测序时候的tile x y 信息 */
-    bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
+    bool AddLocationInformation(const string &readName, PhysicalLocation *loc) {
-    {
+        if (!(readName == readNameStored)) {
-        if (!(readName == readNameStored))
+            if (ReadLocationInformation(readName, loc)) {
        {
            if (ReadLocationInformation(readName, loc))
            {
                readNameStored = readName;
                physicalLocationStored = *loc;
                return true;
            }
            // return false if read name cannot be parsed
            return false;
-        }
+        } else {
        else
        {
            *loc = physicalLocationStored;
            return true;
        }
    }
    /**
-     * Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
+     * Method used to extract tile/x/y from the read name and add it to the
-     * can be used later to determine optical duplication
+     * PhysicalLocationShort so that it can be used later to determine optical
     * duplication
     *
     * @param readName the name of the read/cluster
     * @param loc the object to add tile/x/y to
-     * @return true if the read name contained the information in parsable form, false otherwise
+     * @return true if the read name contained the information in parsable form,
     * false otherwise
     */
-    bool ReadLocationInformation(const string &readName, PhysicalLocation *loc) 
+    bool ReadLocationInformation(const string &readName,
-    {
+                                 PhysicalLocation *loc) {
        try {
-            // Optimized version if using the default read name regex (== used on purpose):
+            // Optimized version if using the default read name regex (== used
-            if (useOptimizedDefaultParsing)
+            // on purpose):
-            {
+            if (useOptimizedDefaultParsing) {
                const int fields = getLastThreeFields(readName, ':');
-                if (!(fields == 5 || fields == 7))
+                if (!(fields == 5 || fields == 7)) {
-                {
+                    if (warnedAboutRegexNotMatching) {
                    if (warnedAboutRegexNotMatching)
                    {
                        Warn(
-                            "Default READ_NAME_REGEX '%s' did not match read name '%s'."
+                            "Default READ_NAME_REGEX '%s' did not match read "
-                            "You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates.  "
+                            "name '%s'."
-                            "Note that this message will not be emitted again even if other read names do not match the regex.",
+                            "You may need to specify a READ_NAME_REGEX in "
                            "order to correctly identify optical duplicates.  "
                            "Note that this message will not be emitted again "
                            "even if other read names do not match the regex.",
                            readNameRegex.c_str(), readName.c_str());
                        warnedAboutRegexNotMatching = false;
                    }
@ -130,13 +135,9 @@ struct ReadNameParser
                loc->x = tmpLocationFields[1];
                loc->y = tmpLocationFields[2];
                return true;
-            }
+            } else if (readNameRegex.empty()) {
            else if (readNameRegex.empty())
            {
                return false;
-            }
+            } else {
            else
            {
                // Standard version that will use the regex
                cmatch m;
                if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
@ -144,28 +145,25 @@ struct ReadNameParser
                    loc->x = std::stoi(m[2].str());
                    loc->y = std::stoi(m[3].str());
                    return true;
-                }
+                } else {
-                else
+                    if (warnedAboutRegexNotMatching) {
                {
                    if (warnedAboutRegexNotMatching)
                    {
                        Warn(
                            "READ_NAME_REGEX '%s' did not match read name '%s'."
                            "Your regex may not be correct.  "
-                            "Note that this message will not be emitted again even if other read names do not match the regex.",
+                            "Note that this message will not be emitted again "
                            "even if other read names do not match the regex.",
                            readNameRegex.c_str(), readName.c_str());
                        warnedAboutRegexNotMatching = false;
                    }
                    return false;
                }
            }
-        }
+        } catch (const std::runtime_error &e) {
-        catch (const std::runtime_error &e)
+            if (warnedAboutRegexNotMatching) {
        {
            if (warnedAboutRegexNotMatching) 
            {
                Warn(
-                    "A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
+                    "A field parsed out of a read name was expected to contain "
                    "an integer and did not. READ_NAME_REGEX: %s; Read name: "
                    "%s; Error Msg: %s",
                    readNameRegex.c_str(), readName.c_str(), e.what());
                warnedAboutRegexNotMatching = false;
            }
@ -175,39 +173,39 @@ struct ReadNameParser
    }
    /**
-     * Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers.  Parsing a field
+     * Given a string, splits the string by the delimiter, and returns the the
-     * considers only a sequence of digits up until the first non-digit character.  The three values are stored in the passed-in array.
+     * last three fields parsed as integers.  Parsing a field considers only a
     * sequence of digits up until the first non-digit character.  The three
     * values are stored in the passed-in array.
     *
-     * @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
+     * @throws NumberFormatException if any of the tokens that should contain
     * numbers do not start with parsable numbers
     */
-    int getLastThreeFields(const string &readName, char delim)
+    int getLastThreeFields(const string &readName, char delim) {
    {
        int tokensIdx = 2;  // start at the last token
        int numFields = 0;
        int i, endIdx;
        endIdx = readName.size();
        // find the last three tokens only
-        for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
+        for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) {
-        {
+            if (readName.at(i) == delim || 0 == i) {
            if (readName.at(i) == delim || 0 == i)
            {
                numFields++;
                const int startIdx = (0 == i) ? 0 : (i + 1);
-                tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
+                tmpLocationFields[tokensIdx] =
                    std::stoi(readName.substr(startIdx, endIdx - startIdx));
                tokensIdx--;
                endIdx = i;
            }
        }
        // continue to find the # of fields
-        while (0 <= i) 
+        while (0 <= i) {
        {
            if (readName.at(i) == delim || 0 == i)
                numFields++;
            i--;
        }
-        if (numFields < 3)
+        if (numFields < 3) {
-        {
+            tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] =
-            tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
+                -1;
            numFields = -1;
        }