picard_cpp/src/sam/markdups/markdups.cpp

/*
Description: 标记bam文件中的冗余信息，只处理按照坐标排序后的bam，且bam为单一样本数据

Copyright : All rights reserved by ICT

Author : Zhang Zhonghai
Date : 2023/10/23
*/
#include "markdups_arg.h"
// 有太多define冲突，放到最后include


#include <common/hts/bam_buf.h>
#include <common/utils/global_arg.h>
#include <common/utils/thpool.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <common/utils/murmur3.h>
#include <common/utils/yarn.h>
#include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h>

#include <htslib/sam.h>
#include <htslib/thread_pool.h>

#include <iostream>
#include <vector>
#include <set>
#include <queue>
#include <unordered_map>


using namespace std;
using std::cout;

#define SMA_TAG_PG "PG"

// Parenthesized so the macro keeps its value when expanded inside a larger expression.
#define BAM_BLOCK_SIZE (2 * 1024 * 1024)
#define NO_SUCH_INDEX INT64_MAX

static Timer tm_arr[10]; // timers used for performance measurements

/* Forward declaration (the type is defined with the `struct` class-key further down) */
struct ThMarkDupArg;

/* File-local global state */
static queue<ThMarkDupArg *> g_qpThMarkDupArg;   // queue of per-job argument objects, in submission order
static lock_t *g_queueFirstLock = NEW_LOCK(-1);  // guards the queue; workers twist it to the seq of a finished head job
static lock_t *g_readyToReadLock = NEW_LOCK(-1); // twisted to 1 by the writer thread to let the main thread read again
static vector<ReadNameParser> g_vRnParser;       // one read-name parser per worker thread
static int g_numDuplicateIndices = 0;            // total number of duplicate reads found
static samFile *g_outBamFp = nullptr;            // output file (SAM or BAM)
static sam_hdr_t *g_outBamHeader;                // header of the output file
static int g_maxJobNum = 0;                      // number of jobs the first batch of reads is divided into
static int g_jobNumForRead = 0;                  // once the queue shrinks to this size the next read round may start
static volatile int64_t g_bamLoadedNum = 0;      // total number of reads loaded so far
static volatile int64_t g_bamWritenNum = 0;      // total number of reads the writer thread has finished with
static vector<int64_t> g_vDupIdx;                // duplicate indices computed inside the worker threads
static vector<int64_t> g_vOpticalDupIdx;
static set<int64_t> g_sDupIdxLatter;
static set<int64_t> g_sOpticalDupIdxLatter;

/* Argument objects are kept global so they need not be passed to every function */
static GlobalArg &g_gArg = GlobalArg::Instance();
static MarkDupsArg g_mdArg;

/*
 * Compute the duplicate-selection score of a single read according to
 * g_mdArg.DUPLICATE_SCORING_STRATEGY.
 *
 * The per-read score is capped well below INT16_MAX so that the scores of the
 * two mates of a pair can be added without overflowing an int16_t.
 */
static int16_t computeDuplicateScore(BamWrap &bw)
{
    int16_t result = 0;
    switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
    {
    case ns_md::SUM_OF_BASE_QUALITIES:
        // Two (very) long reads worth of high-quality bases could exceed INT16_MAX / 2,
        // so the sum is clamped.
        result += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
        break;
    case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
        // Unmapped reads keep a score of 0; the mechanism is symmetric between mates.
        if (!bw.GetReadUnmappedFlag())
        {
            result = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
        }
        break;
    case ns_md::RANDOM:
    {
        // Hashing the read name gives both mates the same pseudo-random value (low 14 bits),
        // so they are filtered together.
        const short hashBits = (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
        result += hashBits;
        // Shift by -INT16_MIN / 4 so the value stays positive yet far from overflow when
        // the two mate scores are summed and a vendor-failure discount is applied.
        result -= INT16_MIN / 4;
        break;
    }
    default:
        break;
    }

    // Heavily discount reads failing the vendor quality check. The discount may be applied
    // once per mate, so it must not exceed INT16_MIN overall.
    if (bw.GetReadFailsVendorQualityCheckFlag())
    {
        result += (int16_t)(INT16_MIN / 2);
    }
    return result;
}

/*
 * Fill *pKey with the ReadEnds description of a single read.
 *
 * bw       - the read to describe
 * index    - index of the read within the whole input file
 * rnParser - parser used to add the location fields taken from the read name
 * pKey     - output object; its read1* fields, orientation, score and posKey are set
 */
static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
{
    const auto &core = bw.b->core;
    const bool onReverseStrand = (core.flag & BAM_FREVERSE) != 0;

    pKey->read1ReferenceIndex = core.tid;
    if (onReverseStrand)
    {
        // Reverse-strand reads are anchored at their unclipped end.
        pKey->read1Coordinate = bw.GetUnclippedEnd();
        pKey->orientation = ReadEnds::R;
    }
    else
    {
        pKey->read1Coordinate = bw.GetUnclippedStart();
        pKey->orientation = ReadEnds::F;
    }
    pKey->read1IndexInFile = index;
    pKey->score = computeDuplicateScore(bw);

    // Setting read2ReferenceIndex lets the ends object know that it is part of a pair.
    if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag())
        pKey->read2ReferenceIndex = core.mtid;

    // Fill in the location information used for optical duplicates.
    rnParser.AddLocationInformation(bw.query_name(), pKey);

    // Position key derived from the read1 reference index and coordinate.
    pKey->posKey = BamWrap::bam_global_pos(pKey->read1ReferenceIndex, pKey->read1Coordinate);
}

/**
 * Intended behaviour: take a list of ReadEnds objects and identify the representative read
 * based on quality score. For all members of the duplicate set, add the read1 index-in-file
 * of the representative read to the records of the first and second in a pair. This value
 * is used for the 'DI' tag.
 *
 * NOTE: the body is currently empty, so calling this function has no effect.
 */
static void addRepresentativeReadIndex(vector<ReadEnds *> &vpRe)
{
    // Not implemented yet.

}

/*
 * Mark duplicates within one group of paired ReadEnds.
 *
 * vpRe            - the group of candidate pairs
 * psDupIdx        - receives the file indices of both reads of every non-best pair
 * psOpticalDupIdx - reserved for optical duplicates; not written by the current code
 *
 * Groups with fewer than two members contain no duplicates and are left untouched.
 */
static void markDuplicatePairs(vector<ReadEnds *> &vpRe, set<int64_t> *psDupIdx, set<int64_t> *psOpticalDupIdx)
{
    const size_t groupSize = vpRe.size();
    if (groupSize < 2)
    {
        // A singleton would be counted here: addSingletonToCount(libraryIdGenerator);
        return;
    }

    /** All read ends should have orientation FF, FR, RF, or RR **/
    // The first entry with the strictly highest score is kept as the representative.
    ReadEnds *winner = vpRe[0];
    for (size_t i = 1; i < groupSize; ++i)
    {
        ReadEnds *candidate = vpRe[i];
        if (candidate->score > winner->score)
            winner = candidate;
    }

    if (!g_mdArg.READ_NAME_REGEX.empty()) // optical duplicate detection
    {
        // trackOpticalDuplicates
    }

    // Every pair other than the representative is a duplicate.
    for (ReadEnds *candidate : vpRe)
    {
        if (candidate == winner)
            continue;
        psDupIdx->insert(candidate->read1IndexInFile); // read1
        if (candidate->read2IndexInFile != candidate->read1IndexInFile)
            psDupIdx->insert(candidate->read2IndexInFile); // read2
    }

    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
        addRepresentativeReadIndex(vpRe);
}

/*
 * Mark duplicates within one group of fragment ReadEnds.
 *
 * vpRe            - the group of candidate read ends
 * containsPairs   - true when the group contains at least one paired entry
 * psDupIdx        - receives the file indices of the reads marked as duplicates
 * psOpticalDupIdx - reserved for optical duplicates; not written by the current code
 *
 * When the group contains pairs, every unpaired entry is a duplicate. Otherwise the
 * first entry with the highest score is kept and all others are duplicates.
 */
static void markDuplicateFragments(vector<ReadEnds *> &vpRe,
                                   bool containsPairs,
                                   set<int64_t> *psDupIdx,
                                   set<int64_t> *psOpticalDupIdx)
{
    if (containsPairs)
    {
        for (ReadEnds *re : vpRe)
        {
            if (re->IsPaired())
                continue;
            psDupIdx->insert(re->read1IndexInFile);
        }
        return;
    }

    if (vpRe.empty())
        return;

    // Only fragments: keep the first entry with the strictly highest score.
    ReadEnds *winner = vpRe.front();
    for (auto it = vpRe.begin() + 1; it != vpRe.end(); ++it)
    {
        if ((*it)->score > winner->score)
            winner = *it;
    }

    for (ReadEnds *re : vpRe)
    {
        if (re != winner)
            psDupIdx->insert(re->read1IndexInFile);
    }
}

/*
 * Per-job argument/result object shared between the main thread (creates it),
 * a worker running thread_markdups (fills it) and thread_write (consumes it).
 * The object is aggregate-initialized by the creator, so the member order is significant.
 */
struct ThMarkDupArg
{
    int64_t bamStartIdx;                          // index within the whole input of the first read in vBam
    long seq;                                     // sequence number of this job among all jobs
    bool more;                                    // true when further jobs follow this one
    volatile bool finish;                         // set by the worker once this job has been processed
    vector<BamWrap *> vBam;                       // the reads this job has to process
    map<int64_t, vector<ReadEnds>> mvPair;        // completed pairs, keyed by position key
    map<int64_t, vector<ReadEnds>> mvFrag;        // every primary mapped read (paired ones included), keyed by position key
    map<int64_t, set<int64_t>> msDupIdx;              // indices of duplicate reads, keyed by position key
    map<int64_t, set<int64_t>> msOpticalDupIdx;       // indices of optical duplicate reads, keyed by position key
    unordered_map<string, ReadEnds> umReadEnds;   // read name -> read end still waiting for its mate
};

/*
 * 多线程查找和标记冗余函数
 */
void thread_markdups(void *arg, int tid)
{
    auto &p = *(ThMarkDupArg *)arg;

    /* 处理每个read，创建ReadEnd，并放入frag和pair中 */
    for (int i = 0; i < p.vBam.size(); ++i) // 循环处理每个read
    {
        BamWrap *bw = p.vBam[i];
        const int64_t bamIdx = p.bamStartIdx + i;
        if (bw->GetReadUnmappedFlag()) 
        {
            if (bw->b->core.tid == -1)
                // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
                break;
        }
        else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
        {
            ReadEnds fragEnd;
            buildReadEnds(*bw, bamIdx, g_vRnParser[tid], &fragEnd);
            p.mvFrag[fragEnd.posKey].push_back(fragEnd); // 添加进frag集合
            if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
            {
                string key = bw->query_name();
                if (p.umReadEnds.find(key) == p.umReadEnds.end())
                {
                    p.umReadEnds[key] = fragEnd;
                }
                else // 找到了pairend
                {
                    auto pairedEnds = p.umReadEnds.at(key);
                    p.umReadEnds.erase(key); // 删除找到的pairend
                    const int matesRefIndex = fragEnd.read1ReferenceIndex;
                    const int matesCoordinate = fragEnd.read1Coordinate;
                    // Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands.  NB: must do this
                    // before updating the orientation later.
                    if (bw->GetFirstOfPairFlag())
                    {
                        pairedEnds.orientationForOpticalDuplicates =
                            ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds::R);
                    }
                    else
                    {
                        pairedEnds.orientationForOpticalDuplicates = 
                            ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, bw->GetReadNegativeStrandFlag());
                    }
                    // If the other read is actually later, simply add the other read's data as read2, else flip the reads
                    if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
                        (matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
                    {
                        pairedEnds.read2ReferenceIndex = matesRefIndex;
                        pairedEnds.read2Coordinate = matesCoordinate;
                        pairedEnds.read2IndexInFile = bamIdx;
                        pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
                                                                             bw->GetReadNegativeStrandFlag());

                        // if the two read ends are in the same position, pointing in opposite directions,
                        // the orientation is undefined and the procedure above
                        // will depend on the order of the reads in the file.
                        // To avoid this, we set it explicitly (to FR):
                        if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
                            pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
                            pairedEnds.orientation == ReadEnds::RF)
                        {
                            pairedEnds.orientation = ReadEnds::FR;
                        }
                    }
                    else
                    {
                        pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
                        pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
                        pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
                        pairedEnds.read1ReferenceIndex = matesRefIndex;
                        pairedEnds.read1Coordinate = matesCoordinate;
                        pairedEnds.read1IndexInFile = bamIdx;
                        pairedEnds.orientation = ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(),
                                                                             pairedEnds.orientation == ReadEnds::R);
                    }

                    pairedEnds.score += computeDuplicateScore(*bw);
                    p.mvPair[pairedEnds.posKey].push_back(pairedEnds);
                }
            }
        }
    }
    /* generateDuplicateIndexes，计算冗余read在所有read中的位置索引 */
    // 先处理 pair
    int dupNum = 0;
    vector<ReadEnds *> vRePotentialDup; // 有可能是冗余的reads
    for (auto &e : p.mvPair) // 按比对的位置先后进行遍历
    {
        if (e.second.size() > 1) // 有潜在的冗余
        {
            vRePotentialDup.clear();
            ReadEnds *pReadEnd = nullptr;
            for (auto &re : e.second)
            {
                if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))
                    vRePotentialDup.push_back(&re);
                else
                {
                    markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
                    vRePotentialDup.clear();
                    vRePotentialDup.push_back(&re);
                    pReadEnd = &re;
                }
            }
            markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
        }
    }
    // 再处理frag
    bool containsPairs = false;
    bool containsFrags = false;
    for (auto &e : p.mvFrag)
    {
        if (e.second.size() > 1) // 有潜在的冗余
        {
            vRePotentialDup.clear();
            ReadEnds *pReadEnd = nullptr;
            for (auto &re : e.second)
            {
                if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
                {
                    vRePotentialDup.push_back(&re);
                    containsPairs = containsPairs || re.IsPaired();
                    containsFrags = containsFrags || !re.IsPaired();
                }
                else
                {
                    if (vRePotentialDup.size() > 1 && containsFrags)
                    {
                        markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
                    }
                    vRePotentialDup.clear();
                    vRePotentialDup.push_back(&re);
                    pReadEnd = &re;
                    containsPairs = re.IsPaired();
                    containsFrags = !re.IsPaired();
                }
            }
            if (vRePotentialDup.size() > 1 && containsFrags) {
                markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
            }
        }
    }

    // cout << tid << '\t' << "dup: " << dupNum << endl;
    // cout << tid << " all: no: " << p.vBam.size() << '\t' << p.umReadEnds.size() << endl;
    /* 本段数据处理完成，告诉输出线程 */
    POSSESS(g_queueFirstLock);
    p.finish = true;
    // cout << tid << ": process: " << p.seq << endl;
    auto front = g_qpThMarkDupArg.front();
    if (front->finish)
    {
        TWIST(g_queueFirstLock, TO, front->seq); // 通知写线程，当前队列头部完成的任务
    } else {
        RELEASE(g_queueFirstLock);
    }
}

/*
 * Writer-thread entry point. Consumes finished jobs from g_qpThMarkDupArg strictly in
 * submission order, merges the still-unpaired read ends of adjacent jobs, updates
 * g_bamWritenNum and signals the main thread through g_readyToReadLock when it may
 * read more data. Every consumed ThMarkDupArg is deleted here.
 *
 * The loop ends after the job whose `more` flag is false has been taken from the queue.
 */
void thread_write(void *)
{
    bool more = false;
    long seq = 0;
    long unPairedNum = 0;
    POSSESS(g_queueFirstLock);
    WAIT_FOR(g_queueFirstLock, TO_BE, seq++); // wait until the first job is finished
    auto lastP = g_qpThMarkDupArg.front();    // take the job at the head of the queue
    auto umUnpairedReadEnds = lastP->umReadEnds; // read ends that have not found their mate yet
    auto p = lastP;
    g_qpThMarkDupArg.pop(); // remove the head
    TWIST(g_queueFirstLock, TO, seq); // release the lock and arm it for the next job
    more = lastP->more; // is another job going to follow?
    while (more) // consume the remaining jobs one by one
    {
        POSSESS(g_queueFirstLock);
        if (g_qpThMarkDupArg.empty()) // the next job may not have been queued yet
        {
            RELEASE(g_queueFirstLock);
            continue;
        }
        WAIT_FOR(g_queueFirstLock, TO_BE, seq); // wait for the job to be finished
        p = g_qpThMarkDupArg.front();
        if (!p->finish) // woken by our own TWIST below rather than by the worker: the job is not done yet
        {
            TWIST(g_queueFirstLock, TO, -1); // -1 blocks the WAIT_FOR above until a worker publishes a seq
            continue;
        }
        g_qpThMarkDupArg.pop();
        TWIST(g_queueFirstLock, TO, seq + 1);

        /* Merge results: match this job's unpaired read ends against those carried over */
        for (auto &e : p->umReadEnds)
        {
            if (umUnpairedReadEnds.find(e.first) != umUnpairedReadEnds.end())
                umUnpairedReadEnds.erase(e.first); // mate found
            else
                umUnpairedReadEnds.insert(e); // still unpaired, carry it over
        }

        /* Account for the previous job and tell the main thread whether it may read again */
        POSSESS(g_readyToReadLock);
        g_bamWritenNum += lastP->vBam.size();
        if (g_qpThMarkDupArg.size() <= g_jobNumForRead)
        {
            TWIST(g_readyToReadLock, TO, 1);
        }
        else
        {
            RELEASE(g_readyToReadLock);
        }
        /* Prepare the next iteration */
        delete lastP;
        more = p->more;
        lastP = p;
        seq++;
    }
    unPairedNum = umUnpairedReadEnds.size();

    cout << "Finally unpaired read num: " << unPairedNum << endl;

    // Account for the final job
    POSSESS(g_readyToReadLock);
    g_bamWritenNum += lastP->vBam.size();
    TWIST(g_readyToReadLock, TO, 1);
    // The final job was popped from the queue above and its worker has finished with it,
    // so it is released here like every earlier job (it was previously leaked).
    delete lastP;
    pthread_exit(0);
}

/*
 * mark duplicate 入口，假定bam是按照比对后的坐标排序的，同一个样本的话不需要考虑barcode的问题
 */
int MarkDuplicates(int argc, char *argv[])
{
    Timer::log_time("程序开始");
    Timer time_all;
    
    /* 读取命令行参数 */
    g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
    if (g_gArg.num_threads < 1) // 线程数不能小于1
        g_gArg.num_threads = 1;

    /* 初始化一些参数和变量*/
    g_vRnParser.resize(g_gArg.num_threads);
    for (auto &parser : g_vRnParser)
        parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile，x，y信息
    
    /* 打开输入bam文件 */
    sam_hdr_t *inBamHeader;
    samFile *inBamFp;
    inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
    if (!inBamFp)
    {
        Error("[%s] load sam/bam file failed.\n", __func__);
        return -1;
    }
    hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    inBamHeader = sam_hdr_read(inBamFp); // 读取header

    /* 利用线程池对输入输出文件进行读写 */
    htsThreadPool htsPoolRead = {NULL, 0};  // 多线程读取，创建线程池
    htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
    htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
    htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
    if (!htsPoolRead.pool || !htsPoolWrite.pool)
    {
        Error("[%d] failed to set up thread pool", __LINE__);
        return -1;
    }
    hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);

    /* 初始化输出文件 */
    char modeout[12] = "wb";
    sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
    g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
    g_outBamHeader = sam_hdr_dup(inBamHeader);
    if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
    {
        Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
        sam_close(g_outBamFp);
        return -1;
    }
    hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
    hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件

    // /* 读取缓存初始化 */
    BamBufType inBamBuf(g_gArg.use_asyncio);
    inBamBuf.Init(inBamFp, inBamHeader, g_gArg.max_mem);

    /* 循环读入信息，并处理 */
    g_maxJobNum = g_gArg.num_threads * 10;
    // g_maxJobNum = g_gArg.num_threads * 3;
    g_jobNumForRead = g_gArg.num_threads * 2;

    int64_t x_all = 0; // for test
    int64_t jobSeq = 0;
    int64_t processedBamNum = 0; // 记录每个轮次累计处理的reads数量，用来计算每个read在整个文件中的索引位置
    threadpool thpool = thpool_init(g_gArg.num_threads); // 创建mark dup所需的线程池
    thread *writeth = LAUNCH(thread_write, nullptr);     // 启动处理结果的的线程
    int bamRemainSize = 0; // 上一轮还剩下的bam数量，包含已经在任务里的和没有放进任务的
    int numReadsForEachJob = 0; // 每个线程处理的read数量，第一次读取的时候进行设置
    int lastRoundUnProcessed = 0; // 上一轮没有放进任务里的read数量
    int curRoundProcessed = 0; // 这一轮放进任务的read数量
    while (inBamBuf.ReadStat() >= 0)
    {
        /* 读取bam文件中的read */
        int readNum = inBamBuf.ReadBam();
        if (numReadsForEachJob == 0)
            numReadsForEachJob = readNum / g_maxJobNum; // 第一次读取bam的时候进行设置
        g_bamLoadedNum += readNum;

        cout << readNum << endl; // 这一轮读取的bam数量

        /* 多线程处理 任务数是线程数的10倍 */
        tm_arr[0].acc_start();
        curRoundProcessed = 0; // 当前轮次已经处理的reads数量
        int numNeedToProcess = inBamBuf.Size() - bamRemainSize + lastRoundUnProcessed; // 当前需要处理的bam数量
        for (int i = 0; numNeedToProcess >= numReadsForEachJob; ++i) // 只有待处理的reads数量大于一次任务的数量时，新建任务
        {
            int startIdx = i * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
            int endIdx = (i + 1) * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;

            ThMarkDupArg *thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed,
                                                    jobSeq++,
                                                    true,
                                                    false,
                                                    inBamBuf.Slice(startIdx, endIdx)});
            POSSESS(g_queueFirstLock);                               // 加锁
            g_qpThMarkDupArg.push(thArg);                            // 将新任务需要的参数添加到队列
            RELEASE(g_queueFirstLock);                               // 解锁
            thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
            curRoundProcessed += endIdx - startIdx;
            numNeedToProcess -= numReadsForEachJob;
        }
        processedBamNum += curRoundProcessed;
        lastRoundUnProcessed = numNeedToProcess;

        /* 等待可以继续读取的信号 */
        POSSESS(g_readyToReadLock);
        WAIT_FOR(g_readyToReadLock, TO_BE, 1);
        bamRemainSize = g_bamLoadedNum - g_bamWritenNum;

        while (bamRemainSize >= inBamBuf.Size() / 2)
        { // 要保留的多于现在有的bam数量的一半，那就等待write线程继续处理
            TWIST(g_readyToReadLock, TO, 0);
            POSSESS(g_readyToReadLock);
            WAIT_FOR(g_readyToReadLock, TO_BE, 1);
            bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
        }
        inBamBuf.ClearBeforeIdx(inBamBuf.Size() - bamRemainSize); // 清理掉已经处理完的reads
        // cout << g_bamLoadedNum << '\t' << g_bamWritenNum << '\t' << bamRemainSize << '\t' << inBamBuf.Size() << endl;
        TWIST(g_readyToReadLock, TO, 0);

    }
    /* 数据读完了，放一个空的任务，好让write thread停下来 */
    ThMarkDupArg *thArg = nullptr;
    if (lastRoundUnProcessed > 0) // 最后一轮还有没有添加进任务的read数据
    {
        thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed, jobSeq++, false, false,
                                  inBamBuf.Slice(inBamBuf.Size() - lastRoundUnProcessed, inBamBuf.Size())});
        processedBamNum += lastRoundUnProcessed;
    }
    else 
    {
        thArg = new ThMarkDupArg({0, jobSeq++, false, false});
    }
    POSSESS(g_queueFirstLock);                               // 加锁
    g_qpThMarkDupArg.push(thArg);                            // 将新任务需要的参数添加到队列
    RELEASE(g_queueFirstLock);                               // 解锁
    thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务

    /* 同步所有线程 */
    thpool_wait(thpool);
    thpool_destroy(thpool);
    JOIN(writeth);

    cout <<"x_all: " << x_all << endl;
    cout << "loaded: " << g_bamLoadedNum << endl;
    cout << "writen: " << g_bamWritenNum << endl;
    cout << "processedBamNum: " << processedBamNum << endl;
    /* 标记冗余, 将处理后的结果写入文件 */

    /* 关闭文件，收尾清理 */
    sam_close(g_outBamFp);
    sam_close(inBamFp);

    cout << "read ends size: " << sizeof(ReadEnds) << endl;

    cout << "      总时间: " << time_all.seconds_elapsed() << endl;
    cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
    Timer::log_time("程序结束");
    return 0;
}
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
+								/*
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								Description: 标记bam文件中的冗余信息，只处理按照坐标排序后的bam，且bam为单一样本数据
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
 								Copyright : All right reserved by ICT
 								Author : Zhang Zhonghai
 								Date : 2023/10/23
 								*/
-												基本完成了参数的处理，帮助信息里有些参数需要删掉

											
										
										
											2023-11-01 10:48:02 +08:00
+								#include "markdups_arg.h"
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								// 有太多define冲突，放到最后include
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
 								#include <common/hts/bam_buf.h>
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								#include <common/utils/global_arg.h>
 								#include <common/utils/thpool.h>
 								#include <common/utils/timer.h>
 								#include <common/utils/util.h>
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								#include <common/utils/murmur3.h>
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								#include <common/utils/yarn.h>
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								#include <sam/utils/read_ends.h>
 								#include <sam/utils/read_name_parser.h>
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
 								#include <htslib/sam.h>
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								#include <htslib/thread_pool.h>
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
 								#include <iostream>
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								#include <vector>
 								#include <set>
 								#include <queue>
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								#include <unordered_map>
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
 								using namespace std;
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								using std::cout;
 								#define SMA_TAG_PG "PG"
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								#define BAM_BLOCK_SIZE 2 * 1024 * 1024
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								#define NO_SUCH_INDEX INT64_MAX
 								static Timer tm_arr[10]; // 用来测试性能
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
 								/* 前向声明 */
 								class ThMarkDupArg;
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								/* 全局本地变量 */
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								static queue<ThMarkDupArg *> g_qpThMarkDupArg;   // 存放线程变量的队列
 								static lock_t *g_queueFirstLock = NEW_LOCK(-1);  // 队列的第一个任务是否完成
 								static lock_t *g_readyToReadLock = NEW_LOCK(-1); // 通知主线程是否可以进行下一次读取
 								static vector<ReadNameParser> g_vRnParser;       // 每个线程一个read name parser
 								static int g_numDuplicateIndices = 0;            // 找到的冗余read总数
 								static samFile *g_outBamFp = nullptr;            // 输出文件, sam或者bam格式
 								static sam_hdr_t *g_outBamHeader;                // 输出文件的header
 								static int g_maxJobNum = 0;                      // 每次读取新的数据后，新增的任务数量
 								static int g_jobNumForRead = 0;                  // 任务数量降到当前值时开始下一轮读取
 								static volatile int64_t g_bamLoadedNum = 0;      // 已经读入的read总数
 								static volatile int64_t g_bamWritenNum = 0;         // 已经处理完，写入输出文件的read总数
 								static vector<int64_t> g_vDupIdx;  // 线程内部计算得出的
 								static vector<int64_t> g_vOpticalDupIdx;
 								static set<int64_t> g_sDupIdxLatter;
 								static set<int64_t> g_sOpticalDupIdxLatter;
 								/* 参数对象作为全局对象，免得多次作为参数传入函数中 */
 								static GlobalArg &g_gArg = GlobalArg::Instance();
 								static MarkDupsArg g_mdArg;
 								/*
 								 * 计算read的分数
 								 */
 								static int16_t computeDuplicateScore(BamWrap &bw)
 								{
 								    int16_t score = 0;
 								    switch (g_mdArg.DUPLICATE_SCORING_STRATEGY)
 								    {
 								    case ns_md::SUM_OF_BASE_QUALITIES:
 								        // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2
 								        // and risk overflow.
 								        score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
 								        break;
 								    case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
 								        if (!bw.GetReadUnmappedFlag())
 								            // no need to remember the score since this scoring mechanism is symmetric
 								            score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
 								        break;
 								    case ns_md::RANDOM:
 								        // The RANDOM score gives the same score to both reads so that they get filtered together.
 								        // it's not critical do use the readName since the scores from both ends get added, but it seem
 								        // to be clearer this way.
 								        score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
 								        // subtract Short.MIN_VALUE/4 from it to end up with a number between
 								        // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is
 								        // not passing filters. We need to stay far from overflow so that when we add the two
 								        // scores from the two read mates we do not overflow since that could cause us to chose a
 								        // failing read-pair instead of a passing one.
 								        score -= INT16_MIN / 4;
 								    default:
 								        break;
 								    }
 								    // make sure that filter-failing records are heavily discounted. (the discount can happen twice, once
 								    // for each mate, so need to make sure we do not subtract more than Short.MIN_VALUE overall.)
 								    score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
 								    return score;
 								}
 								/*
 								 * Fill *pKey with the duplicate-detection signature of a single read.
 								 *
 								 * bw       : alignment record to describe
 								 * index    : stored as the record's index in the input file (read1IndexInFile)
 								 * rnParser : read-name parser that adds the location information used for optical duplicates
 								 * pKey     : output object; its read1 fields, orientation, score and posKey are overwritten
 								 */
 								static void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey)
 								{
 								    const auto &core = bw.b->core;
 								    const bool isReverse = (core.flag & BAM_FREVERSE) != 0;

 								    pKey->read1ReferenceIndex = core.tid;
 								    // A reverse-strand read is keyed by its unclipped end, a forward-strand read by its unclipped start.
 								    if (isReverse)
 								    {
 								        pKey->read1Coordinate = bw.GetUnclippedEnd();
 								        pKey->orientation = ReadEnds::R;
 								    }
 								    else
 								    {
 								        pKey->read1Coordinate = bw.GetUnclippedStart();
 								        pKey->orientation = ReadEnds::F;
 								    }
 								    pKey->read1IndexInFile = index;
 								    pKey->score = computeDuplicateScore(bw);

 								    // Recording the mate's reference lets the ends object know that it is part of a pair.
 								    const bool mateIsMapped = bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag();
 								    if (mateIsMapped)
 								    {
 								        pKey->read2ReferenceIndex = core.mtid;
 								    }

 								    // Location information (taken from the read name) for optical-duplicate detection.
 								    rnParser.AddLocationInformation(bw.query_name(), pKey);

 								    // Position key derived from (reference index, coordinate); orientation is not folded in.
 								    pKey->posKey = BamWrap::bam_global_pos(pKey->read1ReferenceIndex, pKey->read1Coordinate);
 								}
 								/**
 								 * Placeholder for tagging a duplicate set with its representative read.
 								 *
 								 * Intended behaviour (not implemented yet): pick the representative read of the set by
 								 * quality score and record its read1 index-in-file on every member, so the value can
 								 * later be used for the 'DI' tag. The function currently does nothing.
 								 */
 								static void addRepresentativeReadIndex(vector<ReadEnds *> &vpRe)
 								{
 								    (void)vpRe; // intentionally unused until the logic is implemented
 								}
 								/*
 								 * Mark duplicates within one group of paired read ends.
 								 *
 								 * vpRe            : read ends considered duplicates of each other
 								 * psDupIdx        : receives the file indices of every read that is not part of the best pair
 								 * psOpticalDupIdx : reserved for optical duplicates; not written yet
 								 *
 								 * The first entry with the strictly highest score is kept; both reads of every other
 								 * entry are recorded as duplicates.
 								 */
 								static void markDuplicatePairs(vector<ReadEnds *> &vpRe, set<int64_t> *psDupIdx, set<int64_t> *psOpticalDupIdx)
 								{
 								    // Zero or one entry cannot contain a duplicate.
 								    // (singleton counting, addSingletonToCount, is not implemented)
 								    if (vpRe.size() < 2)
 								        return;

 								    /** All read ends should have orientation FF, FR, RF, or RR **/
 								    // Best entry = first one whose score is strictly greater than all earlier ones.
 								    ReadEnds *pWinner = vpRe.front();
 								    for (size_t i = 1; i < vpRe.size(); ++i)
 								    {
 								        if (vpRe[i]->score > pWinner->score)
 								            pWinner = vpRe[i];
 								    }

 								    if (!g_mdArg.READ_NAME_REGEX.empty())
 								    {
 								        // optical-duplicate tracking (trackOpticalDuplicates) is not implemented yet
 								    }

 								    // Everything except the winner is a duplicate.
 								    for (ReadEnds *pCand : vpRe)
 								    {
 								        if (pCand == pWinner)
 								            continue;
 								        psDupIdx->insert(pCand->read1IndexInFile);
 								        // read2 is only added when it is a different record than read1
 								        if (pCand->read2IndexInFile != pCand->read1IndexInFile)
 								            psDupIdx->insert(pCand->read2IndexInFile);
 								    }

 								    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
 								        addRepresentativeReadIndex(vpRe);
 								}
 								/*
 								 * Mark duplicates within one group of read ends that share a fragment position.
 								 *
 								 * vpRe            : read ends located at the same position/orientation
 								 * containsPairs   : true when at least one entry of vpRe belongs to a mapped pair
 								 * psDupIdx        : receives the file indices of the reads marked as duplicates
 								 * psOpticalDupIdx : reserved for optical duplicates; not written here
 								 *
 								 * If the group contains a paired read, every unpaired read is a duplicate.
 								 * Otherwise the first entry with the strictly highest score is kept and all
 								 * other entries are duplicates.
 								 */
 								static void markDuplicateFragments(vector<ReadEnds *> &vpRe,
 								                                   bool containsPairs,
 								                                   set<int64_t> *psDupIdx,
 								                                   set<int64_t> *psOpticalDupIdx)
 								{
 								    if (containsPairs)
 								    {
 								        // Paired reads win over fragments: flag every unpaired entry.
 								        for (ReadEnds *pCand : vpRe)
 								        {
 								            if (pCand->IsPaired())
 								                continue;
 								            psDupIdx->insert(pCand->read1IndexInFile);
 								        }
 								        return;
 								    }

 								    // Only fragments: keep the best-scoring one.
 								    ReadEnds *pWinner = nullptr;
 								    for (ReadEnds *pCand : vpRe)
 								    {
 								        if (pWinner == nullptr || pCand->score > pWinner->score)
 								            pWinner = pCand;
 								    }
 								    for (ReadEnds *pCand : vpRe)
 								    {
 								        if (pCand == pWinner)
 								            continue;
 								        psDupIdx->insert(pCand->read1IndexInFile);
 								    }
 								}
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
 								/*
 								 * Per-task argument/result structure shared between the reader, the mark-duplicate
 								 * workers and the writer thread.
 								 *
 								 * NOTE: callers aggregate-initialise this struct positionally, so the member order
 								 * below must not change.
 								 */
 								struct ThMarkDupArg
 								{
 								    int64_t bamStartIdx;                          // index, within the whole input, of the first record in vBam
 								    long seq;                                     // sequence number of this task among all tasks
 								    bool more;                                    // true when more tasks follow this one
 								    volatile bool finish;                         // set by the worker once the task has been processed
 								    vector<BamWrap *> vBam;                       // records to process in this task
 								    map<int64_t, vector<ReadEnds>> mvPair;        // completed pairs, keyed by position key
 								    map<int64_t, vector<ReadEnds>> mvFrag;        // every primary mapped read (paired or not), keyed by position key
 								    map<int64_t, set<int64_t>> msDupIdx;          // duplicate read indices, keyed by position key
 								    map<int64_t, set<int64_t>> msOpticalDupIdx;   // optical-duplicate read indices, keyed by position key
 								    unordered_map<string, ReadEnds> umReadEnds;   // reads still waiting for their mate, keyed by read name
 								};
+								/*
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								 * 多线程查找和标记冗余函数
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
+								 */
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								void thread_markdups(void *arg, int tid)
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								{
 								    auto &p = *(ThMarkDupArg *)arg;
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    /* 处理每个read，创建ReadEnd，并放入frag和pair中 */
 								    for (int i = 0; i < p.vBam.size(); ++i) // 循环处理每个read
 								    {
 								        BamWrap *bw = p.vBam[i];
 								        const int64_t bamIdx = p.bamStartIdx + i;
 								        if (bw->GetReadUnmappedFlag())
 								        {
 								            if (bw->b->core.tid == -1)
 								                // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
 								                break;
 								        }
 								        else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
 								        {
 								            ReadEnds fragEnd;
 								            buildReadEnds(*bw, bamIdx, g_vRnParser[tid], &fragEnd);
 								            p.mvFrag[fragEnd.posKey].push_back(fragEnd); // 添加进frag集合
 								            if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
 								            {
 								                string key = bw->query_name();
 								                if (p.umReadEnds.find(key) == p.umReadEnds.end())
 								                {
 								                    p.umReadEnds[key] = fragEnd;
 								                }
 								                else // 找到了pairend
 								                {
 								                    auto pairedEnds = p.umReadEnds.at(key);
 								                    p.umReadEnds.erase(key); // 删除找到的pairend
 								                    const int matesRefIndex = fragEnd.read1ReferenceIndex;
 								                    const int matesCoordinate = fragEnd.read1Coordinate;
 								                    // Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands.  NB: must do this
 								                    // before updating the orientation later.
 								                    if (bw->GetFirstOfPairFlag())
 								                    {
 								                        pairedEnds.orientationForOpticalDuplicates =
 								                            ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds::R);
 								                    }
 								                    else
 								                    {
 								                        pairedEnds.orientationForOpticalDuplicates =
 								                            ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, bw->GetReadNegativeStrandFlag());
 								                    }
 								                    // If the other read is actually later, simply add the other read's data as read2, else flip the reads
 								                    if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
 								                        (matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate))
 								                    {
 								                        pairedEnds.read2ReferenceIndex = matesRefIndex;
 								                        pairedEnds.read2Coordinate = matesCoordinate;
 								                        pairedEnds.read2IndexInFile = bamIdx;
 								                        pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
 								                                                                             bw->GetReadNegativeStrandFlag());
 								                        // if the two read ends are in the same position, pointing in opposite directions,
 								                        // the orientation is undefined and the procedure above
 								                        // will depend on the order of the reads in the file.
 								                        // To avoid this, we set it explicitly (to FR):
 								                        if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
 								                            pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
 								                            pairedEnds.orientation == ReadEnds::RF)
 								                        {
 								                            pairedEnds.orientation = ReadEnds::FR;
 								                        }
 								                    }
 								                    else
 								                    {
 								                        pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
 								                        pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
 								                        pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
 								                        pairedEnds.read1ReferenceIndex = matesRefIndex;
 								                        pairedEnds.read1Coordinate = matesCoordinate;
 								                        pairedEnds.read1IndexInFile = bamIdx;
 								                        pairedEnds.orientation = ReadEnds::GetOrientationByte(bw->GetReadNegativeStrandFlag(),
 								                                                                             pairedEnds.orientation == ReadEnds::R);
 								                    }
 								                    pairedEnds.score += computeDuplicateScore(*bw);
 								                    p.mvPair[pairedEnds.posKey].push_back(pairedEnds);
 								                }
 								            }
 								        }
 								    }
 								    /* generateDuplicateIndexes，计算冗余read在所有read中的位置索引 */
 								    // 先处理 pair
 								    int dupNum = 0;
 								    vector<ReadEnds *> vRePotentialDup; // 有可能是冗余的reads
 								    for (auto &e : p.mvPair) // 按比对的位置先后进行遍历
 								    {
 								        if (e.second.size() > 1) // 有潜在的冗余
 								        {
 								            vRePotentialDup.clear();
 								            ReadEnds *pReadEnd = nullptr;
 								            for (auto &re : e.second)
 								            {
 								                if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))
 								                    vRePotentialDup.push_back(&re);
 								                else
 								                {
 								                    markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
 								                    vRePotentialDup.clear();
 								                    vRePotentialDup.push_back(&re);
 								                    pReadEnd = &re;
 								                }
 								            }
 								            markDuplicatePairs(vRePotentialDup, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
 								        }
 								    }
 								    // 再处理frag
 								    bool containsPairs = false;
 								    bool containsFrags = false;
 								    for (auto &e : p.mvFrag)
 								    {
 								        if (e.second.size() > 1) // 有潜在的冗余
 								        {
 								            vRePotentialDup.clear();
 								            ReadEnds *pReadEnd = nullptr;
 								            for (auto &re : e.second)
 								            {
 								                if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
 								                {
 								                    vRePotentialDup.push_back(&re);
 								                    containsPairs = containsPairs || re.IsPaired();
 								                    containsFrags = containsFrags || !re.IsPaired();
 								                }
 								                else
 								                {
 								                    if (vRePotentialDup.size() > 1 && containsFrags)
 								                    {
 								                        markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
 								                    }
 								                    vRePotentialDup.clear();
 								                    vRePotentialDup.push_back(&re);
 								                    pReadEnd = &re;
 								                    containsPairs = re.IsPaired();
 								                    containsFrags = !re.IsPaired();
 								                }
 								            }
 								            if (vRePotentialDup.size() > 1 && containsFrags) {
 								                markDuplicateFragments(vRePotentialDup, containsPairs, &p.msDupIdx[e.first], &p.msOpticalDupIdx[e.first]);
 								            }
 								        }
 								    }
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    // cout << tid << '\t' << "dup: " << dupNum << endl;
 								    // cout << tid << " all: no: " << p.vBam.size() << '\t' << p.umReadEnds.size() << endl;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    /* 本段数据处理完成，告诉输出线程 */
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    POSSESS(g_queueFirstLock);
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    p.finish = true;
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    // cout << tid << ": process: " << p.seq << endl;
 								    auto front = g_qpThMarkDupArg.front();
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    if (front->finish)
 								    {
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        TWIST(g_queueFirstLock, TO, front->seq); // 通知写线程，当前队列头部完成的任务
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    } else {
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        RELEASE(g_queueFirstLock);
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    }
 								}
 								/*
 								 * 多线程将结果写入文件，写之前需要合并相邻线程的未处理的结果
 								 */
 								void thread_write(void *)
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
+								{
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    bool more = false;
 								    long seq = 0;
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    long unPairedNum = 0;
 								    POSSESS(g_queueFirstLock);
 								    WAIT_FOR(g_queueFirstLock, TO_BE, seq++); // 等待首个任务完成
 								    auto lastP = g_qpThMarkDupArg.front();    // 取队首的数据
 								    auto umUnpairedReadEnds = lastP->umReadEnds; // 还未找到pair的read
 								    auto p = lastP;
 								    g_qpThMarkDupArg.pop(); // 删除队首
 								    TWIST(g_queueFirstLock, TO, seq); // 解锁
 								    more = lastP->more; // 是否还有下一个任务
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    while (more) // 循环处理，将结果写入文件
 								    {
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        POSSESS(g_queueFirstLock);
 								        if (g_qpThMarkDupArg.empty()) // 有可能新任务没来得及添加进队列
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								        {
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								            RELEASE(g_queueFirstLock);
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								            continue;
 								        }
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        WAIT_FOR(g_queueFirstLock, TO_BE, seq); // 等待任务完成
 								        p = g_qpThMarkDupArg.front();
 								        if (!p->finish) // 有可能这个任务没有完成，是下边那个TWIST导致进到这里，因为这一段代码可能运行比较快
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								        {
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								            TWIST(g_queueFirstLock, TO, -1); // 此时队首任务没完成，-1可以让锁无法进入到这里，避免无效获得锁
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								            continue;
 								        }
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        g_qpThMarkDupArg.pop();
 								        TWIST(g_queueFirstLock, TO, seq + 1);
 								        /* 处理结果数据 */
 								        // cout << "finish: " <<  seq - 1 << '\t' << "lastIdx: " << p->bamStartIdx+p->vBam.size() << endl;
-												基本完成了参数的处理，帮助信息里有些参数需要删掉

											
										
										
											2023-11-01 10:48:02 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        for (auto &e : p->umReadEnds) // 在当前任务中找有没有与上一个任务中没匹配的read，相匹配的pair
 								        {
 								            if (umUnpairedReadEnds.find(e.first) != umUnpairedReadEnds.end())
 								                umUnpairedReadEnds.erase(e.first); // 找到了pair
 								            else
 								                umUnpairedReadEnds.insert(e); // 没有pair，则添加
 								        }
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        /* 更新写入read数量和状态 */
 								        POSSESS(g_readyToReadLock);
 								        g_bamWritenNum += lastP->vBam.size();
 								        // cout << "write: " << g_qpThMarkDupArg.size() << endl;
 								        if (g_qpThMarkDupArg.size() <= g_jobNumForRead)
 								        {
 								            TWIST(g_readyToReadLock, TO, 1);
 								        }
 								        else
 								        {
 								            RELEASE(g_readyToReadLock);
 								        }
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								        /* 准备下一轮循环 */
 								        delete lastP;
 								        more = p->more;
 								        lastP = p;
 								        seq++;
 								    }
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    unPairedNum = umUnpairedReadEnds.size();
 								    cout << "Finally unpaired read num: " << unPairedNum << endl;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
 								    // 处理最后一个数据
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    POSSESS(g_readyToReadLock);
 								    g_bamWritenNum += lastP->vBam.size();
 								    TWIST(g_readyToReadLock, TO, 1);
 								    // cout << "last finish: " << seq - 1 << endl;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    pthread_exit(0);
 								}
 								/*
 								 * mark duplicate 入口，假定bam是按照比对后的坐标排序的，同一个样本的话不需要考虑barcode的问题
 								 */
 								int MarkDuplicates(int argc, char *argv[])
 								{
 								    Timer::log_time("程序开始");
 								    Timer time_all;
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
 								    /* 读取命令行参数 */
 								    g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
 								    if (g_gArg.num_threads < 1) // 线程数不能小于1
 								        g_gArg.num_threads = 1;
 								    /* 初始化一些参数和变量*/
 								    g_vRnParser.resize(g_gArg.num_threads);
 								    for (auto &parser : g_vRnParser)
 								        parser.SetReadNameRegex(g_mdArg.READ_NAME_REGEX); // 用来解析read name中的tile，x，y信息
 								    /* 打开输入bam文件 */
 								    sam_hdr_t *inBamHeader;
 								    samFile *inBamFp;
 								    inBamFp = sam_open_format(g_gArg.in_fn.c_str(), "r", nullptr);
 								    if (!inBamFp)
 								    {
 								        Error("[%s] load sam/bam file failed.\n", __func__);
 								        return -1;
 								    }
 								    hts_set_opt(inBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
 								    inBamHeader = sam_hdr_read(inBamFp); // 读取header
 								    /* 利用线程池对输入输出文件进行读写 */
 								    htsThreadPool htsPoolRead = {NULL, 0};  // 多线程读取，创建线程池
 								    htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
 								    htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
 								    htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
 								    if (!htsPoolRead.pool || !htsPoolWrite.pool)
 								    {
 								        Error("[%d] failed to set up thread pool", __LINE__);
 								        return -1;
 								    }
 								    hts_set_opt(inBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
 								    /* 初始化输出文件 */
 								    char modeout[12] = "wb";
 								    sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL);
 								    g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout);
 								    g_outBamHeader = sam_hdr_dup(inBamHeader);
 								    if (sam_hdr_write(g_outBamFp, g_outBamHeader) != 0)
 								    {
 								        Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
 								        sam_close(g_outBamFp);
 								        return -1;
 								    }
 								    hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
 								    hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); // 用同样的线程池处理输出文件
 								    // /* 读取缓存初始化 */
 								    BamBufType inBamBuf(g_gArg.use_asyncio);
 								    inBamBuf.Init(inBamFp, inBamHeader, g_gArg.max_mem);
 								    /* 循环读入信息，并处理 */
 								    g_maxJobNum = g_gArg.num_threads * 10;
 								    // g_maxJobNum = g_gArg.num_threads * 3;
 								    g_jobNumForRead = g_gArg.num_threads * 2;
 								    int64_t x_all = 0; // for test
 								    int64_t jobSeq = 0;
 								    int64_t processedBamNum = 0; // 记录每个轮次累计处理的reads数量，用来计算每个read在整个文件中的索引位置
 								    threadpool thpool = thpool_init(g_gArg.num_threads); // 创建mark dup所需的线程池
 								    thread *writeth = LAUNCH(thread_write, nullptr);     // 启动处理结果的的线程
 								    int bamRemainSize = 0; // 上一轮还剩下的bam数量，包含已经在任务里的和没有放进任务的
 								    int numReadsForEachJob = 0; // 每个线程处理的read数量，第一次读取的时候进行设置
 								    int lastRoundUnProcessed = 0; // 上一轮没有放进任务里的read数量
 								    int curRoundProcessed = 0; // 这一轮放进任务的read数量
 								    while (inBamBuf.ReadStat() >= 0)
 								    {
 								        /* 读取bam文件中的read */
 								        int readNum = inBamBuf.ReadBam();
 								        if (numReadsForEachJob == 0)
 								            numReadsForEachJob = readNum / g_maxJobNum; // 第一次读取bam的时候进行设置
 								        g_bamLoadedNum += readNum;
 								        cout << readNum << endl; // 这一轮读取的bam数量
 								        /* 多线程处理 任务数是线程数的10倍 */
 								        tm_arr[0].acc_start();
 								        curRoundProcessed = 0; // 当前轮次已经处理的reads数量
 								        int numNeedToProcess = inBamBuf.Size() - bamRemainSize + lastRoundUnProcessed; // 当前需要处理的bam数量
 								        for (int i = 0; numNeedToProcess >= numReadsForEachJob; ++i) // 只有待处理的reads数量大于一次任务的数量时，新建任务
 								        {
 								            int startIdx = i * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
 								            int endIdx = (i + 1) * numReadsForEachJob + bamRemainSize - lastRoundUnProcessed;
 								            ThMarkDupArg *thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed,
 								                                                    jobSeq++,
 								                                                    true,
 								                                                    false,
 								                                                    inBamBuf.Slice(startIdx, endIdx)});
 								            POSSESS(g_queueFirstLock);                               // 加锁
 								            g_qpThMarkDupArg.push(thArg);                            // 将新任务需要的参数添加到队列
 								            RELEASE(g_queueFirstLock);                               // 解锁
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								            thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								            curRoundProcessed += endIdx - startIdx;
 								            numNeedToProcess -= numReadsForEachJob;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								        }
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        processedBamNum += curRoundProcessed;
 								        lastRoundUnProcessed = numNeedToProcess;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        /* 等待可以继续读取的信号 */
 								        POSSESS(g_readyToReadLock);
 								        WAIT_FOR(g_readyToReadLock, TO_BE, 1);
 								        bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
 								        while (bamRemainSize >= inBamBuf.Size() / 2)
 								        { // 要保留的多于现在有的bam数量的一半，那就等待write线程继续处理
 								            TWIST(g_readyToReadLock, TO, 0);
 								            POSSESS(g_readyToReadLock);
 								            WAIT_FOR(g_readyToReadLock, TO_BE, 1);
 								            bamRemainSize = g_bamLoadedNum - g_bamWritenNum;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								        }
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								        inBamBuf.ClearBeforeIdx(inBamBuf.Size() - bamRemainSize); // 清理掉已经处理完的reads
 								        // cout << g_bamLoadedNum << '\t' << g_bamWritenNum << '\t' << bamRemainSize << '\t' << inBamBuf.Size() << endl;
 								        TWIST(g_readyToReadLock, TO, 0);
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    }
 								    /* 数据读完了，放一个空的任务，好让write thread停下来 */
 								    ThMarkDupArg *thArg = nullptr;
 								    if (lastRoundUnProcessed > 0) // 最后一轮还有没有添加进任务的read数据
 								    {
 								        thArg = new ThMarkDupArg({processedBamNum + curRoundProcessed, jobSeq++, false, false,
 								                                  inBamBuf.Slice(inBamBuf.Size() - lastRoundUnProcessed, inBamBuf.Size())});
 								        processedBamNum += lastRoundUnProcessed;
 								    }
 								    else
 								    {
 								        thArg = new ThMarkDupArg({0, jobSeq++, false, false});
 								    }
 								    POSSESS(g_queueFirstLock);                               // 加锁
 								    g_qpThMarkDupArg.push(thArg);                            // 将新任务需要的参数添加到队列
 								    RELEASE(g_queueFirstLock);                               // 解锁
 								    thpool_add_work(thpool, thread_markdups, (void *)thArg); // 添加新任务
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    /* 同步所有线程 */
 								    thpool_wait(thpool);
 								    thpool_destroy(thpool);
 								    JOIN(writeth);
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    cout <<"x_all: " << x_all << endl;
 								    cout << "loaded: " << g_bamLoadedNum << endl;
 								    cout << "writen: " << g_bamWritenNum << endl;
 								    cout << "processedBamNum: " << processedBamNum << endl;
 								    /* 标记冗余, 将处理后的结果写入文件 */
 								    /* 关闭文件，收尾清理 */
 								    sam_close(g_outBamFp);
 								    sam_close(inBamFp);
-												基本完成了参数的处理，帮助信息里有些参数需要删掉

											
										
										
											2023-11-01 10:48:02 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    cout << "read ends size: " << sizeof(ReadEnds) << endl;
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
-												并行处理框架搭建完成，基本完成了工作线程处理逻辑

											
										
										
											2023-11-09 21:07:58 +08:00
+								    cout << "      总时间: " << time_all.seconds_elapsed() << endl;
 								    cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
-												重构，修改bambuf，支持清理某个read之前的缓存

											
										
										
											2023-11-06 12:38:30 +08:00
+								    Timer::log_time("程序结束");
-												配置了cmake和调试环境

											
										
										
											2023-10-23 23:07:00 +08:00
+								    return 0;
 								}