需要完善optical dup idx计算

2024-08-28 12:00:23 +08:00 · 2024-08-28 12:00:23 +08:00 · 7352a00f2c
parent 86954ffa85
commit 7352a00f2c
10 changed files with 97 additions and 63 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 # for fast-markdup
 *.sam
 *.bam
 *.log
 # ---> C++
 # Prerequisites
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -13,11 +13,11 @@
            "program": "${workspaceRoot}/build/bin/picard_cpp",
            "args": [
                "MarkDuplicates",
-                "--INPUT", "~/data/bam/100w.bam",
+                "--INPUT", "~/data/bam/zy_normal.bam",
                "--OUTPUT", "out.bam",
                "--METRICS_FILE", "metrics.txt",
                "--num_threads", "1",
-                "--max_mem", "4G",
+                "--max_mem", "100G",
                "--verbosity", "DEBUG",
                "--asyncio", "true",
            ],
--- a/run.sh
+++ b/run.sh
@ -1,6 +1,8 @@
-#input=~/data/bam/zy_normal.bam
+input=~/data/bam/zy_normal.bam
-input=~/data/bam/zy_tumor.bam
+#input=~/data/bam/zy_tumor.bam
 #input=~/data/bam/100w.bam
 #input=~/data/bam/1kw.sam
 #input=~/data/bam/n1kw.sam
 time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
    MarkDuplicates \
@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
    --OUTPUT ~/data/bam/out.bam \
    --INDEX_FORMAT BAI \
    --num_threads 1 \
-    --max_mem 2G \
+    --max_mem 1G \
    --verbosity DEBUG \
    --asyncio true #\
    #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
--- a/src/common/utils/global_arg.cpp
+++ b/src/common/utils/global_arg.cpp
@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) {
            mem_arg <<= 20;
        else if (*q == 'g' || *q == 'G')
            mem_arg <<= 30;
-        if (mem_arg >= max_mem)
+        //if (mem_arg >= max_mem)
        if (true)
            max_mem = mem_arg;
        else {
            std::cerr << "[Warn] Too small mem size, use default" << std::endl;
--- a/src/sam/markdups/markdups.cpp
+++ b/src/sam/markdups/markdups.cpp
@ -57,6 +57,10 @@ static GlobalDataArg gData_;
 GlobalDataArg &gData = gData_;
 DuplicationMetrics gMetrics_;
 DuplicationMetrics &gMetrics = gMetrics_;
 int zzhtestnum = 0;
 set<int64_t> zzhopticalSet;
 vector<int64_t> zzhopticalArr;
 /*
 * mark duplicate
 * 入口，假定bam是按照比对后的坐标排序的，同一个样本的话不需要考虑barcode的问题
@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) {
    while (inBuf.ReadStat() >= 0) {
        Timer tw1;
        size_t readNum = inBuf.ReadBam();
-        cout << "read: " << readNum << endl;
+    //    cout << "read: " << readNum << endl;
        for (size_t i = 0; i < inBuf.Size(); ++i) {
            /* 判断是否冗余 */
            if (bamIdx == dupIdx) {
-                // cout << "冗余" << bamIdx << endl;
+                // cerr << bamIdx << endl;
                dupIdx = idxQue.Pop();
            }
            if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
--- a/src/sam/markdups/markdups_arg.h
+++ b/src/sam/markdups/markdups_arg.h
@ -213,7 +213,7 @@ struct MarkDupsArg
    ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
    /* "The scoring strategy for choosing the non-duplicate among candidates." */
-    ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
+    ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
    /* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
                    "PG record creation.  This string may have a suffix appended to avoid collision with other " +
--- a/src/sam/markdups/md_funcs.cpp
+++ b/src/sam/markdups/md_funcs.cpp
@ -28,8 +28,6 @@ using std::set;
 using std::unordered_map;
 using std::vector;
 static int zzhtestnum = 0;
 /* 清除key位置的数据 */
 void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    auto &msIdx = *pmsIdx;
@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
 /* 对找到的pairend read end添加一些信息 */
 void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
    auto &pairedEnds = *pPairedEnds;
    int64_t bamIdx = fragEnd.read1IndexInFile;
    const int matesRefIndex = fragEnd.read1ReferenceIndex;
    const int matesCoordinate = fragEnd.read1Coordinate;
@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
    findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
    int opticalDuplicates = 0;
    for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
        ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
        if (opticalDuplicateFlags[i]) {
            ++opticalDuplicates;
-            ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
+            // if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
            //     cout << "val: " << pRe->isOpticalDuplicate << endl;
            // }
            pRe->isOpticalDuplicate = true;
            zzhopticalSet.insert(pRe->read1IndexInFile);
            zzhopticalSet.insert(pRe->read2IndexInFile);
            zzhopticalArr.push_back(pRe->read1IndexInFile);
            zzhopticalArr.push_back(pRe->read2IndexInFile);
        } else {
            pRe->isOpticalDuplicate = false;
            zzhopticalSet.erase(pRe->read1IndexInFile);
            zzhopticalSet.erase(pRe->read2IndexInFile);
        }
    }
    if (opticalDuplicates > 0)
        gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
    return opticalDuplicates;
 }
@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
 */
 void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
    bool hasFR = false, hasRF = false;
    int prevOpticalDupNum = 0;
    // Check to see if we have a mixture of FR/RF
    for (auto pRe : readEndsArr) {
        if (pRe->isOpticalDuplicate)
            ++prevOpticalDupNum;
        if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
            hasFR = true;
        else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
    if (nOpticalDup)
        gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
    gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
    //gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
    // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
    // cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
 }
--- a/src/sam/markdups/serial_md.cpp
+++ b/src/sam/markdups/serial_md.cpp
@ -13,10 +13,10 @@
 #include <set>
 #include <vector>
 #include "dup_metrics.h"
 #include "markdups_arg.h"
 #include "md_funcs.h"
 #include "shared_args.h"
 #include "dup_metrics.h"
 using std::cout;
 using std::set;
@ -78,25 +78,28 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
    }
    int maxScore = 0;
    const ReadEnds *pBest = nullptr;
    int maxOperateTime = 0;
    /** All read ends should have orientation FF, FR, RF, or RR **/
-    for (auto pe : vpRe) { // 找分数最高的readend
+    for (auto pe : vpRe) {  // 找分数最高的readend
        maxOperateTime = max(maxOperateTime, pe->oprateTime);
        (const_cast<ReadEnds *>(pe))->oprateTime ++;
        if (pe->score > maxScore || pBest == nullptr) {
            maxScore = pe->score;
            pBest = pe;
        }
    }
    // cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
    // if (maxOperateTime == 0) ++zzhtestnum;
    if (notDupIdx != nullptr) {
        notDupIdx->insert(pBest->read1IndexInFile);
        notDupIdx->insert(pBest->read2IndexInFile);
    }
-    if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
+    if (!g_mdArg.READ_NAME_REGEX.empty()) {  // 检查光学冗余
        // trackOpticalDuplicates
        trackOpticalDuplicates(vpRe, pBest);
    }
-    for (auto pe : vpRe)  // 对非best read标记冗余
+    for (auto pe : vpRe) { // 对非best read标记冗余
-    {
+        if (pe != pBest) { // 非best
        if (pe != pBest)  // 非best
        {
            dupIdx->insert(pe->read1IndexInFile);  // 添加read1
            if (pe->read2IndexInFile != pe->read1IndexInFile)
                dupIdx->insert(pe->read2IndexInFile);  // 添加read2
@ -139,8 +142,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
 /* 找到与readend pos相等的所有readend */
 static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
-    auto range = std::equal_range(src.begin(), src.end(), re,
+    auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan);  // 只比对位点
                                  ReadEnds::PairsLittleThan);  // 只比对位点
    dst->insert(dst->end(), range.first, range.second);
 }
@ -158,24 +160,24 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
    // set<ReadEnds> reSet;
    // ReadEnds lastRe;
-    for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
+    for (int i = 0; i < p.bams.size(); ++i) {  // 循环处理每个read
        BamWrap *bw = p.bams[i];
        const int64_t bamIdx = p.bamStartIdx + i;
        if (bw->GetReadUnmappedFlag()) {
            if (bw->b->core.tid == -1)
                // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
                break;
-        } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
+        } else if (!bw->IsSecondaryOrSupplementary()) {  // 是主要比对
            ReadEnds fragEnd;
            tm_arr[8].acc_start();
            buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
            tm_arr[8].acc_end();
-            p.frags.push_back(fragEnd);                                 // 添加进frag集合
+            p.frags.push_back(fragEnd);                                   // 添加进frag集合
-            if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
+            if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) {  // 是pairend而且互补的read也比对上了
                string key = bw->query_name();
                if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
                    p.unpairedDic[key] = {p.taskSeq, fragEnd};
-                } else { // 找到了pairend
+                } else {  // 找到了pairend
                    auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
                    modifyPairedEnds(fragEnd, &pairedEnds);
                    p.pairs.push_back(pairedEnds);
@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
        }
    }
    tm_arr[9].acc_start();
-    //sortReadEndsArr(p.frags);
+    sortReadEndsArr(p.frags);
-    sort(p.frags.begin(), p.frags.end());
+    // sort(p.frags.begin(), p.frags.end());
    tm_arr[9].acc_end();
    // cout << "sort pairs" << endl;
    tm_arr[10].acc_start();
@ -277,7 +279,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
    while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
        leftSpan += 1;
-        if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了（可能会有bug，上上个也与下一个有交集呢？）
+        if (leftSpan > leftEndIdx) {  // 上一个的范围被下一个全部包围了（可能会有bug，上上个也与下一个有交集呢？）
            leftSpan = leftArr.size() - 1;
            break;
        }
@ -285,7 +287,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
    while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
        rightSpan += 1;
-        if (rightSpan == rightArr.size() - 1) // 同上，可能会有bug
+        if (rightSpan == rightArr.size() - 1)  // 同上，可能会有bug
            break;
    }
    dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
            //  1.
            //  prevpos在交叉部分之前，nextpos在交叉部分之后，这种情况不需要获取pairarr中的数据;
            //  2.
-            //  prevpos在交叉部分之前，nextpos在交叉部分，需要获取lp中的相等read
+            //  prevpos在交叉部分之前，nextpos在交叉部分，需要获取lp中的相等read pair进行重新计算
-            //  pair进行重新计算
+            //  复杂情况1. g中包含prevPosKey对应的unpair，p中有对应的pair，此时应该把这些pair考虑进去
            //     复杂情况1.
            //     g中包含prevPosKey对应的unpair，p中有对应的pair，此时应该把这些pair考虑进去
            //  3.
-            //  prevpos在交叉部分，nextpos在交叉部分之后，需要获取p中的相等read
+            //  prevpos在交叉部分，nextpos在交叉部分之后，需要获取p中的相等read pair进行重新计算
-            //  pair进行重新计算
+            //  复杂情况2. p中是否包含prevPosKey对应的unpair
            //     复杂情况2. p中是否包含prevPosKey对应的unpair
            //  4.
-            //  prevpos在交叉部分，nextpos在交叉部分，需要获取lp和p中的相等read
+            //  prevpos在交叉部分，nextpos在交叉部分，需要获取lp和p中的相等read pair进行重新计算
            //  pair进行重新计算
            bool addDataToPos = true;
            if (alreadyAdd.find(ck) != alreadyAdd.end()) {
-                addDataToPos = false;  // 之前已经添加过了，后面就不用再添加数据了
+                // 之前已经添加过了，后面就不用再添加数据了，因为同一个位置可能找到两个及以上的unpair数据，处理之前的数据时候可能已经添加了这些数据
                addDataToPos = false;
            } else
                alreadyAdd.insert(ck);
@ -478,9 +477,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
                }
                recalcPos[ck] = prevPosInfo.taskSeq;
                std::sort(prevPairArr.begin(), prevPairArr.end());
-            } else { // prevpos在交叉部分
+            } else {                             // prevpos在交叉部分
-                if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
+                if (nextPosKey > prevLastPos) {  // nextpos在交叉部分之后 第三种情况
-                    if (nextUnpairInfoP != nullptr) { // 且在pos点，next task有unpair，这样才把这些数据放到next task里
+                    if (nextUnpairInfoP != nullptr) {  // 且在pos点，next task有unpair，这样才把这些数据放到next task里
                        auto &nextPairArr = nextUnpairInfoP->pairArr;
                        nextPairArr.push_back(prevFragEnd);
                        auto &prevPairArr = prevUnpairInfoP->pairArr;
@ -492,7 +491,7 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
                        recalcPos[ck] = nextPosInfo.taskSeq;
                        std::sort(prevPairArr.begin(), prevPairArr.end());
-                    } else { // next task在该位点没有unpair，那就把数据放到prev task里
+                    } else {  // next task在该位点没有unpair，那就把数据放到prev task里
                        auto &prevPairArr = prevUnpairInfoP->pairArr;  // prevUnpairInfoP肯定不是nullptr
                        prevPairArr.push_back(prevFragEnd);
                        if (addDataToPos)  // 第二种情况
@ -515,8 +514,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
                    std::sort(prevPairArr.begin(), prevPairArr.end());
                }
            }
-            p.unpairedDic.erase(readName); // 在next task里删除该read
+            p.unpairedDic.erase(readName);                                 // 在next task里删除该read
-        } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
+        } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) {  // 在遗留数据中找到了匹配的read
            auto &remainPosInfo = g.unpairedDic[readName];
            auto remainFragEnd = remainPosInfo.unpairedRE;
            int64_t remainPosKey = remainFragEnd.posKey;
@ -529,14 +528,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
            std::sort(remainPairArr.begin(), remainPairArr.end());
            g.unpairedDic.erase(readName);
-        } else { // 都没找到，那就保存到遗留数据里
+        } else {  // 都没找到，那就保存到遗留数据里
            int64_t prevPosKey = prevFragEnd.posKey;
            g.unpairedDic.insert(prevUnpair);
            addToGlobal.insert(prevPosKey);
        }
    }
    // 最后再添加，以防开始赋值，后来这个位置要是又添加了新的数据
    for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
    map<int64_t, TaskSeqDupInfo> taskChanged;
    set<int64_t> posProcessed;
@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
        if (taskSeq < lp.taskSeq)
            g.unpairedPosArr.erase(posKey);
    }
    // 更新结果
    // 最后再添加，以防开始赋值，后来这个位置要是又添加了新的数据
    // 放在这里，因为lp中的unpairedPosArr中的readends可能会被修改（比如optical duplicate）
    for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
    // 更新结果
    for (auto &e : taskChanged) {
        auto taskSeq = e.first;
        auto &t = e.second;
@ -594,12 +595,12 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
    auto &lp = *task;
    auto &g = *gDataArg;
    // 遗留的未匹配的pair
-    for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
+    for (auto &prevUnpair : lp.unpairedDic) {  // 遍历上一个任务中的每个未匹配的read
        auto &readName = prevUnpair.first;
        auto &prevPosInfo = prevUnpair.second;
        auto prevFragEnd = prevPosInfo.unpairedRE;  // 未匹配的read end
-        if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
+        if (g.unpairedDic.find(readName) != g.unpairedDic.end()) {  // 在遗留数据中找到了匹配的read
            auto &remainPosInfo = g.unpairedDic[readName];
            auto remainFragEnd = remainPosInfo.unpairedRE;
            int64_t remainPosKey = remainFragEnd.posKey;
@ -719,8 +720,8 @@ void serialMarkDups() {
        // cout << "round time: " << t_round.seconds_elapsed() << endl;
        roundNum++;
        if (roundNum % 100 == 0) {
-            cout << "read sum: " << readNumSum << endl;
+            //cout << "read sum: " << readNumSum << endl;
-            cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
+            //cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
        }
    }
    // cout << "here" << endl;
@ -769,10 +770,10 @@ void serialMarkDups() {
    cout << "sort frags  : " << tm_arr[9].acc_seconds_elapsed() << endl;
    cout << "sort pairs  : " << tm_arr[10].acc_seconds_elapsed() << endl;
    cout << "all         : " << tm_arr[5].acc_seconds_elapsed() << endl;
-    cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" 
+    cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
-                        << gMetrics.NonOpticalDuplicateCountHist << "\t"
+         << gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
-                        << gMetrics.OpticalDuplicatesCountHist << "\t"
+    cout << "optical dup: " << zzhopticalSet.size() << endl;
-                        << gMetrics.OpticalDuplicatesByLibraryId << endl;
+    cout << "optical arr dup: " << zzhopticalArr.size() << endl;
    Timer::log_time("serial end  ");
--- a/src/sam/markdups/shared_args.h
+++ b/src/sam/markdups/shared_args.h
@ -6,6 +6,8 @@
 #include <htslib/thread_pool.h>
 #include <sam/utils/read_ends.h>
 #include <sam/utils/read_name_parser.h>
 #include <set>
 using std::set;
 extern Timer tm_arr[20];  // 用来测试性能
 /* 全局本地变量 */
@ -24,3 +26,7 @@ class GlobalDataArg;
 extern GlobalDataArg &gData;
 class DuplicationMetrics;
 extern DuplicationMetrics &gMetrics;
 extern int zzhtestnum;
 extern set<int64_t> zzhopticalSet;
 extern vector<int64_t> zzhopticalArr;
--- a/src/sam/utils/read_ends.h
+++ b/src/sam/utils/read_ends.h
@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation {
    int64_t posKey = -1;  // 根据位置信息生成的关键字 return (int64_t)tid <<
                          // MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
    /* 用来做一些判断，因为一些readends会做多次操作，比如task之间有重叠等等 */
    int oprateTime = 0;
    /* 根据pairend read的比对方向，来确定整体的比对方向 */
    static int8_t GetOrientationByte(bool read1NegativeStrand,
                                     bool read2NegativeStrand) {