删除了串行相关的代码，修复了read名字解析坐标时候出异常的catch问题

2024-11-22 00:46:11 +08:00 · 2024-11-22 00:46:11 +08:00 · 7352eb9070
parent 6de538670f
commit 7352eb9070
11 changed files with 20 additions and 2572 deletions
--- a/run.sh
+++ b/run.sh
@ -1,14 +1,14 @@
-#nthread=1
+nthread=1
 #nthread=2
 #nthread=4
 #nthread=8
 #nthread=16
-nthread=32
-#nthread=64
+#nthread=32
+#nthread=32
 #nthread=128
 #input=/home/zzh/data/bam/zy_normal.bam
-input=/home/zzh/data/bam/zy_tumor.bam
-#input=/home/zzh/data/wgs_na12878.bam
+#input=/home/zzh/data/bam/zy_tumor.bam
+input=/home/zzh/data/wgs_na12878.bam
 #input=~/data/bam/100w.bam
 #input=~/data/bam/t100w.sam
 #input=~/data/bam/1k.sam
--- a/src/sam/markdups/dup_metrics.h
+++ b/src/sam/markdups/dup_metrics.h
@ -1,9 +1,11 @@
 #pragma once

-#include <string>
 #include <stdint.h>
+
+#include <string>
 #include <vector>
-#include "serial_md.h"
+
+#include "md_types.h"

 using std::string;
 using std::vector;
--- a/src/sam/markdups/markdups.cpp
+++ b/src/sam/markdups/markdups.cpp
@ -30,9 +30,7 @@ Date : 2023/10/23
 #include "dup_metrics.h"
 #include "markdups_arg.h"
 #include "md_funcs.h"
-#include "parallel_md.h"
 #include "pipeline_md.h"
-#include "serial_md.h"
 #include "shared_args.h"

 using namespace std;
@ -177,14 +175,7 @@ int MarkDuplicates(int argc, char *argv[]) {
    hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);

    /* 冗余检查和标记 */
-//    if (g_gArg.num_threads == 1) {
-//        // serialMarkDups();  // 串行运行
-//        parallelMarkDups();  // 并行运行
-//    } else {
-//        parallelMarkDups();  // 并行运行
-//    }
    pipelineMarkDups();
-    //parallelMarkDups();

    /* 标记冗余, 将处理后的结果写入文件 */
    sam_close(g_inBamFp);  // 重新打开bam文件
--- a/src/sam/markdups/md_funcs.cpp
+++ b/src/sam/markdups/md_funcs.cpp
@ -28,20 +28,6 @@ using std::set;
 using std::unordered_map;
 using std::vector;

-/* 清除key位置的数据 */
-void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
-    auto &msIdx = *pmsIdx;
-    if (msIdx.find(key) != msIdx.end())
-        msIdx[key].clear();  // 清除该位点的冗余结果
-}
-
-/* 删除key位置的数据 */
-void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
-    auto &msIdx = *pmsIdx;
-    if (msIdx.find(key) != msIdx.end())
-        msIdx.erase(key);
-}
-
 /*
 * 计算read的分数
 */
@ -108,140 +94,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd
        BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate);  // << 1 | k.orientation;
 }

-/**
- * Takes a list of ReadEndsForMarkDuplicates objects and identify the
- * representative read based on quality score. For all members of the duplicate
- * set, add the read1 index-in-file of the representative read to the records of
- * the first and second in a pair. This value becomes is used for the 'DI' tag.
- */
-void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
-
-/* 处理一组pairend的readends，标记冗余 */
-void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
-                        DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
-    if (vpRe.size() < 2) {
-        if (vpRe.size() == 1) {
-            // addSingletonToCount(libraryIdGenerator);
-        }
-        return;
-    }
-    // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
-    auto &vDupIdx = dupIdx->AtPos(posKey);
-    auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
-
-    int maxScore = 0;
-    const ReadEnds *pBest = nullptr;
-    /** All read ends should have orientation FF, FR, RF, or RR **/
-    for (auto pe : vpRe)  // 找分数最高的readend
-    {
-        if (pe->score > maxScore || pBest == nullptr) {
-            maxScore = pe->score;
-            pBest = pe;
-        }
-    }
-    if (!g_mdArg.READ_NAME_REGEX.empty())  // 检查光学冗余
-    {
-        // trackOpticalDuplicates
-    }
-    for (auto pe : vpRe)  // 对非best read标记冗余
-    {
-        if (pe != pBest)  // 非best
-        {
-            vDupIdx.push_back(pe->read1IndexInFile);  // 添加read1
-            if (pe->read2IndexInFile != pe->read1IndexInFile)
-                vDupIdx.push_back(pe->read2IndexInFile);  // 添加read2
-        }
-    }
-
-    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
-        addRepresentativeReadIndex(vpRe);
-    }
-}
-
-/* 处理一组非paired的readends，标记冗余 */
-void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
-                            DupContainer<int64_t> *dupIdx) {
-    auto &vDupIdx = dupIdx->AtPos(posKey);
-
-    if (containsPairs) {
-        for (auto pe : vpRe) {
-            if (!pe->IsPaired()) {
-                vDupIdx.push_back(pe->read1IndexInFile);
-            }
-        }
-    } else {
-        int maxScore = 0;
-        const ReadEnds *pBest = nullptr;
-        for (auto pe : vpRe) {
-            if (pe->score > maxScore || pBest == nullptr) {
-                maxScore = pe->score;
-                pBest = pe;
-            }
-        }
-
-        for (auto pe : vpRe) {
-            if (pe != pBest) {
-                vDupIdx.push_back(pe->read1IndexInFile);
-            }
-        }
-    }
-}
-
-/* 处理位于某个坐标的pairend reads */
-void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
-                        vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
-                        DupContainer<int64_t> *opticalDupIdx) {
-    if (readEnds.size() > 1) { // 有潜在的冗余
-        vpCache.clear();
-        //        std::sort(readEnds.begin(), readEnds.end());
-        const ReadEnds *pReadEnd = nullptr;
-        for (auto &re : readEnds) {
-            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))  // 跟前一个一样
-                vpCache.push_back(&re);  // 处理一个潜在的冗余组
-            else {
-                markDuplicatePairs(posKey, vpCache, dupIdx,
-                                   opticalDupIdx);  // 不一样
-                vpCache.clear();
-                vpCache.push_back(&re);
-                pReadEnd = &re;
-            }
-        }
-        markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
-    }
-}
-
-/* 处理位于某个坐标的 reads */
-void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
-                        vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
-    if (readEnds.size() > 1)  // 有潜在的冗余
-    {
-        vpCache.clear();
-        //        std::sort(readEnds.begin(), readEnds.end());
-        const ReadEnds *pReadEnd = nullptr;
-        bool containsPairs = false;
-        bool containsFrags = false;
-        for (auto &re : readEnds) {
-            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
-                vpCache.push_back(&re);
-                containsPairs = containsPairs || re.IsPaired();
-                containsFrags = containsFrags || !re.IsPaired();
-            } else {
-                if (vpCache.size() > 1 && containsFrags) {
-                    markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
-                }
-                vpCache.clear();
-                vpCache.push_back(&re);
-                pReadEnd = &re;
-                containsPairs = re.IsPaired();
-                containsFrags = !re.IsPaired();
-            }
-        }
-        if (vpCache.size() > 1 && containsFrags) {
-            markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
-        }
-    }
-}
-
 /* 对找到的pairend read end添加一些信息 */
 void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
    auto &pairedEnds = *pPairedEnds;
--- a/src/sam/markdups/md_funcs.h
+++ b/src/sam/markdups/md_funcs.h
@ -224,30 +224,6 @@ int16_t computeDuplicateScore(BamWrap &bw);
 */
 void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);

-/*
- * 处理一组pairend的readends，标记冗余
- */
-void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
-                        DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
-
-/*
- * 处理一组非paired的readends，标记冗余
- */
-void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
-                            DupContainer<int64_t> *dupIdx);
-
-/*
- * 处理位于某个坐标的pairend reads
- */
-void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
-                 DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
-
-/*
- * 处理位于某个坐标的非配对的frag reads
- */
-void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
-                 DupContainer<int64_t> *dupIdx);
-
 /*
 * 对找到的pairend read end添加一些信息
 */
--- a/src/sam/markdups/parallel_md.cpp
+++ b/src/sam/markdups/parallel_md.cpp
--- a/src/sam/markdups/parallel_md.h
+++ b/src/sam/markdups/parallel_md.h
@ -1,6 +0,0 @@
-#pragma once
-
-#include "md_types.h"
-
-// 并行运行mark duplicate
-void parallelMarkDups();
--- a/src/sam/markdups/pipeline_md.cpp
+++ b/src/sam/markdups/pipeline_md.cpp
@ -916,9 +916,7 @@ static void *pipeIntersect(void *data) {
            break;
        }
        /* 交叉数据处理 readends */
-//        cout << "intersect order: " << pipeArg.intersectOrder << endl;
        tm_arr[4].acc_start();
-//        cout << "intersect markdup size: " << PEEK_LOCK(pipeArg.markDupSig) << endl;
        doIntersect(pipeArg);
        tm_arr[4].acc_end();

--- a/src/sam/markdups/serial_md.cpp
+++ b/src/sam/markdups/serial_md.cpp
--- a/src/sam/markdups/serial_md.h
+++ b/src/sam/markdups/serial_md.h
@ -1,6 +0,0 @@
-#pragma once
-
-#include "md_types.h"
-
-// 串行运行mark duplicate
-void serialMarkDups();
--- a/src/sam/utils/read_name_parser.h
+++ b/src/sam/utils/read_name_parser.h
@ -139,6 +139,7 @@ struct ReadNameParser {
            } else {
                // Standard version that will use the regex
                cmatch m;
+                // cout << "here1" << endl;
                if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
                    loc->tile = std::stoi(m[1].str());
                    loc->x = std::stoi(m[2].str());
@ -166,6 +167,15 @@ struct ReadNameParser {
                    readNameRegex.c_str(), readName.c_str(), e.what());
                warnedAboutRegexNotMatching = false;
            }
+        } catch (...) {
+            if (warnedAboutRegexNotMatching) {
+                Warn(
+                    "A field parsed out of a read name was expected to contain "
+                    "an integer and did not. READ_NAME_REGEX: %s; Read name: "
+                    "%s",
+                    readNameRegex.c_str(), readName.c_str());
+                warnedAboutRegexNotMatching = false;
+            }
        }

        return true;
@ -190,6 +200,7 @@ struct ReadNameParser {
            if (readName.at(i) == delim || 0 == i) {
                numFields++;
                const int startIdx = (0 == i) ? 0 : (i + 1);
+                // cout << readName << endl;
                tmpLocationFields[tokensIdx] =
                    std::stoi(readName.substr(startIdx, endIdx - startIdx));
                tokensIdx--;