删除了串行相关的代码；修复了从read名字解析坐标时抛出的异常未被catch捕获的问题

2024-11-22 00:46:11 +08:00 · 2024-11-22 00:46:11 +08:00 · 7352eb9070
parent 6de538670f
commit 7352eb9070
11 changed files with 20 additions and 2572 deletions
--- a/run.sh
+++ b/run.sh
@ -1,14 +1,14 @@
-#nthread=1
+nthread=1
 #nthread=2
 #nthread=4
 #nthread=8
 #nthread=16
-nthread=32
+#nthread=32
-#nthread=64
+#nthread=32
 #nthread=128
 #input=/home/zzh/data/bam/zy_normal.bam
-input=/home/zzh/data/bam/zy_tumor.bam
+#input=/home/zzh/data/bam/zy_tumor.bam
-#input=/home/zzh/data/wgs_na12878.bam
+input=/home/zzh/data/wgs_na12878.bam
 #input=~/data/bam/100w.bam
 #input=~/data/bam/t100w.sam
 #input=~/data/bam/1k.sam
--- a/src/sam/markdups/dup_metrics.h
+++ b/src/sam/markdups/dup_metrics.h
@ -1,9 +1,11 @@
 #pragma once
 #include <string>
 #include <stdint.h>
 #include <string>
 #include <vector>
-#include "serial_md.h"
+
 #include "md_types.h"
 using std::string;
 using std::vector;
--- a/src/sam/markdups/markdups.cpp
+++ b/src/sam/markdups/markdups.cpp
@ -30,9 +30,7 @@ Date : 2023/10/23
 #include "dup_metrics.h"
 #include "markdups_arg.h"
 #include "md_funcs.h"
 #include "parallel_md.h"
 #include "pipeline_md.h"
 #include "serial_md.h"
 #include "shared_args.h"
 using namespace std;
@ -177,14 +175,7 @@ int MarkDuplicates(int argc, char *argv[]) {
    hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
    /* 冗余检查和标记 */
 //    if (g_gArg.num_threads == 1) {
 //        // serialMarkDups();  // 串行运行
 //        parallelMarkDups();  // 并行运行
 //    } else {
 //        parallelMarkDups();  // 并行运行
 //    }
    pipelineMarkDups();
    //parallelMarkDups();
    /* 标记冗余, 将处理后的结果写入文件 */
    sam_close(g_inBamFp);  // 重新打开bam文件
--- a/src/sam/markdups/md_funcs.cpp
+++ b/src/sam/markdups/md_funcs.cpp
@ -28,20 +28,6 @@ using std::set;
 using std::unordered_map;
 using std::vector;
 /*
  * Empties the index set stored under `key` (the duplicate results recorded
  * for that position) while keeping the map entry itself.
  * Does nothing when `key` is not present, so no empty entry is created.
  */
 void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    // A single lookup: reuse the iterator instead of find() followed by operator[].
    const auto it = pmsIdx->find(key);
    if (it != pmsIdx->end())
        it->second.clear();
 }
 /*
  * Removes the entry stored under `key` from the map, together with its set.
  * Does nothing when `key` is not present.
  */
 void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    // erase-by-key is already a no-op for a missing key, so no prior find() is needed.
    pmsIdx->erase(key);
 }
 /*
 * 计算read的分数
 */
@ -108,140 +94,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd
        BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate);  // << 1 | k.orientation;
 }
 /**
  * Intended behavior: take a list of ReadEndsForMarkDuplicates objects and
  * identify the representative read based on quality score. For all members of
  * the duplicate set, add the read1 index-in-file of the representative read to
  * the records of the first and second in a pair. This value is used for the
  * 'DI' tag.
  *
  * NOTE: not implemented yet -- the body is empty and the argument is ignored.
  */
 void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
 /*
  * Marks duplicates inside one group of paired-end ReadEnds.
  * The entry with the highest score (the first one on ties) is kept. For every
  * other entry its read1 file index, plus its read2 file index when that differs,
  * is appended to the container returned by dupIdx->AtPos(posKey).
  * Groups with fewer than two entries are left untouched.
  */
 void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
                         DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
    // Zero or one entry cannot contain a duplicate.
    // (Singleton counting, addSingletonToCount(libraryIdGenerator), is not implemented.)
    if (vpRe.size() < 2)
        return;

    // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
    auto &dupList = dupIdx->AtPos(posKey);
    auto &opticalDupList = opticalDupIdx->AtPos(posKey);  // fetched, but nothing is added to it yet

    /** All read ends should have orientation FF, FR, RF, or RR **/
    // Pick the representative: the first entry always qualifies, later ones only
    // when their score is strictly higher.
    const ReadEnds *keeper = nullptr;
    int topScore = 0;
    for (auto it = vpRe.begin(); it != vpRe.end(); ++it) {
        const ReadEnds *cand = *it;
        if (!(cand->score > topScore) && keeper != nullptr)
            continue;
        topScore = cand->score;
        keeper = cand;
    }

    if (!g_mdArg.READ_NAME_REGEX.empty()) {
        // Optical duplicate check (trackOpticalDuplicates) is not implemented.
    }

    // Everything except the representative is recorded as a duplicate.
    for (auto it = vpRe.begin(); it != vpRe.end(); ++it) {
        const ReadEnds *cand = *it;
        if (cand == keeper)
            continue;
        dupList.push_back(cand->read1IndexInFile);  // read1
        if (cand->read2IndexInFile != cand->read1IndexInFile)
            dupList.push_back(cand->read2IndexInFile);  // read2
    }

    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
        addRepresentativeReadIndex(vpRe);
 }
 /*
  * Marks duplicates inside one group of ReadEnds compared as fragments.
  * - containsPairs == true: every entry that is not paired is recorded as a duplicate.
  * - containsPairs == false: the highest-scoring entry (the first one on ties) is kept
  *   and every other entry is recorded as a duplicate.
  * Recorded values are read1 file indexes, appended to the container returned by
  * dupIdx->AtPos(posKey).
  */
 void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
                             DupContainer<int64_t> *dupIdx) {
    auto &dupList = dupIdx->AtPos(posKey);

    if (containsPairs) {
        // A pair is present in the group, so all unpaired entries lose.
        for (auto it = vpRe.begin(); it != vpRe.end(); ++it) {
            const ReadEnds *cand = *it;
            if (cand->IsPaired())
                continue;
            dupList.push_back(cand->read1IndexInFile);
        }
        return;
    }

    // Only fragments: find the best-scoring one ...
    const ReadEnds *keeper = nullptr;
    int topScore = 0;
    for (auto it = vpRe.begin(); it != vpRe.end(); ++it) {
        const ReadEnds *cand = *it;
        if (!(cand->score > topScore) && keeper != nullptr)
            continue;
        topScore = cand->score;
        keeper = cand;
    }
    // ... and record all the others.
    for (auto it = vpRe.begin(); it != vpRe.end(); ++it) {
        const ReadEnds *cand = *it;
        if (cand == keeper)
            continue;
        dupList.push_back(cand->read1IndexInFile);
    }
 }
 /*
  * Processes the paired-end ReadEnds located at one coordinate.
  * Walks readEnds in order, collecting into vpCache the entries that
  * ReadEnds::AreComparableForDuplicates reports as comparable with the first
  * entry of the current group, and hands each finished group to markDuplicatePairs.
  * Nothing happens when readEnds holds fewer than two entries.
  */
 void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
                         vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
                         DupContainer<int64_t> *opticalDupIdx) {
    if (readEnds.size() <= 1)  // no potential duplicates
        return;

    vpCache.clear();
    // Sorting is disabled here: std::sort(readEnds.begin(), readEnds.end());
    const ReadEnds *groupHead = nullptr;
    for (auto &cur : readEnds) {
        const bool sameGroup =
            groupHead != nullptr && ReadEnds::AreComparableForDuplicates(*groupHead, cur, true);
        if (!sameGroup) {
            // The current entry starts a new group: flush the previous one first.
            markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
            vpCache.clear();
            groupHead = &cur;
        }
        vpCache.push_back(&cur);
    }
    // Flush the last group.
    markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
 }
 /*
  * Processes the ReadEnds located at one coordinate, compared as fragments.
  * Walks readEnds in order, collecting into vpCache the entries that
  * ReadEnds::AreComparableForDuplicates reports as comparable with the first
  * entry of the current group. A finished group is handed to
  * markDuplicateFragments only when it has more than one entry and at least
  * one of them is not paired.
  * Nothing happens when readEnds holds fewer than two entries.
  */
 void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
                         vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
    if (readEnds.size() <= 1)  // no potential duplicates
        return;

    vpCache.clear();
    // Sorting is disabled here: std::sort(readEnds.begin(), readEnds.end());
    const ReadEnds *groupHead = nullptr;
    bool sawPair = false;
    bool sawFrag = false;

    // Hands the collected group over when it can contain fragment duplicates.
    auto flushGroup = [&]() {
        if (vpCache.size() > 1 && sawFrag) {
            markDuplicateFragments(posKey, vpCache, sawPair, dupIdx);
        }
    };

    for (auto &cur : readEnds) {
        const bool paired = cur.IsPaired();
        if (groupHead != nullptr && ReadEnds::AreComparableForDuplicates(*groupHead, cur, false)) {
            // Same group: accumulate what kinds of entries it contains.
            sawPair = sawPair || paired;
            sawFrag = sawFrag || !paired;
        } else {
            // New group: flush the previous one, then restart the bookkeeping.
            flushGroup();
            vpCache.clear();
            groupHead = &cur;
            sawPair = paired;
            sawFrag = !paired;
        }
        vpCache.push_back(&cur);
    }
    // Flush the last group.
    flushGroup();
 }
 /* 对找到的pairend read end添加一些信息 */
 void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
    auto &pairedEnds = *pPairedEnds;
--- a/src/sam/markdups/md_funcs.h
+++ b/src/sam/markdups/md_funcs.h
@ -224,30 +224,6 @@ int16_t computeDuplicateScore(BamWrap &bw);
 */
 void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
 /*
  * Process one group of paired-end ReadEnds and mark duplicates: the
  * highest-scoring entry is kept, the read indexes of all others are
  * appended to dupIdx at posKey.
  */
 void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
                         DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
 /*
  * Process one group of ReadEnds compared as fragments and mark duplicates.
  * When containsPairs is true every unpaired entry is marked; otherwise all
  * entries except the highest-scoring one are marked.
  */
 void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
                             DupContainer<int64_t> *dupIdx);
 /*
  * Process the paired-end reads located at one coordinate.
  * vpCache is scratch storage that is cleared and refilled by the call.
  */
 void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
                  DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
 /*
  * Process the unpaired fragment reads located at one coordinate.
  * vpCache is scratch storage that is cleared and refilled by the call.
  */
 void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
                  DupContainer<int64_t> *dupIdx);
 /*
 * 对找到的pairend read end添加一些信息
 */
--- a/src/sam/markdups/parallel_md.cpp
+++ b/src/sam/markdups/parallel_md.cpp
--- a/src/sam/markdups/parallel_md.h
+++ b/src/sam/markdups/parallel_md.h
@ -1,6 +0,0 @@
 #pragma once
 #include "md_types.h"
 // Run mark-duplicates in parallel.
 void parallelMarkDups();
--- a/src/sam/markdups/pipeline_md.cpp
+++ b/src/sam/markdups/pipeline_md.cpp
@ -916,9 +916,7 @@ static void *pipeIntersect(void *data) {
            break;
        }
        /* 交叉数据处理 readends */
 //        cout << "intersect order: " << pipeArg.intersectOrder << endl;
        tm_arr[4].acc_start();
 //        cout << "intersect markdup size: " << PEEK_LOCK(pipeArg.markDupSig) << endl;
        doIntersect(pipeArg);
        tm_arr[4].acc_end();
--- a/src/sam/markdups/serial_md.cpp
+++ b/src/sam/markdups/serial_md.cpp
--- a/src/sam/markdups/serial_md.h
+++ b/src/sam/markdups/serial_md.h
@ -1,6 +0,0 @@
 #pragma once
 #include "md_types.h"
 // Run mark-duplicates serially.
 void serialMarkDups();
--- a/src/sam/utils/read_name_parser.h
+++ b/src/sam/utils/read_name_parser.h
@ -139,6 +139,7 @@ struct ReadNameParser {
            } else {
                // Standard version that will use the regex
                cmatch m;
                // cout << "here1" << endl;
                if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
                    loc->tile = std::stoi(m[1].str());
                    loc->x = std::stoi(m[2].str());
@ -166,6 +167,15 @@ struct ReadNameParser {
                    readNameRegex.c_str(), readName.c_str(), e.what());
                warnedAboutRegexNotMatching = false;
            }
        } catch (...) {
            if (warnedAboutRegexNotMatching) {
                Warn(
                    "A field parsed out of a read name was expected to contain "
                    "an integer and did not. READ_NAME_REGEX: %s; Read name: "
                    "%s",
                    readNameRegex.c_str(), readName.c_str());
                warnedAboutRegexNotMatching = false;
            }
        }
        return true;
@ -190,6 +200,7 @@ struct ReadNameParser {
            if (readName.at(i) == delim || 0 == i) {
                numFields++;
                const int startIdx = (0 == i) ? 0 : (i + 1);
                // cout << readName << endl;
                tmpLocationFields[tokensIdx] =
                    std::stoi(readName.substr(startIdx, endIdx - startIdx));
                tokensIdx--;