From ab7da8a7d711db2654ca6e658ae74b02565ae9ea Mon Sep 17 00:00:00 2001 From: zzh Date: Wed, 23 Apr 2025 14:59:40 +0800 Subject: [PATCH] Change some annotations --- src/markdup/dup_metrics.h | 8 +- src/markdup/markdup.cpp | 76 ++++++------- src/markdup/md_args.h | 24 ++-- src/markdup/md_funcs.cpp | 14 +-- src/markdup/md_funcs.h | 18 +-- src/markdup/md_pipeline.cpp | 198 ++++++++++++++++----------------- src/markdup/md_pipeline.h | 48 ++++---- src/markdup/md_types.h | 62 +++++------ src/markdup/read_ends.h | 46 ++++---- src/markdup/read_name_parser.h | 8 +- src/util/bam_buf.cpp | 74 ++++++------ src/util/bam_buf.h | 92 +++++++-------- src/util/bam_wrap.h | 80 ++++++------- src/util/murmur3.h | 2 +- 14 files changed, 375 insertions(+), 375 deletions(-) diff --git a/src/markdup/dup_metrics.h b/src/markdup/dup_metrics.h index 1fba5de..d072524 100644 --- a/src/markdup/dup_metrics.h +++ b/src/markdup/dup_metrics.h @@ -11,7 +11,7 @@ using std::string; using std::vector; /* - * 标记冗余过程中的一些数据统计 + * */ struct DuplicationMetrics { /** @@ -57,15 +57,15 @@ struct DuplicationMetrics { */ uint64_t ESTIMATED_LIBRARY_SIZE = 0; - // 其他的统计数据 + // vector CoverageMult; - // 比如在该位置,有4个冗余,那么所有4个冗余的位置数量 + // ,4,4 MDMap DuplicateCountHist; MDMap NonOpticalDuplicateCountHist; MDMap OpticalDuplicatesCountHist; - // 没有冗余的位置总数量 for test + // for test MDSet singletonReads; MDSet dupReads; // for test MDSet bestReads; diff --git a/src/markdup/markdup.cpp b/src/markdup/markdup.cpp index 9dec53a..f7d4986 100644 --- a/src/markdup/markdup.cpp +++ b/src/markdup/markdup.cpp @@ -1,6 +1,6 @@ /* Description: -标记bam文件中的冗余信息,只处理按照坐标排序后的bam,且bam为单一样本数据 +bam,bam,bam Copyright : All right reserved by ICT @@ -26,26 +26,26 @@ Date : 2023/10/23 namespace nsgv { -MarkDupsArg gMdArg; // 参数 -std::vector gNameParsers; // 每个线程一个read name parser -samFile *gInBamFp; // 输入的bam文件 -sam_hdr_t *gInBamHeader; // 输入的bam文件头信息 -samFile *gOutBamFp; // 输出文件, sam或者bam格式 -sam_hdr_t *gOutBamHeader; // 输出文件的header -DuplicationMetrics gMetrics; // 统计信息 +MarkDupsArg gMdArg; // +std::vector gNameParsers; // read name parser +samFile *gInBamFp; // bam +sam_hdr_t *gInBamHeader; // bam +samFile *gOutBamFp; // , sambam +sam_hdr_t *gOutBamHeader; // header +DuplicationMetrics gMetrics; // DupResult gDupRes; PipelineArg gPipe(&gDupRes); }; // namespace nsgv -// 字节缓冲区 +// struct ByteBuf { uint8_t *buf = nullptr; - int size = 0; // 当前使用量 - int capacity = 0; // 最大容量 + int size = 0; // + int capacity = 0; // }; /* - * 获取文件名后缀 + * */ static string getFileExtension(const string &filename) { auto last_dot = filename.find_last_of('.'); @@ -58,25 +58,25 @@ static string getFileExtension(const string &filename) { // entrance of mark duplicates int MarkDuplicates() { PROF_START(whole_process); - /* 初始化一些参数和变量*/ + /* */ nsgv::gNameParsers.resize(nsgv::gMdArg.NUM_THREADS); for (auto &parser : nsgv::gNameParsers) - parser.SetReadNameRegex(nsgv::gMdArg.READ_NAME_REGEX); // 用来解析read name中的tile,x,y信息 + parser.SetReadNameRegex(nsgv::gMdArg.READ_NAME_REGEX); // read nametile,x,y - /* 打开输入bam文件 */ + /* bam */ nsgv::gInBamFp = sam_open_format(nsgv::gMdArg.INPUT_FILE.c_str(), "r", nullptr); if (!nsgv::gInBamFp) { spdlog::error("[{}] load sam/bam file failed.\n", __func__); return -1; } hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); - nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // 读取header - // 获取样本名称(libraryId) + nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header + // (libraryId) nsgv::gMetrics.LIBRARY = sam_hdr_line_name(nsgv::gInBamHeader, "RG", 0); - /* 利用线程池对输入输出文件进行读写 */ - htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 - htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 + /* */ + htsThreadPool htsPoolRead = {NULL, 0}; // , + htsThreadPool htsPoolWrite = {NULL, 0}; // htsPoolRead.pool = hts_tpool_init(nsgv::gMdArg.NUM_THREADS); htsPoolWrite.pool = hts_tpool_init(nsgv::gMdArg.NUM_THREADS); if (!htsPoolRead.pool || !htsPoolWrite.pool) { @@ -86,12 +86,12 @@ int MarkDuplicates() { } hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead); - /* 冗余检查和标记 */ + /* */ PROF_START(markdup_all); PipelineMarkDups(); PROF_END(gprof[GP_markdup_all], markdup_all); - /* 标记冗余, 将处理后的结果写入文件 */ + /* , */ char modeout[12] = "wb"; sam_open_mode(modeout + 1, nsgv::gMdArg.OUTPUT_FILE.c_str(), NULL); nsgv::gOutBamFp = sam_open(nsgv::gMdArg.OUTPUT_FILE.c_str(), modeout); @@ -100,14 +100,14 @@ int MarkDuplicates() { return -1; } nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader); - // 修改输出文件的header + // header sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", nsgv::gMdArg.PROGRAM_RECORD_ID.c_str(), "VN", FASTDUP_VERSION, "CL", nsgv::gMdArg.CLI_STR.c_str(), NULL); - // 用同样的线程数量处理输出文件 + // hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); - sam_close(nsgv::gInBamFp); // 重新打开bam文件 + sam_close(nsgv::gInBamFp); // bam nsgv::gInBamFp = sam_open_format(nsgv::gMdArg.INPUT_FILE.c_str(), "r", nullptr); if (!nsgv::gInBamFp) { spdlog::error("[{}] load sam/bam file failed.\n", __func__); @@ -122,8 +122,8 @@ int MarkDuplicates() { sam_close(nsgv::gInBamFp); return -1; } - // 输出index文件 - string indexFn = nsgv::gMdArg.OUTPUT_FILE + ".bai"; // min_shift = 0 是bai格式 + // index + string indexFn = nsgv::gMdArg.OUTPUT_FILE + ".bai"; // min_shift = 0 bai if ("sam" == getFileExtension(nsgv::gMdArg.OUTPUT_FILE) || !nsgv::gMdArg.CREATE_INDEX) { indexFn = ""; } @@ -141,7 +141,7 @@ int MarkDuplicates() { } } - // 读取输入文件,标记冗余并输出 + // , BamBufType inBuf(nsgv::gMdArg.DUPLEX_IO); inBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gMdArg.MAX_MEM); @@ -190,13 +190,13 @@ int MarkDuplicates() { isOpticalDup = false; isInDuplicateSet = false; - // 删除原来的duplicate tag + // duplicate tag if (nsgv::gMdArg.CLEAR_DT) { uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gMdArg.DUPLICATE_TYPE_TAG.c_str()); if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); } - // 统计信息 + // if (bw->GetReadUnmappedFlag()) { ++nsgv::gMetrics.UNMAPPED_READS; } else if (bw->IsSecondaryOrSupplementary()) { @@ -207,7 +207,7 @@ int MarkDuplicates() { ++nsgv::gMetrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end } - /* 判断是否冗余 */ + /* */ if (bamIdx == dupIdx) { ++realDupSize; // for test isDup = true; @@ -217,7 +217,7 @@ int MarkDuplicates() { duplicateSetSize = dupIdx.dupSet; } - // 为了防止小内存运行的时候,有重复的dupidx,这时候dup的repIdx和dupSetSize可能会有不同 + // ,dupidx,duprepIdxdupSetSize while ((dupIdx = dupIdxQue.Pop()) == bamIdx); if (opticalIdx == bamIdx) isOpticalDup = true; @@ -227,7 +227,7 @@ int MarkDuplicates() { isOpticalDup = true; } - // 添加冗余标识 + // bw->SetDuplicateReadFlag(true); uint8_t *oldTagVal = bam_aux_get(bw->b, nsgv::gMdArg.DUPLICATE_TYPE_TAG.c_str()); @@ -242,7 +242,7 @@ int MarkDuplicates() { nsgv::gMdArg.DUPLICATE_TYPE_LIBRARY.size() + 1, (const uint8_t *)nsgv::gMdArg.DUPLICATE_TYPE_LIBRARY.c_str()); - // 计算统计信息 + // if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) { // Update the duplication metrics if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) { @@ -282,7 +282,7 @@ int MarkDuplicates() { (const uint8_t *)&duplicateSetSize); } } - // 每个read都要写到output,只是添加标识,或者删掉这些冗余record + // readoutput,,record ++bamIdx; if (isDup && nsgv::gMdArg.REMOVE_DUPLICATES) { continue; @@ -312,7 +312,7 @@ int MarkDuplicates() { } bam_destroy1(bp); - // 计算统计信息 + // nsgv::gMetrics.READ_PAIRS_EXAMINED /= 2; nsgv::gMetrics.READ_PAIR_DUPLICATES /= 2; for (auto &arr : nsgv::gDupRes.opticalDupIdxArr) nsgv::gMetrics.READ_PAIR_OPTICAL_DUPLICATES += arr.size(); @@ -329,7 +329,7 @@ int MarkDuplicates() { } calculateRoiHistogram(nsgv::gMetrics); - // 将统计信息写到文件里 + // if (!nsgv::gMdArg.METRICS_FILE.empty()) { ofstream ofsM(nsgv::gMdArg.METRICS_FILE); ofsM << "## StringHeader" << endl; @@ -364,7 +364,7 @@ int MarkDuplicates() { return -1; } - /* 关闭文件,收尾清理 */ + /* , */ sam_close(nsgv::gOutBamFp); sam_close(nsgv::gInBamFp); diff --git a/src/markdup/md_args.h b/src/markdup/md_args.h index 5ce449b..299fde3 100644 --- a/src/markdup/md_args.h +++ b/src/markdup/md_args.h @@ -1,5 +1,5 @@ /* -Description: Markduplicate需要用到的一些参数 +Description: Markduplicate Copyright : All right reserved by ICT @@ -40,7 +40,7 @@ enum ValidationStringency { */ enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All }; -/* 排序的方式 */ +/* */ enum SortOrder { unsorted, queryname, @@ -49,14 +49,14 @@ enum SortOrder { unknown }; -/* 计算reads分数的方式(比那个read得分更高) */ +/* reads(read) */ enum ScoringStrategy { SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM }; -/* 索引文件的格式 (bai或者csi) */ +/* (baicsi) */ enum IndexFormat { BAI, CSI }; } // namespace nsmd -/* markduplicate 需要的参数*/ +/* markduplicate */ struct MarkDupsArg { string INPUT_FILE; // input bam filename @@ -64,9 +64,9 @@ struct MarkDupsArg { int NUM_THREADS = 1; - size_t MAX_MEM = ((size_t)1) << 30; // // 最小1G + size_t MAX_MEM = ((size_t)1) << 30; // // 1G - bool DUPLEX_IO = true; // 同时读写 + bool DUPLEX_IO = true; // /** * The optional attribute in SAM/BAM/CRAM files used to store the duplicate type. @@ -156,7 +156,7 @@ struct MarkDupsArg { optional = true) */ string MOLECULAR_IDENTIFIER_TAG = ""; - /* 继承自 AbstractMarkDuplicatesCommandLineProgram 的参数*/ + /* AbstractMarkDuplicatesCommandLineProgram */ /* "File to write duplication metrics to" */ string METRICS_FILE; @@ -196,7 +196,7 @@ struct MarkDupsArg { optional = true */ vector COMMENT; - /* 继承自 AbstractOpticalDuplicateFinderCommandLineProgram 的参数 */ + /* AbstractOpticalDuplicateFinderCommandLineProgram */ /* "MarkDuplicates can use the tile and cluster positions to estimate the rate of optical duplication " + "in addition to the dominant source of duplication, PCR, to provide a more accurate estimation of library @@ -227,7 +227,7 @@ struct MarkDupsArg { completely disable this check, " + "set the value to -1." */ long MAX_OPTICAL_DUPLICATE_SET_SIZE = DEFAULT_MAX_DUPLICATE_SET_SIZE; - /* 继承自 CommandLineProgram 的参数*/ + /* CommandLineProgram */ /* "Whether to suppress job-summary info on System.err.", common = true */ bool QUIET = false; @@ -256,9 +256,9 @@ struct MarkDupsArg { /* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/ bool ADD_PG_TAG_TO_READS = true; - // 命令行字符串 + // string CLI_STR; - // 开始运行时间 + // string START_TIME; }; \ No newline at end of file diff --git a/src/markdup/md_funcs.cpp b/src/markdup/md_funcs.cpp index b62e2fe..cad85f4 100644 --- a/src/markdup/md_funcs.cpp +++ b/src/markdup/md_funcs.cpp @@ -30,7 +30,7 @@ extern DuplicationMetrics gMetrics; }; /* - * 计算read的分数 + * read */ int16_t computeDuplicateScore(BamWrap &bw) { int16_t score = 0; @@ -71,7 +71,7 @@ int16_t computeDuplicateScore(BamWrap &bw) { /* * Builds a read ends object that represents a single read. - * 用来表示一个read的特征结构 + * read */ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) { auto &k = *pKey; @@ -93,11 +93,11 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd else nsgv::gMdArg.CHECK_OPTICAL_DUP = false; // cout << k.tile << ' ' << k.x << ' ' << k.y << endl; - // 计算位置key + // key k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation; } -/* 对找到的pairend read end添加一些信息 */ +/* pairend read end */ void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { auto &pairedEnds = *pPairedEnds; @@ -202,7 +202,7 @@ static void findOpticalDuplicates(vector &readEndsArr, const R if (currentLoc == pBestRe) keeperIndex = i; if (currentLoc->tile != ReadEnds::NO_VALUE) { - int key = currentLoc->tile; // 只处理一个样本,所以只有一个read group + int key = currentLoc->tile; // ,read group tileRGmap[key].push_back(i); } opticalDistanceRelationGraph.addNode(i); @@ -323,7 +323,7 @@ static int checkOpticalDuplicates(vector &readEndsArr, const R } /** - * 记录光学原因造成的冗余 + * */ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { bool hasFR = false, hasRF = false; @@ -358,7 +358,7 @@ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnd nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe); } - // 统计信息,trackDuplicateCounts + // ,trackDuplicateCounts ++nsgv::gMetrics.DuplicateCountHist[readEndsArr.size()]; if (readEndsArr.size() > nOpticalDup) ++nsgv::gMetrics.NonOpticalDuplicateCountHist[readEndsArr.size() - nOpticalDup]; diff --git a/src/markdup/md_funcs.h b/src/markdup/md_funcs.h index c8a70cd..b3ff9c5 100644 --- a/src/markdup/md_funcs.h +++ b/src/markdup/md_funcs.h @@ -14,19 +14,19 @@ using std::unordered_map; using std::unordered_set; using std::vector; -/* 前向声明 */ +/* */ class BamWrap; class ReadEnds; class ReadNameParser; /* - * 用来检测optical duplication的graph + * optical duplicationgraph */ template -struct Graph { // 用set? - vector nodes; // 图中的结点 +struct Graph { // set? + vector nodes; // unordered_map nodeIdxMap; - unordered_map> neighbors; // 邻接列表 + unordered_map> neighbors; // int addNode(const Node &singleton) { int idx = -1; @@ -98,18 +98,18 @@ struct Graph { // 用set? }; /* - * 计算read的分数 + * read */ int16_t computeDuplicateScore(BamWrap &bw); /* * Builds a read ends object that represents a single read. - * 用来表示一个read的特征结构 + * read */ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey); /* - * 对找到的pairend read end添加一些信息 + * pairend read end */ void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds); @@ -118,7 +118,7 @@ void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds); * in fact optical duplicates, and stores the data in the instance level histogram. * Additionally sets the transient isOpticalDuplicate flag on each read end that is * identified as an optical duplicate. - * 记录光学原因造成的冗余 + * */ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe); diff --git a/src/markdup/md_pipeline.cpp b/src/markdup/md_pipeline.cpp index 8096d5e..5f51905 100644 --- a/src/markdup/md_pipeline.cpp +++ b/src/markdup/md_pipeline.cpp @@ -22,16 +22,16 @@ using namespace std; namespace nsgv { -extern MarkDupsArg gMdArg; // 用来测试性能 -extern samFile *gInBamFp; // 输入的bam文件 -extern sam_hdr_t *gInBamHeader; // 输入的bam文件头信息 -extern DuplicationMetrics gMetrics; // 统计信息 +extern MarkDupsArg gMdArg; // +extern samFile *gInBamFp; // bam +extern sam_hdr_t *gInBamHeader; // bam +extern DuplicationMetrics gMetrics; // extern vector gNameParsers; extern DupResult gDupRes; extern PipelineArg gPipe; }; // namespace nsgv -/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ +/* pairendreadends,, ,*/ static void markDupsForPairs(vector &vpRe, DPSet *dupIdx, MDSet *opticalDupIdx, DPSet *repIdx, MDSet *notDupIdx = nullptr, MDSet *notOpticalDupIdx = nullptr, MDSet *notRepIdx = nullptr) { @@ -41,7 +41,7 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup int maxScore = 0; const ReadEnds *pBest = nullptr; /** All read ends should have orientation FF, FR, RF, or RR **/ - for (auto pe : vpRe) { // 找分数最高的readend + for (auto pe : vpRe) { // readend if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; @@ -53,7 +53,7 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup notDupIdx->insert(pBest->read2IndexInFile); } - if (nsgv::gMdArg.CHECK_OPTICAL_DUP) { // 检查光学冗余 + if (nsgv::gMdArg.CHECK_OPTICAL_DUP) { // // trackOpticalDuplicates vector prevOpticalRe; if (notOpticalDupIdx != nullptr) { @@ -64,7 +64,7 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup } } trackOpticalDuplicates(vpRe, pBest); - // 由于重叠问题,可能会更新数据 + // , if (notOpticalDupIdx != nullptr) { for (auto pe : prevOpticalRe) { if (!pe->isOpticalDuplicate) { @@ -74,13 +74,13 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup } } } - for (auto pe : vpRe) { // 对非best read标记冗余 - if (pe != pBest) { // 非best - dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1 + for (auto pe : vpRe) { // best read + if (pe != pBest) { // best + dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read1 if (pe->read2IndexInFile != pe->read1IndexInFile) dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2, - // 注意这里代表是read1的索引 - // 检查是否optical dup + // read1 + // optical dup if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) { opticalDupIdx->insert(pe->read1IndexInFile); if (pe->read2IndexInFile != pe->read1IndexInFile) @@ -88,7 +88,7 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup } } } - // 在输出的bam文件中添加tag, best作为dupset的代表 + // bamtag, bestdupset if (nsgv::gMdArg.TAG_DUPLICATE_SET_MEMBERS) { repIdx->insert(DupInfo(pBest->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); repIdx->insert(DupInfo(pBest->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); @@ -104,7 +104,7 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup } } -/* 处理一组非paired的readends,标记冗余 */ +/* pairedreadends, */ static void markDupsForFrags(vector &vpRe, bool containsPairs, DPSet *dupIdx, MDSet *notDupIdx = nullptr) { if (containsPairs) { @@ -133,28 +133,28 @@ static void markDupsForFrags(vector &vpRe, bool containsPairs, } } -/* 找到与readend pos相等的所有readend */ +/* readend posreadend */ static void getEqualRE(const ReadEnds &re, vector &src, vector *dst) { - auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::CorLittleThan); // 只比对位点 + auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::CorLittleThan); // dst->insert(dst->end(), range.first, range.second); } #define LOWER_BOUND(tid, nthread, ndata) ((tid) * (ndata) / (nthread)) #define UPPER_BOUND(tid, nthread, ndata) ((tid + 1) * (ndata) / (nthread)) -/* 处理pairs */ +/* pairs */ static void processPairs(vector &readEnds, DPSet *dupIdx, MDSet *opticalDupIdx, DPSet *repIdx, MDSet *notDupIdx = nullptr, MDSet *notOpticalDupIdx = nullptr, MDSet *notRepIdx = nullptr) { // return; - vector vpCache; // 有可能是冗余的reads + vector vpCache; // reads const ReadEnds *pReadEnd = nullptr; for (auto &re : readEnds) { - if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 - vpCache.push_back(&re); // 处理一个潜在的冗余组 + if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // + vpCache.push_back(&re); // else { markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, - notRepIdx); // 不一样 + notRepIdx); // vpCache.clear(); vpCache.push_back(&re); pReadEnd = &re; @@ -163,11 +163,11 @@ static void processPairs(vector &readEnds, DPSet *dupIdx, MDS markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx); } -/* 处理frags */ +/* frags */ static void processFrags(vector &readEnds, DPSet *dupIdx, MDSet *notDupIdx = nullptr) { bool containsPairs = false; bool containsFrags = false; - vector vpCache; // 有可能是冗余的reads + vector vpCache; // reads const ReadEnds *pReadEnd = nullptr; for (auto &re : readEnds) { if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) { @@ -191,7 +191,7 @@ static void processFrags(vector &readEnds, DPSet *dupIdx, MDS } -/* 获取交叉部分的数据 */ +/* */ static inline void getIntersectData(vector &leftArr, vector &rightArr, vector *dst, bool isPairCmp = false) { if (leftArr.empty() || rightArr.empty()) { @@ -204,7 +204,7 @@ static inline void getIntersectData(vector &leftArr, vector while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) { leftSpan += 1; - if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了(可能会有bug,上上个也与下一个有交集呢?) + if (leftSpan > leftEndIdx) { // (bug,?) leftSpan = leftArr.size() - 1; break; } @@ -212,7 +212,7 @@ static inline void getIntersectData(vector &leftArr, vector while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) { rightSpan += 1; - if (rightSpan == rightArr.size() - 1) // 同上,可能会有bug + if (rightSpan == rightArr.size() - 1) // ,bug break; } dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); @@ -223,7 +223,7 @@ static inline void getIntersectData(vector &leftArr, vector std::sort(dst->begin(), dst->end(), ReadEnds::FragLittleThan); } -/* 将frags重叠部分的dup idx放进数据中 */ +/* fragsdup idx */ static inline void refreshFragDupIdx(DPSet &dupIdx, MDSet ¬DupIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) { auto &lp = *lastArg; @@ -259,26 +259,26 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) { PROF_START(gen); size_t start_id = LOWER_BOUND(idx, nThread, bams.size()); size_t end_id = UPPER_BOUND(idx, nThread, bams.size()); - for (size_t i = start_id; i < end_id; ++i) { // 循环处理每个read + for (size_t i = start_id; i < end_id; ++i) { // read BamWrap *bw = bams[i]; const int64_t bamIdx = bamStartIdx + i; if (bw->GetReadUnmappedFlag()) { if (bw->b->core.tid == -1) // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). break; - } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对 + } else if (!bw->IsSecondaryOrSupplementary()) { // ReadEnds fragEnd; buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); - frags.push_back(fragEnd); // 添加进frag集合 - if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了 + frags.push_back(fragEnd); // frag + if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // pairendread string key = bw->query_name(); if (unpairedDic.find(key) == unpairedDic.end()) { unpairedDic[key] = {taskSeq, fragEnd}; - } else { // 找到了pairend + } else { // pairend auto &pairedEnds = unpairedDic.at(key).unpairedRE; modifyPairedEnds(fragEnd, &pairedEnds); pairs.push_back(pairedEnds); - unpairedDic.erase(key); // 删除找到的pairend + unpairedDic.erase(key); // pairend } } } @@ -302,8 +302,8 @@ static void doGenRE(PipelineArg &pipeArg) { const int numThread = pipeArg.numThread; kt_for(numThread, mtGenerateReadEnds, &pipeArg, numThread); - // 用来在genRE阶段计算一些Sort阶段应该计算的数据,保持每个step用时更平衡 - // 轮询每个线程中未找到匹配的read,找到匹配的 + // genRESort,step + // read, genREData.unpairedDic.clear(); vector &pairs = genREData.pairsArr[numThread]; pairs.clear(); @@ -316,11 +316,11 @@ static void doGenRE(PipelineArg &pipeArg) { const ReadEnds &fragEnd = val.second.unpairedRE; if (genREData.unpairedDic.find(key) == genREData.unpairedDic.end()) { genREData.unpairedDic[key] = {readData.taskSeq, fragEnd}; - } else { // 找到了pairend + } else { // pairend auto &pairedEnds = genREData.unpairedDic.at(key).unpairedRE; modifyPairedEnds(fragEnd, &pairedEnds); pairs.push_back(pairedEnds); - genREData.unpairedDic.erase(key); // 删除找到的pairend + genREData.unpairedDic.erase(key); // pairend } } } @@ -367,11 +367,11 @@ static void doMarkDup(PipelineArg &pipeArg) { sortData.dataPtr = tmpPtr; SortMarkData &smd = *(SortMarkData *)mdData.dataPtr; - // 先处理 pair + // pair PROF_START(markdup_pair); processPairs(smd.pairs, &mdData.pairDupIdx, &mdData.pairOpticalDupIdx, &mdData.pairRepIdx); PROF_END(gprof[GP_markdup_pair], markdup_pair); - // 再处理frag + // frag PROF_START(markdup_frag); processFrags(smd.frags, &mdData.fragDupIdx); PROF_END(gprof[GP_markdup_frag], markdup_frag); @@ -418,7 +418,7 @@ static void refreshMarkDupData(DPSet &dupIdx, MDSet &opticalDu refreshNotDupIdx(repIdx, p.pairRepIdx); } -// 处理相邻数据块之间重叠的部分 +// static void processIntersectFragPairs(MarkDupData &lp, MarkDupData &cp) { SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &csm = *(SortMarkData *)cp.dataPtr; @@ -430,7 +430,7 @@ static void processIntersectFragPairs(MarkDupData &lp, MarkDupData &cp) { MDSet notOpticalDupIdx; MDSet notDupIdx; MDSet notRepIdx; - // 先处理重叠的frags + // frags getIntersectData(lsm.frags, csm.frags, &reArr); processFrags(reArr, &dupIdx, ¬DupIdx); refreshDupIdx(dupIdx, lp.fragDupIdx); @@ -438,34 +438,34 @@ static void processIntersectFragPairs(MarkDupData &lp, MarkDupData &cp) { refreshNotDupIdx(notDupIdx, lp.fragDupIdx); refreshNotDupIdx(notDupIdx, cp.fragDupIdx); - // 再处理重叠的pairs + // pairs reArr.clear(); dupIdx.clear(); notDupIdx.clear(); getIntersectData(lsm.pairs, csm.pairs, &reArr, true); processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, ¬DupIdx, ¬OpticalDupIdx, ¬RepIdx); - refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, cp, lp); // 放在cp里,因为后面global里可能有相同的dup,防止多次出现 + refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, cp, lp); // cp,globaldup, } -// 在相邻的数据块之间寻找未匹配的readends, 找到匹配的放到lp里 +// readends, lp static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) { auto &interPairedData = lp.ckeyReadEndsMap; SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &csm = *(SortMarkData *)cp.dataPtr; - for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end(); ) { // 遍历上一个任务中的每个未匹配的read + for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end(); ) { // read auto &lastUnpair = *itr; auto &readName = lastUnpair.first; auto &lastUnpairInfo = lastUnpair.second; - auto lastRE = lastUnpairInfo.unpairedRE; // 未匹配的read end - if (csm.unpairedDic.find(readName) != csm.unpairedDic.end()) { // 在当前这个任务里找到了这个未匹配的read + auto lastRE = lastUnpairInfo.unpairedRE; // read end + if (csm.unpairedDic.find(readName) != csm.unpairedDic.end()) { // read auto &curUnpairInfo = csm.unpairedDic[readName]; auto &curRE = curUnpairInfo.unpairedRE; - modifyPairedEnds(curRE, &lastRE); // lastRE当做找到匹配后,完整的ReadEnds + modifyPairedEnds(curRE, &lastRE); // lastRE,ReadEnds CalcKey ck(lastRE); auto &pairArr = interPairedData[ck]; pairArr.push_back(lastRE); - // 从dict中清除配对后的readends + // dictreadends csm.unpairedDic.erase(readName); itr = lsm.unpairedDic.erase(itr); } else { @@ -474,23 +474,23 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) { } } -// 在global和lp中寻找未匹配的readends, 找到匹配的放到global里 +// globallpreadends, global static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) { auto &interPairedData = g.ckeyReadEndsMap; SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; - for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end();) { // 遍历上一个任务中的每个未匹配的read + for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end();) { // read auto &lastUnpair = *itr; auto &readName = lastUnpair.first; auto &lastUnpairInfo = lastUnpair.second; - auto lastRE = lastUnpairInfo.unpairedRE; // 未匹配的read end - if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在global里找到了这个未匹配的read + auto lastRE = lastUnpairInfo.unpairedRE; // read end + if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // globalread auto &gUnpairInfo = g.unpairedDic[readName]; auto &gRE = gUnpairInfo.unpairedRE; - modifyPairedEnds(lastRE, &gRE); // gRE当做找到匹配后,完整的ReadEnds + modifyPairedEnds(lastRE, &gRE); // gRE,ReadEnds CalcKey ck(gRE); auto &pairArr = interPairedData[ck]; pairArr.push_back(gRE); - // 从dict中清除配对后的readends + // dictreadends g.unpairedDic.erase(readName); itr = lsm.unpairedDic.erase(itr); } else { @@ -528,10 +528,10 @@ static void doIntersect(PipelineArg &pipeArg) { SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &csm = *(SortMarkData *)cp.dataPtr; - // 处理相邻数据块之间重叠的部分 + // processIntersectFragPairs(lp, cp); - // 检查确保lp和np之间没有数据交叉 + // lpnp int64_t lastLeft = INT64_MIN, lastRight = INT64_MAX, curLeft = INT64_MAX, curRight = INT64_MAX; if (lsm.pairs.size() > 0) { lastLeft = lsm.frags[0].Left(); @@ -549,21 +549,21 @@ static void doIntersect(PipelineArg &pipeArg) { } g.rightPos = lastRight; - findUnpairedInDatas(lp, cp); // 找到的匹配放到lp里 - findUnpairedInGlobal(g, cp); // 把cp中未匹配的放到global里保存 + findUnpairedInDatas(lp, cp); // lp + findUnpairedInGlobal(g, cp); // cpglobal - // 处理lp中的新找到的匹配 + // lp TaskSeqDupInfo t; for (auto itr = lp.ckeyReadEndsMap.begin(); itr != lp.ckeyReadEndsMap.end();) { auto &ckVal = *itr; auto &ck = ckVal.first; auto &pairArr = ckVal.second; - getEqualRE(pairArr[0], lsm.pairs, &pairArr); // 如果不在计算范围内,会放在global里 - if (ck.Right() <= lastRight) { // 必须在当前数据块的范围内, 才进行处理 - if (ck.Left() >= curLeft) { // 在交叉的范围内才去加上这些在cp中的数据 + getEqualRE(pairArr[0], lsm.pairs, &pairArr); // ,global + if (ck.Right() <= lastRight) { // , + if (ck.Left() >= curLeft) { // cp getEqualRE(pairArr[0], csm.pairs, &pairArr); } - // 在global里找一找ck + // globalck auto gitr = g.ckeyReadEndsMap.find(ck); if (gitr != g.ckeyReadEndsMap.end()) { auto &gPairArr = gitr->second; @@ -578,7 +578,7 @@ static void doIntersect(PipelineArg &pipeArg) { } } - // 处理找到匹配的global数据 + // global for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) { auto &ckVal = *itr; auto &ck = ckVal.first; @@ -586,7 +586,7 @@ static void doIntersect(PipelineArg &pipeArg) { if (ck.Left() >= lastLeft) { getEqualRE(pairArr[0], lsm.pairs, &pairArr); } - if (ck.Right() <= lastRight) { // 只有在这个范围内,对应位点的所有reads才完全都包含了 + if (ck.Right() <= lastRight) { // ,reads sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan); processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx); itr = g.ckeyReadEndsMap.erase(itr); @@ -595,16 +595,16 @@ static void doIntersect(PipelineArg &pipeArg) { } } - // 剩余的在lp中没处理的放到global里 + // lpglobal for (auto &ckVal : lp.ckeyReadEndsMap) { auto &pairArr = g.ckeyReadEndsMap[ckVal.first]; pairArr.insert(pairArr.end(), ckVal.second.begin(), ckVal.second.end()); } lp.ckeyReadEndsMap.clear(); - // 更新一下冗余结果 + // refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp); - // 处理g中新找到的匹配 + // g putDupinfoToGlobal(g, lp); for (auto &unPair : lsm.unpairedDic) { @@ -620,16 +620,16 @@ static void *pipeRead(void *data) { while (1) { PROF_START(read_wait); yarn::POSSESS(pipeArg.readSig); - yarn::WAIT_FOR(pipeArg.readSig, yarn::NOT_TO_BE, 1); // 只有一个坑,因为在bambuf内部支持异步读取 + yarn::WAIT_FOR(pipeArg.readSig, yarn::NOT_TO_BE, 1); // ,bambuf PROF_END(gprof[GP_read_wait], read_wait); size_t readNum = 0; PROF_START(read); if (inBamBuf.ReadStat() >= 0) - readNum = inBamBuf.ReadBam(); // 读入新一轮的数据 + readNum = inBamBuf.ReadBam(); // PROF_END(gprof[GP_read], read); if (readNum < 1) { pipeArg.readFinish = 1; - yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据 + yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // break; } spdlog::info("{} reads processed in {} round", readNum, pipeArg.readOrder); @@ -640,9 +640,9 @@ static void *pipeRead(void *data) { pipeArg.readData.taskSeq = pipeArg.readOrder; readNumSum += readNum; - inBamBuf.ClearAll(); // 清理上一轮读入的数据 + inBamBuf.ClearAll(); // pipeArg.readOrder += 1; // for next - yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据 + yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // } spdlog::info("{} total reads processed", readNumSum); return 0; @@ -659,21 +659,21 @@ static void *pipeGenRE(void *data) { while (1) { PROF_START(gen_wait); yarn::POSSESS(pipeArg.readSig); - yarn::WAIT_FOR(pipeArg.readSig, yarn::NOT_TO_BE, 0); // 等待有数据 + yarn::WAIT_FOR(pipeArg.readSig, yarn::NOT_TO_BE, 0); // yarn::POSSESS(pipeArg.genRESig); PROF_END(gprof[GP_gen_wait], gen_wait); - yarn::WAIT_FOR(pipeArg.genRESig, yarn::NOT_TO_BE, pipeArg.GENBUFNUM); // 有BUFNUM个坑 - yarn::RELEASE(pipeArg.genRESig); // 因为有不止一个位置,所以要释放 + yarn::WAIT_FOR(pipeArg.genRESig, yarn::NOT_TO_BE, pipeArg.GENBUFNUM); // BUFNUM + yarn::RELEASE(pipeArg.genRESig); // , - if (pipeArg.readFinish) { // 读取结束的时候,没有新的数据需要处理了 + if (pipeArg.readFinish) { // , yarn::POSSESS(pipeArg.genRESig); pipeArg.genREFinish = 1; yarn::TWIST(pipeArg.genRESig, yarn::BY, 1); yarn::TWIST(pipeArg.readSig, yarn::BY, -1); break; } - /* 处理bam,生成readends */ + /* bam,readends */ PROF_START(gen); doGenRE(pipeArg); PROF_END(gprof[GP_gen], gen); @@ -681,7 +681,7 @@ static void *pipeGenRE(void *data) { yarn::POSSESS(pipeArg.genRESig); pipeArg.genREOrder += 1; yarn::TWIST(pipeArg.genRESig, yarn::BY, 1); - yarn::TWIST(pipeArg.readSig, yarn::BY, -1); // 使用了一次读入的数据 + yarn::TWIST(pipeArg.readSig, yarn::BY, -1); // } return 0; } @@ -691,19 +691,19 @@ static void *pipeSort(void *data) { while (1) { PROF_START(sort_wait); yarn::POSSESS(pipeArg.genRESig); - yarn::WAIT_FOR(pipeArg.genRESig, yarn::NOT_TO_BE, 0); // 等待有数据 + yarn::WAIT_FOR(pipeArg.genRESig, yarn::NOT_TO_BE, 0); // yarn::RELEASE(pipeArg.genRESig); PROF_END(gprof[GP_sort_wait], sort_wait); yarn::POSSESS(pipeArg.sortSig); - yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, pipeArg.SORTBUFNUM); // 有BUFNUM个位置 + yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, pipeArg.SORTBUFNUM); // BUFNUM yarn::RELEASE(pipeArg.sortSig); if (pipeArg.genREFinish) { - // 处理完剩余数据 + // while (pipeArg.sortOrder < pipeArg.genREOrder) { yarn::POSSESS(pipeArg.sortSig); - yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, pipeArg.SORTBUFNUM); // 有BUFNUM个位置 + yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, pipeArg.SORTBUFNUM); // BUFNUM yarn::RELEASE(pipeArg.sortSig); PROF_START(sort); @@ -719,7 +719,7 @@ static void *pipeSort(void *data) { yarn::TWIST(pipeArg.sortSig, yarn::BY, 1); break; } - /* 排序 readends */ + /* readends */ PROF_START(sort); doSort(pipeArg); PROF_END(gprof[GP_sort], sort); @@ -739,7 +739,7 @@ static void *pipeMarkDup(void *data) { while (1) { PROF_START(markdup_wait); yarn::POSSESS(pipeArg.sortSig); - yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, 0); // 等待有数据 + yarn::WAIT_FOR(pipeArg.sortSig, yarn::NOT_TO_BE, 0); // yarn::RELEASE(pipeArg.sortSig); PROF_END(gprof[GP_markdup_wait], markdup_wait); @@ -748,7 +748,7 @@ static void *pipeMarkDup(void *data) { yarn::RELEASE(pipeArg.markDupSig); if (pipeArg.sortFinish) { - // 应该还得处理剩余的数据 + // while (pipeArg.markDupOrder < pipeArg.sortOrder) { yarn::POSSESS(pipeArg.markDupSig); yarn::WAIT_FOR(pipeArg.markDupSig, yarn::NOT_TO_BE, pipeArg.MARKBUFNUM); @@ -767,7 +767,7 @@ static void *pipeMarkDup(void *data) { yarn::TWIST(pipeArg.markDupSig, yarn::TO, 2 + pipeArg.MARKBUFNUM); break; } - /* 冗余检测 readends */ + /* readends */ PROF_START(markdup); doMarkDup(pipeArg); PROF_END(gprof[GP_markdup], markdup); @@ -787,7 +787,7 @@ static void *pipeIntersect(void *data) { while (1) { PROF_START(intersect_wait); yarn::POSSESS(pipeArg.markDupSig); - yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, kInitIntersectOrder); // 等待有数据 + yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, kInitIntersectOrder); // yarn::RELEASE(pipeArg.markDupSig); PROF_END(gprof[GP_intersect_wait], intersect_wait); @@ -801,7 +801,7 @@ static void *pipeIntersect(void *data) { break; } - /* 交叉数据处理 readends */ + /* readends */ PROF_START(intersect); doIntersect(pipeArg); PROF_END(gprof[GP_intersect], intersect); @@ -814,7 +814,7 @@ static void *pipeIntersect(void *data) { return 0; } -/* 当所有任务结束后,global data里还有未处理的数据 */ +/* ,global data */ static void processLastData(PipelineArg &pipeArg) { IntersectData &g = pipeArg.intersectData; MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM]; @@ -823,7 +823,7 @@ static void processLastData(PipelineArg &pipeArg) { if (lsm.pairs.size() > 0) { lastLeft = lsm.frags[0].Left(); } - // 处理找到匹配的global数据 + // global TaskSeqDupInfo t; for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) { auto &ckVal = *itr; @@ -836,9 +836,9 @@ static void processLastData(PipelineArg &pipeArg) { processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx); itr = g.ckeyReadEndsMap.erase(itr); } - // 更新一下冗余结果 + // refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp); - // 处理g中新找到的匹配 + // g putDupinfoToGlobal(g, lp); } @@ -866,7 +866,7 @@ static void parallelPipeline() { // spdlog::info("result size : {} GB", nsgv::gDupRes.byteSize() / 1024.0 / 1024 / 1024); } -/* 并行流水线方式处理数据,标记冗余 */ +/* , */ void PipelineMarkDups() { if (nsgv::gMdArg.NUM_THREADS > 1) return parallelPipeline(); @@ -879,11 +879,11 @@ void PipelineMarkDups() { for (int i = 0; i < pipeArg.GENBUFNUM; ++i) { pipeArg.genREData[i].Init(pipeArg.numThread); } - pipeArg.intersectOrder = 1; // do intersect 从1开始 + pipeArg.intersectOrder = 1; // do intersect 1 while (1) { size_t readNum = 0; if (inBamBuf.ReadStat() >= 0) - readNum = inBamBuf.ReadBam(); // 读入新一轮的数据 + readNum = inBamBuf.ReadBam(); // if (readNum < 1) { break; } @@ -908,7 +908,7 @@ void PipelineMarkDups() { } readNumSum += readNum; - inBamBuf.ClearAll(); // 清理上一轮读入的数据 + inBamBuf.ClearAll(); // pipeArg.readOrder += 1; // for next } processLastData(pipeArg); diff --git a/src/markdup/md_pipeline.h b/src/markdup/md_pipeline.h index 4cfd189..489e1b4 100644 --- a/src/markdup/md_pipeline.h +++ b/src/markdup/md_pipeline.h @@ -8,22 +8,22 @@ struct ReadData { vector bams; // read step output - int64_t bamStartIdx = 0; // 每轮读入的bam数组,起始位置在全局bam中的索引 - int64_t taskSeq = 0; // 任务序号 + int64_t bamStartIdx = 0; // bam,bam + int64_t taskSeq = 0; // }; struct GenREData { - vector> pairsArr; // 成对的reads - vector> fragsArr; // 暂未找到配对的reads - vector unpairedDicArr; // 用来寻找pair end + vector> pairsArr; // reads + vector> fragsArr; // reads + vector unpairedDicArr; // pair end void Init(int nThread) { - for (int i = 0; i <= nThread; ++i) { // 比线程多一个,主要是pairs多一个,其他没用 + for (int i = 0; i <= nThread; ++i) { // ,pairs, pairsArr.push_back(vector()); fragsArr.push_back(vector()); unpairedDicArr.push_back(UnpairedNameMap()); } } - UnpairedNameMap unpairedDic; // 代替sort step中一部分计算 + UnpairedNameMap unpairedDic; // sort step size_t byteSize() { size_t bytes = 0; for (auto &v : pairsArr) @@ -38,9 +38,9 @@ struct GenREData { }; struct SortMarkData { - vector pairs; // 成对的reads - vector frags; // 暂未找到配对的reads - UnpairedNameMap unpairedDic; // 用来寻找pair end + vector pairs; // reads + vector frags; // reads + UnpairedNameMap unpairedDic; // pair end size_t byteSize() { size_t bytes = 0; for (auto &r : pairs) bytes += sizeof(r); @@ -56,11 +56,11 @@ struct SortData { }; struct MarkDupData { - int64_t taskSeq = 0; // 任务序号 - DPSet pairDupIdx; // pair的冗余read的索引 - MDSet pairOpticalDupIdx; // optical冗余read的索引 - DPSet fragDupIdx; // frag的冗余read的索引 - DPSet pairRepIdx; // pair的dupset代表read的索引 + int64_t taskSeq = 0; // + DPSet pairDupIdx; // pairread + MDSet pairOpticalDupIdx; // opticalread + DPSet fragDupIdx; // fragread + DPSet pairRepIdx; // pairdupsetread CkeyReadEndsMap ckeyReadEndsMap; volatile void *dataPtr; // SortMarkData pointer @@ -111,11 +111,11 @@ struct DupResult { }; struct IntersectData { - UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedNameMap unpairedDic; // pair end CkeyReadEndsMap ckeyReadEndsMap; int64_t rightPos = 0; - // 每个task对应一个vector + // taskvector vector> &dupIdxArr; vector> &opticalDupIdxArr; vector> &repIdxArr; @@ -138,7 +138,7 @@ struct IntersectData { } }; -// 记录流水线状态,task的序号,以及某阶段是否结束 +// ,task, struct PipelineArg { static const int GENBUFNUM = 2; static const int SORTBUFNUM = 2; @@ -161,8 +161,8 @@ struct PipelineArg { yarn::lock_t *markDupSig; PipelineArg(DupResult *resPtr) : intersectData(resPtr) { - readSig = yarn::NEW_LOCK(0); // 最大值1, 双buffer在bambuf中实现了,对调用透明 - genRESig = yarn::NEW_LOCK(0); // 最大值2, 双buffer + readSig = yarn::NEW_LOCK(0); // 1, bufferbambuf, + genRESig = yarn::NEW_LOCK(0); // 2, buffer sortSig = yarn::NEW_LOCK(0); markDupSig = yarn::NEW_LOCK(0); for (int i = 0; i < SORTBUFNUM; ++i) { @@ -208,8 +208,8 @@ struct PipelineArg { }; struct REArrIdIdx { - int arrId = 0; // 数组索引 - uint64_t arrIdx = 0; // 数组内部当前索引 + int arrId = 0; // + uint64_t arrIdx = 0; // const ReadEnds *pE = nullptr; }; @@ -224,7 +224,7 @@ struct REPairGreaterThan { template struct ReadEndsHeap { - // 将冗余索引和他对应的task vector对应起来 + // task vector vector> *arr2d; priority_queue, CMP> minHeap; uint64_t popNum = 0; @@ -269,5 +269,5 @@ struct ReadEndsHeap { } }; -// 并行运行mark duplicate +// mark duplicate void PipelineMarkDups(); \ No newline at end of file diff --git a/src/markdup/md_types.h b/src/markdup/md_types.h index 7993773..47fdd73 100644 --- a/src/markdup/md_types.h +++ b/src/markdup/md_types.h @@ -19,13 +19,13 @@ using std::string; using std::unordered_set; using std::vector; -/* 存放未匹配readend相同位点的所有readend */ +/* readendreadend */ struct UnpairedREInfo { - int64_t taskSeq; // 对应第几轮计算 + int64_t taskSeq; // ReadEnds unpairedRE; }; -/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */ +/* pair,,read1read2 */ struct CalcKey { int8_t orientation = -1; int32_t read1ReferenceIndex = -1; @@ -70,7 +70,7 @@ struct CalcKey { comp = read2ReferenceIndex - o.read2ReferenceIndex; if (comp == 0) comp = read2Coordinate - o.read2Coordinate; - // 需要orientation,因为要跟排序的比较方式和顺序一致 + // orientation, if (comp == 0) comp = orientation - o.orientation; return comp < 0; @@ -91,7 +91,7 @@ struct CalcKey { comp = read2ReferenceIndex - o.read2ReferenceIndex; if (comp == 0) comp = read2Coordinate - o.read2Coordinate; - // 需要orientation,因为要跟排序的比较方式和顺序一致 + // orientation, if (comp == 0) comp = orientation - o.orientation; return comp < 0; @@ -113,10 +113,10 @@ struct CalcKeyEqual { bool operator()(const CalcKey &o1, const CalcKey &o2) const { return o1 == o2; } }; -/* 用来记录冗余索引相关的信息 */ +/* */ struct DupInfo { int16_t dupSet = 0; // dup set size - uint16_t repIdxHigh = 0; // 这一批冗余中的非冗余read 代表的索引 + uint16_t repIdxHigh = 0; // read uint32_t repIdxLow = 0; int64_t idx; @@ -164,7 +164,7 @@ using CalcSet = set; using ReadEndsSet = tsl::robin_set; -/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */ +/* pair read, */ struct TaskSeqDupInfo { DPSet dupIdx; MDSet opticalDupIdx; @@ -174,7 +174,7 @@ struct TaskSeqDupInfo { MDSet notRepIdx; }; -/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */ +/* pair,read endread end */ struct UnpairedPosInfo { int unpairedNum = 0; int64_t taskSeq; @@ -184,33 +184,33 @@ struct UnpairedPosInfo { // typedef unordered_map UnpairedNameMap; // typedef unordered_map UnpairedPositionMap; -typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read -//typedef map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read -typedef tsl::robin_map UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 -// typedef map> CkeyReadEndsMap; // 以calckey为关键字,保存在相邻数据块之前找到的匹配readEnds +typedef tsl::robin_map UnpairedNameMap; // read name,pair read +//typedef map UnpairedNameMap; // read name,pair read +typedef tsl::robin_map UnpairedPositionMap; // ,readread +// typedef map> CkeyReadEndsMap; // calckey,readEnds typedef unordered_map, CalcKeyHash, CalcKeyEqual> - CkeyReadEndsMap; // 以calckey为关键字,保存在相邻数据块之前找到的匹配readEnds + CkeyReadEndsMap; // calckey,readEnds // typedef tsl::robin_map, CalcKeyHash, CalcKeyEqual> CkeyReadEndsMap; // -// 以calckey为关键字,保存在相邻数据块之前找到的匹配readEnds +// calckey,readEnds -/* 单线程处理冗余参数结构体 */ +/* */ struct MarkDupDataArg { - int64_t taskSeq; // 任务序号 - int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 - vector bams; // 存放待处理的bam read - vector pairs; // 成对的reads - vector frags; // 暂未找到配对的reads - DPSet pairDupIdx; // pair的冗余read的索引 - MDSet pairOpticalDupIdx; // optical冗余read的索引 - DPSet fragDupIdx; // frag的冗余read的索引 - DPSet pairRepIdx; // pair的dupset代表read的索引 - MDSet pairSingletonIdx; // 某位置只有一对read的所有read pair个数 - UnpairedNameMap unpairedDic; // 用来寻找pair end - UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 + int64_t taskSeq; // + int64_t bamStartIdx; // vBambambam + vector bams; // bam read + vector pairs; // reads + vector frags; // reads + DPSet pairDupIdx; // pairread + MDSet pairOpticalDupIdx; // opticalread + DPSet fragDupIdx; // fragread + DPSet pairRepIdx; // pairdupsetread + MDSet pairSingletonIdx; // readread pair + UnpairedNameMap unpairedDic; // pair end + UnpairedPositionMap unpairedPosArr; // ReadEndReadEnd, }; /* - * 优先队列,用最小堆来实现对所有冗余索引的排序 + * , */ template struct PairArrIdIdx { @@ -226,9 +226,9 @@ struct IdxGreaterThan { template struct DupIdxQueue { - // 将冗余索引和他对应的task vector对应起来 + // task vector - // 由于是多个task来查找冗余,所以每次找到的冗余index都放在一个独立的vector中,vector之间可能有重叠,所以需要用一个最小堆来维护 + // task,indexvector,vector, vector> *dupIdx2DArr; priority_queue, vector>, IdxGreaterThan> minHeap; uint64_t popNum = 0; diff --git a/src/markdup/read_ends.h b/src/markdup/read_ends.h index 866f165..fa5257a 100644 --- a/src/markdup/read_ends.h +++ b/src/markdup/read_ends.h @@ -1,6 +1,6 @@ /* Description: read -ends结构体主要用来标记冗余,包含一些序列的测序过程中的物理信息等 +ends, Copyright : All right reserved by ICT @@ -39,15 +39,15 @@ struct PhysicalLocation { int16_t y = -1; }; -/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/ +/* read ends,picard ReadEndsForMarkDuplicates*/ struct ReadEnds : PhysicalLocation { static const int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5; - /* 保留一些bam记录中的数据 */ + /* bam */ bool read1FirstOfPair = true; - /* ReadEnds中的成员变量 */ + /* ReadEnds */ /** Little struct-like class to hold read pair (and fragment) end data for * duplicate marking. */ - // int16_t libraryId; // 没用,不考虑多样本 + // int16_t libraryId; // , int8_t orientation = -1; int32_t read1ReferenceIndex = -1; int32_t read1Coordinate = -1; @@ -55,8 +55,8 @@ struct ReadEnds : PhysicalLocation { // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported) int32_t read2Coordinate = -1; /* Additional information used to detect optical dupes */ - // int16_t readGroup = -1; 一般经过比对后的bam文件只有一个read - // group,normal或者tumor + // int16_t readGroup = -1; bamread + // group,normaltumor /** For optical duplicate detection the orientation matters regard to 1st or * 2nd end of a mate */ int8_t orientationForOpticalDuplicates = -1; @@ -64,7 +64,7 @@ struct ReadEnds : PhysicalLocation { */ bool isOpticalDuplicate = false; - /* ReadEndsForMarkDuplicates中的成员变量 */ + /* ReadEndsForMarkDuplicates */ /** Little struct-like class to hold read pair (and fragment) end data for * MarkDuplicatesWithMateCigar **/ int16_t score = 0; @@ -72,20 +72,20 @@ struct ReadEnds : PhysicalLocation { int64_t read2IndexInFile = -1; int64_t duplicateSetSize = -1; - /* ReadEndsForMarkDuplicatesWithBarcodes中的成员变量 (好像用不到) */ + /* ReadEndsForMarkDuplicatesWithBarcodes () */ // int32_t barcode = 0; // primary barcode for this read (and pair) // int32_t readOneBarcode = 0; // read one barcode, 0 if not present // int32_t readTwoBarcode = 0; // read two barcode, 0 if not present or not // paired - /* zzh增加的成员变量 */ - int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << - // MAX_CONTIG_LEN_SHIFT | (int64_t)pos; (包含clip的序列,也就是可能比map结果更偏左) + /* zzh */ + int64_t posKey = -1; // return (int64_t)tid << + // MAX_CONTIG_LEN_SHIFT | (int64_t)pos; (clip,map) - /* 用来做一些判断,因为一些readends会做多次操作,比如task之间有重叠等等 */ + /* ,readends,task */ int oprateTime = 0; - /* 根据pairend read的比对方向,来确定整体的比对方向 */ + /* pairend read, */ static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand) { if (read1NegativeStrand) { if (read2NegativeStrand) @@ -100,7 +100,7 @@ struct ReadEnds : PhysicalLocation { } } - /* 比较两个readends是否一样(有个冗余) */ + /* readends() */ static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2) { bool areComparable = true; areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex && @@ -112,15 +112,15 @@ struct ReadEnds : PhysicalLocation { return areComparable; } - /* 比对方向是否正向 */ + /* */ bool IsPositiveStrand() const { return orientation == F; } - /* pairend是否合适的比对上了 */ + /* pairend */ bool IsPaired() const { return read2ReferenceIndex != -1; } bool IsNegativeStrand() const { return orientation == R; } - // 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来 + // ,ab,AreComparableForDuplicates static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false) { int comp = a.read1ReferenceIndex - b.read1ReferenceIndex; if (comp == 0) @@ -141,7 +141,7 @@ struct ReadEnds : PhysicalLocation { int comp = a.read1ReferenceIndex - b.read1ReferenceIndex; if (comp == 0) comp = a.read1Coordinate - b.read1Coordinate; - if (comp == 0) // 这个放在坐标比较之前,因为在解析bam的时候,可能有的给read2ReferenceIndex赋值了,有的则没赋值 + if (comp == 0) // ,bam,read2ReferenceIndex, comp = a.orientation - b.orientation; if (comp == 0) comp = a.read2ReferenceIndex - b.read2ReferenceIndex; @@ -150,7 +150,7 @@ struct ReadEnds : PhysicalLocation { // if (comp == 0) // comp = a.tile - b.tile; // if (comp == 0) -// comp = a.x - b.x; // 由于picard的bug,用short类型来表示x,y,导致其可能为负数 +// comp = a.x - b.x; // picardbug,shortx,y, // if (comp == 0) // comp - a.y - b.y; if (comp == 0) @@ -168,12 +168,12 @@ struct ReadEnds : PhysicalLocation { comp = a.read2ReferenceIndex - b.read2ReferenceIndex; if (comp == 0) comp = a.read2Coordinate - b.read2Coordinate; - if (comp == 0) // 这个放在坐标比较了之后,把坐标范围的放在之前,这样对分段数据块比较好处理 + if (comp == 0) // ,, comp = a.orientation - b.orientation; // if (comp == 0) // comp = a.tile - b.tile; // if (comp == 0) -// comp = a.x - b.x; // 由于picard的bug,用short类型来表示x,y,导致其可能为负数 +// comp = a.x - b.x; // picardbug,shortx,y, // if (comp == 0) // comp - a.y - b.y; if (comp == 0) @@ -191,7 +191,7 @@ struct ReadEnds : PhysicalLocation { comp = a.read2ReferenceIndex - b.read2ReferenceIndex; if (comp == 0) comp = a.read2Coordinate - b.read2Coordinate; - if (comp == 0) // 这个放在坐标比较了之后,把坐标范围的放在之前,这样对分段数据块比较好处理 + if (comp == 0) // ,, comp = a.orientation - b.orientation; return comp < 0; } diff --git a/src/markdup/read_name_parser.h b/src/markdup/read_name_parser.h index 1746d4b..817bb31 100644 --- a/src/markdup/read_name_parser.h +++ b/src/markdup/read_name_parser.h @@ -1,5 +1,5 @@ /* -Description: 解析read的name中的信息,比如tile, x, y等 +Description: readname,tile, x, y Copyright : All right reserved by ICT @@ -28,7 +28,7 @@ using std::string; * Provides access to the physical location information about a cluster. * All values should be defaulted to -1 if unavailable. ReadGroup and Tile * should only allow non-zero positive integers, x and y coordinates may be - * negative. 非线程安全 + * negative. */ struct ReadNameParser { /** @@ -72,7 +72,7 @@ struct ReadNameParser { warnAboutRegexNotMatching = isWarn; } - /* 重新设置readNameRegex */ + /* readNameRegex */ void SetReadNameRegex(const string &strReadNameRegex) { readNameRegex = strReadNameRegex; if (strReadNameRegex == DEFAULT_READ_NAME_REGEX) @@ -83,7 +83,7 @@ struct ReadNameParser { // readNamePattern = strReadNameRegex; } - /* 添加测序时候的tile x y 信息 */ + /* tile x y */ bool AddLocationInformation(const string &readName, PhysicalLocation *loc) { if (!(readName == readNameStored)) { if (ReadLocationInformation(readName, loc)) { diff --git a/src/util/bam_buf.cpp b/src/util/bam_buf.cpp index 0473772..8f88976 100644 --- a/src/util/bam_buf.cpp +++ b/src/util/bam_buf.cpp @@ -1,5 +1,5 @@ /* - Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 + Description: sam/bam,buf, Copyright : All right reserved by ICT @@ -10,25 +10,25 @@ #include "bam_buf.h" /* - * BamBuf类 + * BamBuf */ -// 读取数据直到读完,或者缓冲区满 +// , int BamBuf::ReadBam() { int read_num = 0; - if (handle_last) { // 处理上次读入的最后一个bam - if (has_enough_space()) { // 必须调用,在边界处调整memffset + if (handle_last) { // bam + if (has_enough_space()) { // ,memffset ++read_num; append_one_bam(); } else { - return read_num; // 还是没空间 + return read_num; // } } while (read_stat_ >= 0 && (read_stat_ = sam_read1(fp, hdr, bw->b)) >= 0) { bw->end_pos_ = BamWrap::BamEndPos(bw->b); - if (has_enough_space()) { // 还有空间 - // if (true) { // 还有空间 + if (has_enough_space()) { // + // if (true) { // append_one_bam(); - ++read_num; // 放进缓存才算读取到 + ++read_num; // } else { break; } @@ -41,7 +41,7 @@ int BamBuf::ReadBam() { return read_num; } -// 初始化缓存 +// void BamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { this->fp = fp; this->hdr = hdr; @@ -71,9 +71,9 @@ void BamBuf::ClearAll() { prepare_read(); } -// 为下一次读取做准备, 计算一些边界条件 +// , inline void BamBuf::prepare_read() { - // 计算余留的下次计算可能用到的bam所占的位置 + // bam if (bv.size() > 0) { BamWrap *bw = bv[0]; legacy_start = (int64_t)bw - (int64_t)mem; @@ -81,11 +81,11 @@ inline void BamBuf::prepare_read() { legacy_end = (int64_t)bw + bw->length() - (int64_t)mem; } else { legacy_start = legacy_end = 0; - mem_offset = 0; // 上次没剩下,那就从头存储 + mem_offset = 0; // , } } -// 检查缓存是否还有空间 +// inline bool BamBuf::has_enough_space() { const uint32_t bam_len = bw->length(); int64_t potential_end = mem_offset + bam_len; @@ -104,7 +104,7 @@ inline bool BamBuf::has_enough_space() { return potential_end < legacy_start; } -// 处理一个读取后的bam +// bam inline void BamBuf::append_one_bam() { BamWrap *bwp = (BamWrap *)(mem + mem_offset); *bwp = *bw; @@ -113,15 +113,15 @@ inline void BamBuf::append_one_bam() { *bp = *bw->b; bp->data = (uint8_t *)((char *)bwp->b + sizeof(bam1_t)); memcpy(bp->data, bw->b->data, bw->b->l_data); - // 更新下次存储的位置 + // mem_offset = (mem_offset + bw->length() + 8 - 1) & ~((size_t)(8 - 1)); bv.push_back(bwp); } -// 处理上次读入的最后一个read +// read inline bool BamBuf::handle_last_read() { - if (handle_last) { // 处理上次读入的最后一个bam - if (has_enough_space()) { // 必须调用,在边界处调整memffset + if (handle_last) { // bam + if (has_enough_space()) { // ,memffset append_one_bam(); handle_last = false; return true; @@ -131,9 +131,9 @@ inline bool BamBuf::handle_last_read() { } /* - * AsyncIoBamBuf 类 + * AsyncIoBamBuf */ -// 初始化缓存 +// void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { if (use_async_io_) { buf1_.Init(fp, hdr, mem_size >> 1); @@ -147,7 +147,7 @@ void AsyncIoBamBuf::Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size) { } } -// 读取数据 +// int AsyncIoBamBuf::ReadBam() { if (use_async_io_) { hasThread = true; @@ -178,11 +178,11 @@ int AsyncIoBamBuf::async_read_bam() { first_read_ = false; refresh_bam_arr(); } else { - // join, 交换缓冲区指针 + // join, pthread_join(*tid_, 0); resize_buf(); - if (need_read_) { // 需要交换指针 + if (need_read_) { // BamBuf *tmp = pi_; pi_ = po_; po_ = tmp; @@ -190,14 +190,14 @@ int AsyncIoBamBuf::async_read_bam() { read_num = last_read_num_; refresh_bam_arr(); } - // 异步读 + // pthread_create(tid_, 0, async_read, this); return read_num; } void *AsyncIoBamBuf::async_read(void *data) { AsyncIoBamBuf *ab = (AsyncIoBamBuf *)data; - if (ab->need_read_ && ab->ReadStat() >= 0) { // 需要读取 + if (ab->need_read_ && ab->ReadStat() >= 0) { // ab->last_read_num_ = ab->po_->ReadBam(); } else { ab->last_read_num_ = 0; @@ -205,23 +205,23 @@ void *AsyncIoBamBuf::async_read(void *data) { pthread_exit(0); } -// 为下一次读取做准备, -// 计算一些边界条件,延迟操作,因为此时可能po_对应的buf正在读取 +// , +// ,,po_buf void AsyncIoBamBuf::ClearBeforeIdx(size_t idxInBv) { clear_before_idx_ = idxInBv; } -// 清空上一次所有读入的数据,延迟操作,因为此时可能po_对应的buf正在读取 +// ,,po_buf void AsyncIoBamBuf::ClearAll() { clear_all_ = true; } inline void AsyncIoBamBuf::resize_buf() { - if (clear_all_) { // 清理上一轮的数据 + if (clear_all_) { // clear_all_ = false; po_->ClearBeforeIdx(legacy_size_); pi_->ClearAll(); - if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 + if (pi_->handle_last_read()) { // read last_read_num_ += 1; - legacy_size_ = pi_->Size(); // 应该只有一个read + legacy_size_ = pi_->Size(); // read need_read_ = true; - } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 + } else { // ,, legacy_size_ = 0; need_read_ = false; } @@ -229,16 +229,16 @@ inline void AsyncIoBamBuf::resize_buf() { if (clear_before_idx_ < legacy_size_) { po_->ClearBeforeIdx(clear_before_idx_); legacy_size_ -= clear_before_idx_; - // 不需要交换指针,不需要读取 + // , need_read_ = false; } else { po_->ClearBeforeIdx(legacy_size_); pi_->ClearBeforeIdx(clear_before_idx_ - legacy_size_); - if (pi_->handle_last_read()) { // 上次读取有一个read没放入缓存 + if (pi_->handle_last_read()) { // read last_read_num_ += 1; - legacy_size_ = pi_->Size(); // 应该只有一个read + legacy_size_ = pi_->Size(); // read need_read_ = true; - } else { // 没空间存放,则不交换指针,或者文件已经读取完毕 + } else { // ,, legacy_size_ = 0; need_read_ = false; } diff --git a/src/util/bam_buf.h b/src/util/bam_buf.h index ee6a3c8..d0bb7e7 100644 --- a/src/util/bam_buf.h +++ b/src/util/bam_buf.h @@ -1,5 +1,5 @@ /* - Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 + Description: sam/bam,buf, Copyright : All right reserved by ICT @@ -27,32 +27,32 @@ using std::vector; using namespace std; /* - * 存放读入的bam数据 + * bam */ struct BamBuf { - sam_hdr_t *hdr; // sam文件的header信息 - samFile *fp; // sam文件指针 - BamWrap *bw = nullptr; // 用来循环读入bam - uint8_t *mem = nullptr; // 用来存放bam的数据, - // 程序结束后自动释放,所以没在析构函数里释放 - int64_t mem_offset = 0; // 下一次要存放的位置 - int64_t mem_size; // 缓存大小 - int read_stat_ = 0; // 读取状态,是否读完 - vector bv; // 方便对bam数据的访问 - int64_t legacy_start = 0; // 没处理完的bam在mem中的起始位置, 闭区间 - int64_t legacy_end = 0; // 没处理完的bam在mem中的结束位置, 开区间 - bool handle_last = false; // 上次最后读入的bam是否需要处理 + sam_hdr_t *hdr; // samheader + samFile *fp; // sam + BamWrap *bw = nullptr; // bam + uint8_t *mem = nullptr; // bam, + // , + int64_t mem_offset = 0; // + int64_t mem_size; // + int read_stat_ = 0; // , + vector bv; // bam + int64_t legacy_start = 0; // bammem, + int64_t legacy_end = 0; // bammem, + bool handle_last = false; // bam - // 初始化缓存 + // void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); - // 读取数据直到读完,或者缓冲区满 + // , int ReadBam(); - // 为下一次读取做准备, 计算一些边界条件 + // , void ClearBeforeIdx(size_t idxInBv); - // 清空上一次所有读入的数据 + // void ClearAll(); - inline int64_t Size() { return bv.size(); } // 包含多少个read - inline int ReadStat() { return read_stat_; } // 文件的读取状态,是否可读(读取完全) + inline int64_t Size() { return bv.size(); } // read + inline int ReadStat() { return read_stat_; } // ,() ~BamBuf() { if (this->mem != nullptr) { free(this->mem); @@ -62,7 +62,7 @@ struct BamBuf { free(bw); } } - void FreeMemory() // 释放开辟的内存 + void FreeMemory() // { if (this->mem != nullptr) { free(this->mem); @@ -75,14 +75,14 @@ struct BamBuf { this->bw = nullptr; } void prepare_read(); - // 检查缓存是否还有空间 + // bool has_enough_space(); - // 处理一个读取后的bam + // bam void append_one_bam(); - // 处理上次读入的最后一个read + // read bool handle_last_read(); - // 针对bv的操作 + // bv inline BamWrap *operator[](int64_t pos) { return bv[pos]; } inline void push_back(BamWrap *val) { bv.push_back(val); } inline void clear() { bv.clear(); } @@ -90,53 +90,53 @@ struct BamBuf { }; /* - * io异步缓冲区 + * io */ struct AsyncIoBamBuf { BamBuf buf1_; BamBuf buf2_; - BamBuf *pi_; // 当前用的buf - BamBuf *po_; // 后台在读取的buf + BamBuf *pi_; // buf + BamBuf *po_; // buf pthread_t *tid_ = NULL; bool hasThread = false; - int64_t legacy_size_ = 0; // 上一轮运算之后,缓存中还剩余的上次读取的read数量 + int64_t legacy_size_ = 0; // ,read bool first_read_ = true; - int last_read_num_ = 0; // 上一次读取了多少reads + int last_read_num_ = 0; // reads bool need_read_ = true; bool use_async_io_ = true; - int64_t clear_before_idx_ = 0; // 用户异步读取,下一轮读取之前清理掉clear_before_idx_之前的所有reads - bool clear_all_ = false; // 用于异步读取,下一轮读取之前清理掉之前的所有reads + int64_t clear_before_idx_ = 0; // ,clear_before_idx_reads + bool clear_all_ = false; // ,reads - vector bam_arr_; // 用来访问buf中的bam + vector bam_arr_; // bufbam AsyncIoBamBuf() {} AsyncIoBamBuf(bool use_async) : use_async_io_(use_async) {} - // 析构 + // ~AsyncIoBamBuf() { if (tid_ != NULL) { if (hasThread) pthread_join(*tid_, 0); free(tid_); } - // 其他的内存就等程序结束自动释放 - // buf的析构函数会自动调用 + // + // buf } - // 初始化缓存 + // void Init(samFile *fp, sam_hdr_t *hdr, int64_t mem_size); - // 读取数据 + // int ReadBam(); - // 为下一次读取做准备, 计算一些边界条件 + // , void ClearBeforeIdx(size_t idxInBv); - vector &GetBamArr() { return bam_arr_; } // 获取bam array - // 清空上一次所有读入的数据 + vector &GetBamArr() { return bam_arr_; } // bam array + // void ClearAll(); - // 包含的read数量 + // read inline int64_t Size() { return legacy_size_ + pi_->Size(); } inline int ReadStat() { return pi_->read_stat_; } inline BamWrap *operator[](int64_t pos) { return bam_arr_[pos]; } - // 获取某一段reads + // reads inline vector Slice(size_t startIdx, size_t endIdx) { if (endIdx > startIdx) { auto begItr = bam_arr_.begin(); @@ -149,11 +149,11 @@ struct AsyncIoBamBuf { buf2_.FreeMemory(); } - // 同步读取 + // int sync_read_bam(); - // 异步读取 + // int async_read_bam(); - // 异步读取线程函数 + // static void *async_read(void *data); void resize_buf(); inline void refresh_bam_arr() { diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index 3e5a20f..adeed8d 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -1,5 +1,5 @@ /* - Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 + Description: sam/bam,buf, Copyright : All right reserved by ICT @@ -20,37 +20,37 @@ using namespace std; /* - 这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明 - 这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。 + ,,,,, + bam,,。 */ /* - * sam read的封装 + * sam read */ struct BamWrap { - // 将contig左移后加上pos作为全局位置 - const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起 + // contigpos + const static int MAX_CONTIG_LEN_SHIFT = 40; // id, const static int READ_MAX_LENGTH = 200; - const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系 + const static int READ_MAX_DEPTH = 1000; // , - // 成员变量尽量少,减少占用内存空间 + // , bam1_t *b; - int64_t end_pos_; // bam的全局结束位置, 闭区间 + int64_t end_pos_; // bam, - // 全局开始位置 + // inline int64_t start_pos() { return bam_global_pos(b); } - // 全局结束位置 + // inline int64_t end_pos() { return end_pos_; } - // 和reference对应的序列长度 + // reference inline int16_t read_len() { return (end_pos_ - start_pos() + 1); } - // 在contig内的开始位置 + // contig inline int32_t contig_pos() { return b->core.pos; } - // 在contig内部的结束位置 + // contig inline int32_t contig_end_pos() { return bam_pos(end_pos_); } - // 序列的长度(AGTC字母个数) + // (AGTC) inline int16_t seq_len() { return b->core.l_qseq; } - // 算上开头的softclip + // softclip inline int32_t softclip_start() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -61,7 +61,7 @@ struct BamWrap { return bc.pos; } - // 算上结尾的softclip + // softclip inline int32_t softclip_end() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -72,7 +72,7 @@ struct BamWrap { return bam_pos(end_pos_); } - // 算上结尾的softclip + // softclip inline int32_t right_softclip_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -83,7 +83,7 @@ struct BamWrap { return 0; } - // 获取序列 + // inline std::string sequence() { ostringstream oss; char *seq = (char *)bam_get_seq(b); @@ -96,9 +96,9 @@ struct BamWrap { return std::move(oss.str()); } - // 获取名字 + // inline const char *query_name() { return bam_get_qname(b); } - // 获取cigar 字符串 + // cigar inline string cigar_str() { ostringstream oss; const uint32_t *cigar = bam_get_cigar(b); @@ -111,10 +111,10 @@ struct BamWrap { return std::move(oss.str()); } - // 占用的内存大小 + // inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; } - // 获取cigar中insert的总长度 + // cigarinsert inline int32_t insert_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -128,7 +128,7 @@ struct BamWrap { return ret; } - // 获取cigar中delete的总长度 + // cigardelete inline int32_t del_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -142,7 +142,7 @@ struct BamWrap { return ret; } - // 计算sam read的终点位置 + // sam read static inline int64_t BamEndPos(const bam1_t *b) { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; @@ -170,7 +170,7 @@ struct BamWrap { return hasWellDefinedFragmentSize; } - // 计算bam的adapterBoundary + // bamadapterBoundary int GetAdapterBoundary() { const bam1_core_t &bc = b->core; int adapterBoundary; @@ -179,11 +179,11 @@ struct BamWrap { else if (bc.flag & BAM_FREVERSE) adapterBoundary = bc.mpos - 1; else - adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1 + adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 GATK3.5,3.5+1 return adapterBoundary; } - // 获取开头的I的长度 + // I inline int GetHeadInsertLen() { int insLen = 0; const uint32_t *cigar = bam_get_cigar(b); @@ -200,8 +200,8 @@ struct BamWrap { return insLen; } - // 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, - // 注意开头的I要当做S?) + // soft clip(HS,?, + // IS?) inline int64_t GetSoftStart() { int64_t softStart = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); @@ -217,7 +217,7 @@ struct BamWrap { return softStart; } - // 获取unclipped开始位置(包括hardclip) + // unclipped(hardclip) inline int64_t GetUnclippedStart() { int64_t start = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); @@ -233,7 +233,7 @@ struct BamWrap { return start; } - // 获取unclipped结束位置(包括hardclip) + // unclipped(hardclip) inline int64_t GetUnclippedEnd() { int64_t end_pos = bam_endpos(b); const uint32_t *cigar = bam_get_cigar(b); @@ -249,7 +249,7 @@ struct BamWrap { return end_pos - 1; } - /* 获取碱基质量分数的加和 */ + /* */ /** Calculates a score for the read which is the sum of scores over Q15. */ inline int GetSumOfBaseQualities() { int score = 0; @@ -262,9 +262,9 @@ struct BamWrap { return score; } - /* 与flag相关的检测 */ + /* flag */ - /* 没有比对上 unmapped */ + /* unmapped */ inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; } /* Template having multiple segments in sequencing */ @@ -313,7 +313,7 @@ struct BamWrap { */ bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; } - /* 其他的一些信息 */ + /* */ inline int GetReferenceLength() { int length = 0; const uint32_t *cigar = bam_get_cigar(b); @@ -336,26 +336,26 @@ struct BamWrap { return length; } - // 计算bam的全局位置,算上染色体序号和比对位置 + // bam, static inline int64_t bam_global_pos(bam1_t *b) { return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos); } static inline int64_t bam_global_pos(int tid, int pos) { return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos); } - // 根据全局位置获取bam的染色体序号 + // bam static inline int32_t bam_tid(int64_t global_pos) { const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1); const int64_t high_tid = global_pos & mask; return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT); } - // 根据全局位置获取bam的比对位置(染色体内) + // bam() static inline int32_t bam_pos(int64_t global_pos) { const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; return (int32_t)(global_pos & mask); } - // 设置是否冗余的标记 + // void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); } void setFlag(bool flag, int bit) { diff --git a/src/util/murmur3.h b/src/util/murmur3.h index d404ecc..48c304a 100644 --- a/src/util/murmur3.h +++ b/src/util/murmur3.h @@ -1,5 +1,5 @@ /* -Description: Murmur哈希 +Description: Murmur Copyright : All right reserved by ICT