From 4fd267c4dbb364163a57afe54ae51f6c2a7f6f0e Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 11 Nov 2024 18:10:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=9C=A8=E7=BB=93=E6=9E=9C=E4=B8=AD=E5=8A=A0?= =?UTF-8?q?=E5=85=A5=E4=BA=86TAG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .vscode/launch.json | 4 +- .vscode/settings.json | 3 +- run.sh | 10 +- src/common/hts/bam_wrap.h | 8 + src/common/utils/global_arg.cpp | 21 +++ src/common/utils/global_arg.h | 2 + src/common/utils/util.h | 18 +++ src/sam/markdups/dup_metrics.h | 21 ++- src/sam/markdups/markdups.cpp | 259 +++++++++++++++++++++++++++--- src/sam/markdups/markdups_arg.cpp | 22 ++- src/sam/markdups/markdups_arg.h | 5 + src/sam/markdups/md_funcs.cpp | 29 +--- src/sam/markdups/md_types.h | 148 +++++++++++++++++ src/sam/markdups/serial_md.cpp | 69 ++++---- src/sam/markdups/serial_md.h | 139 +--------------- src/sam/markdups/shared_args.h | 15 +- src/sam/utils/read_ends.h | 8 + 18 files changed, 539 insertions(+), 243 deletions(-) create mode 100644 src/sam/markdups/md_types.h diff --git a/.gitignore b/.gitignore index dd1fa75..b04fd59 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # for fast-markdup +core.* *.sam *.bam *.log diff --git a/.vscode/launch.json b/.vscode/launch.json index a229eb9..fa039c7 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,8 +13,8 @@ "program": "${workspaceRoot}/build/bin/picard_cpp", "args": [ "MarkDuplicates", - "--INPUT", "~/data/bam/zy_tumor.bam", - "--OUTPUT", "/dev/null", + "--INPUT", "~/data/bam/1k.sam", + "--OUTPUT", "./out.sam", "--METRICS_FILE", "metrics.txt", "--num_threads", "1", "--max_mem", "2G", diff --git a/.vscode/settings.json b/.vscode/settings.json index 9b14081..cc7054c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -101,6 +101,7 @@ "queue": "cpp", "stack": "cpp", "__bit_reference": "cpp", - "__string": "cpp" + "__string": "cpp", + "filesystem": "cpp" } } \ No newline at end of file diff --git a/run.sh b/run.sh index a83e99e..f5b9eaa 100755 --- a/run.sh +++ b/run.sh @@ -1,18 +1,22 @@ #input=~/data/bam/zy_normal.bam #input=~/data/bam/zy_tumor.bam #input=~/data/bam/100w.bam -input=~/data/bam/1kw.sam +input=~/data/bam/1k.sam +#input=~/data/bam/1kw.sam +#input=~/data/bam/1kw.bam #input=~/data/bam/n1kw.sam +cd ./build/ && make -j 8 && cd .. + #./out.sam \ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ MarkDuplicates \ --INPUT $input \ - --OUTPUT /dev/null \ + --OUTPUT ./out.sam \ --INDEX_FORMAT BAI \ --num_threads 1 \ --max_mem 2G \ --verbosity DEBUG \ - --asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" + --asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null #--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX "" #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" #--READ_NAME_REGEX "" diff --git a/src/common/hts/bam_wrap.h b/src/common/hts/bam_wrap.h index 1562514..3f42abe 100644 --- a/src/common/hts/bam_wrap.h +++ b/src/common/hts/bam_wrap.h @@ -355,6 +355,14 @@ struct BamWrap { const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; return (int32_t)(global_pos & mask); } + + // 设置是否冗余的标记 + void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); } + + void setFlag(bool flag, int bit) { + if (flag) this->b->core.flag |= bit; + else this->b->core.flag &= ~bit; + } }; typedef std::map> SampleBamMap; diff --git a/src/common/utils/global_arg.cpp b/src/common/utils/global_arg.cpp index f147eaa..e14d9e3 100644 --- a/src/common/utils/global_arg.cpp +++ b/src/common/utils/global_arg.cpp @@ -8,6 +8,7 @@ */ #include "global_arg.h" +#include "util.h" #include #include @@ -100,4 +101,24 @@ void GlobalArg::parseArgument(int argNum) { default: break; } +} + +string GlobalArg::GetArgValueString() { + printf("--INPUT = %s\n", in_fn.c_str()); + printf("--OUTPUT = %s\n", out_fn.c_str()); + printf("--num_threads = %d\n", num_threads); + printf("--max_mem = %ld\n", max_mem); + printf("--verbosity = %d\n", verbosity); + printf("--asyncio = %d\n", use_asyncio); + + vector argVals; + argVals.push_back("--INPUT"); argVals.push_back(in_fn); + argVals.push_back("--OUTPUT"); argVals.push_back(out_fn); + argVals.push_back("--num_threads"); argVals.push_back(StringUtil::Format(num_threads)); + argVals.push_back("--asyncio"); argVals.push_back(use_asyncio ? "true" : "false"); + + string argValStr; + StringUtil::Join(argVals, argValStr, ' '); + cout << argValStr << endl; + return argValStr; } \ No newline at end of file diff --git a/src/common/utils/global_arg.h b/src/common/utils/global_arg.h index 903eb14..98bec6c 100644 --- a/src/common/utils/global_arg.h +++ b/src/common/utils/global_arg.h @@ -89,6 +89,8 @@ struct GlobalArg { printf("--asyncio = %d\n", use_asyncio); } + string GetArgValueString(); + private: GlobalArg() { initGlobalOptions(); }; }; diff --git a/src/common/utils/util.h b/src/common/utils/util.h index 34c7682..3e566cc 100755 --- a/src/common/utils/util.h +++ b/src/common/utils/util.h @@ -34,6 +34,13 @@ void Warn(const char *format, ...); void Info(const char *format, ...); void Debug(const char *format, ...); +// 字节缓冲区 +struct ByteBuf { + uint8_t *buf = nullptr; + int size = 0; // 当前使用量 + int capacity = 0; // 最大容量 +}; + /* * StringUtil 用来合并字符串等操作 */ @@ -74,6 +81,17 @@ struct StringUtil { } return true; } + + //static void StringToBytes(const string &str, ByteBuf &buf) { + // if (str.size() > buf.capacity) { + // buf.capacity = str.size() * 2; + // buf.buf = (uint8_t*)realloc(buf.buf, buf.capacity); + // } + // buf.size = str.size(); + // for (int i = 0; i < str.size(); ++i) { + // buf.buf[i] = (uint8_t)str[i] + 64; + // } + //} }; // 二进制读写相关 diff --git a/src/sam/markdups/dup_metrics.h b/src/sam/markdups/dup_metrics.h index 508c4e8..28def4e 100644 --- a/src/sam/markdups/dup_metrics.h +++ b/src/sam/markdups/dup_metrics.h @@ -2,8 +2,11 @@ #include #include +#include +#include "serial_md.h" using std::string; +using std::vector; /* * 标记冗余过程中的一些数据统计 @@ -54,17 +57,11 @@ struct DuplicationMetrics { // 其他的统计数据 - // addSingletonToCount需要记录的数据 - uint64_t DuplicateCountHist = 0; - uint64_t NonOpticalDuplicateCountHist = 0; + // 比如在该位置,有4个冗余,那么所有4个冗余的位置数量 + MDMap DuplicateCountHist; + MDMap NonOpticalDuplicateCountHist; + MDMap OpticalDuplicatesCountHist; - // track optical duplicates 需要记录的数据 - uint64_t OpticalDuplicatesCountHist = 0; - uint64_t OpticalDuplicatesByLibraryId = 0; - - // 统计相关的函数 - void AddSingletonToCount() { - ++this->DuplicateCountHist; - ++this->NonOpticalDuplicateCountHist; - } + // 没有冗余的位置总数量 + MDSet singletonReads; }; \ No newline at end of file diff --git a/src/sam/markdups/markdups.cpp b/src/sam/markdups/markdups.cpp index 086c36e..0ee60ef 100644 --- a/src/sam/markdups/markdups.cpp +++ b/src/sam/markdups/markdups.cpp @@ -25,6 +25,7 @@ Date : 2023/10/23 #include #include #include +#include #include "dup_metrics.h" #include "markdups_arg.h" @@ -35,6 +36,7 @@ Date : 2023/10/23 using namespace std; using std::cout; +using std::string; #define SMA_TAG_PG "PG" @@ -56,11 +58,67 @@ MarkDupsArg &g_mdArg = g_mdArg_; static GlobalDataArg gData_; GlobalDataArg &gData = gData_; DuplicationMetrics gMetrics_; -DuplicationMetrics &gMetrics = gMetrics_; +DuplicationMetrics &gMetrics = gMetrics_; // 记录统计信息 +/* 程序版本等信息 */ +const char *PROGRAM_GROUP_VERSION = "3.2.0"; + +// 调试信息 int zzhtestnum = 0; -set zzhopticalSet; -vector zzhopticalArr; +int64_t bam_num1=0, bam_num2=0; +/** + * Estimates the size of a library based on the number of paired end molecules observed + * and the number of unique pairs observed. + *

+ * Based on the Lander-Waterman equation that states: + * C/X = 1 - exp( -N/X ) + * where + * X = number of distinct molecules in library + * N = number of read pairs + * C = number of distinct fragments observed in read pairs + */ +static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) { + int64_t librarySize = 0; + int64_t readPairDuplicates = readPairs - uniqueReadPairs; + auto f = [](double x, double c, double n) { return c / x - 1 + exp(-n / x); }; + if (readPairs > 0 && readPairDuplicates > 0) { + double m = 1.0; + double M = 100.0; + if (uniqueReadPairs >= readPairs || f(m * uniqueReadPairs, uniqueReadPairs, readPairs) < 0) { + cerr << "Invalid values for pairs and unique pairs: " << readPairs << ", " << uniqueReadPairs << endl; + return 0; + } + // find value of M, large enough to act as other side for bisection method + while (f(M * uniqueReadPairs, uniqueReadPairs, readPairs) > 0) { + M *= 10.0; + } + // use bisection method (no more than 40 times) to find solution + for (int i = 0; i < 40; ++i) { + double r = (m + M) / 2.0; + double u = f(r * uniqueReadPairs, uniqueReadPairs, readPairs); + if (u == 0) + break; + else if (u > 0) + m = r; + else if (u < 0) + M = r; + } + return uniqueReadPairs * (m + M) / 2.0; + } + return librarySize; +} + +/* + * 获取文件名后缀 + */ +string getFileExtension(const string &filename) { + auto last_dot = filename.find_last_of('.'); + if (last_dot == string::npos) { + return ""; + } + return filename.substr(last_dot + 1); +} + /* * mark duplicate * 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题 @@ -109,6 +167,10 @@ int MarkDuplicates(int argc, char *argv[]) { sam_open_mode(modeout + 1, g_gArg.out_fn.c_str(), NULL); g_outBamFp = sam_open(g_gArg.out_fn.c_str(), modeout); g_outBamHeader = sam_hdr_dup(g_inBamHeader); + // 修改输出文件的header + sam_hdr_add_line(g_outBamHeader, "PG", "ID", g_mdArg.PROGRAM_RECORD_ID.c_str(), "VN", PROGRAM_GROUP_VERSION, "CL", + (g_mdArg.PROGRAM_RECORD_ID + " " + g_gArg.GetArgValueString() + " " + g_mdArg.GetArgValueString()).c_str(), NULL); + // 用同样的线程池处理输出文件 hts_set_opt(g_outBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite); @@ -139,44 +201,138 @@ int MarkDuplicates(int argc, char *argv[]) { // 输出index文件 // string indexFn = g_gArg.out_fn + ".csi"; // 现在索引都是csi格式的 string indexFn = g_gArg.out_fn + ".bai"; // min_shift = 0 是bai格式 - int index_min_shift = 0; - if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) { - indexFn = g_gArg.out_fn + ".csi"; - index_min_shift = 14; + if ("sam" == getFileExtension(g_gArg.out_fn)) { + indexFn = ""; } - if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) { - Error("failed to open index \"%s\" for writing", indexFn.c_str()); - sam_close(g_outBamFp); - sam_close(g_inBamFp); - return -1; + + if (!indexFn.empty()) { + int index_min_shift = 0; + if (g_mdArg.INDEX_FORMAT == ns_md::IndexFormat::CSI) { + indexFn = g_gArg.out_fn + ".csi"; + index_min_shift = 14; + } + if (sam_idx_init(g_outBamFp, g_outBamHeader, 0 /*csi 14*/, indexFn.c_str()) < 0) { + Error("failed to open index \"%s\" for writing", indexFn.c_str()); + sam_close(g_outBamFp); + sam_close(g_inBamFp); + return -1; + } } // 读取输入文件 // BamBufType inBuf(false); BamBufType inBuf(g_gArg.use_asyncio); inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem); DupIdxQueue dupIdxQue, repIdxQue; + DupIdxQueue opticalIdxQue; dupIdxQue.Init(&gData.dupIdxArr); repIdxQue.Init(&gData.repIdxArr); + opticalIdxQue.Init(&gData.opticalDupIdxArr); Timer tw; cout << "dupsize: " << dupIdxQue.Size() << endl; uint64_t bamIdx = 0; DupInfo dupIdx = dupIdxQue.Pop(); DupInfo repIdx = repIdxQue.Pop(); - exit(0); + uint64_t opticalIdx = opticalIdxQue.Pop(); + ByteBuf bytes; + bam1_t *bp = bam_init1(); + bool isDup = false; + bool isOpticalDup = false; + bool isInDuplicateSet = false; + uint32_t representativeReadIndexInFile = 0; + uint32_t duplicateSetSize = 0; + // exit(0); while (inBuf.ReadStat() >= 0) { Timer tw1; size_t readNum = inBuf.ReadBam(); // cout << "read: " << readNum << endl; for (size_t i = 0; i < inBuf.Size(); ++i) { + BamWrap *bw = inBuf[i]; + if (bam_copy1(bp, bw->b) == nullptr) { + Error("Can not copy sam record!"); + return -1; + } + bw->b = bp; + isDup = false; + isOpticalDup = false; + isInDuplicateSet = false; + // 删除原来的duplicate tag + if (g_mdArg.CLEAR_DT) { + uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str()); + if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); + } + ++bam_num2; + // 统计信息 + if (bw->GetReadUnmappedFlag()) { + ++gMetrics.UNMAPPED_READS; + } else if (bw->IsSecondaryOrSupplementary()) { + ++gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS; + } else if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) { + ++gMetrics.UNPAIRED_READS_EXAMINED; + } else { + ++gMetrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end + } + /* 判断是否冗余 */ if (bamIdx == dupIdx) { - // if (dupIdx.dupSet != 0) - // cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl; + isDup = true; + + if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) { + // cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl; + isInDuplicateSet = true; + representativeReadIndexInFile = dupIdx.repIdx; + duplicateSetSize = dupIdx.dupSet; + } // 为了防止小内存运行的时候,有重复的dupidx,这时候dup的repIdx和dupSetSize可能会有不同 while ((dupIdx = dupIdxQue.Pop()) == bamIdx); + if (opticalIdx == bamIdx) + isOpticalDup = true; + else if (opticalIdx < bamIdx) { + while ((opticalIdx = opticalIdxQue.Pop()) < bamIdx); + if (opticalIdx == bamIdx) + isOpticalDup = true; + } + // 添加冗余标识 + bw->SetDuplicateReadFlag(true); + // if (isOpticalDup) + // cout << bamIdx << " optical" << endl; + // else + // cout << bamIdx << " not opt" << endl; + + uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str()); + if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); + if (isOpticalDup) + bam_aux_append(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str(), 'Z', + g_mdArg.DUPLICATE_TYPE_SEQUENCING.size() + 1, + (const uint8_t *)g_mdArg.DUPLICATE_TYPE_SEQUENCING.c_str()); + else + bam_aux_append(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str(), 'Z', + g_mdArg.DUPLICATE_TYPE_LIBRARY.size() + 1, + (const uint8_t *)g_mdArg.DUPLICATE_TYPE_LIBRARY.c_str()); + + // cout << bamIdx << endl; + // int ival = 2; + // cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl; + // if (bam_aux_append(bw->b, "DT", 'Z', 3, (const uint8_t*)"LB") < 0) Error("add tag error!"); + // // if (bam_aux_append(bw->b, "DT", 'i', sizeof(ival), (uint8_t *)&ival) < 0) Error("add tag error!"); + // cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl; + + // 计算统计信息 + if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) { + // Update the duplication metrics + if (!bw->GetReadPairedFlag() || bw->GetMateUnmappedFlag()) { + ++gMetrics.UNPAIRED_READ_DUPLICATES; + } else { + ++gMetrics.READ_PAIR_DUPLICATES; // will need to be divided by 2 at the end + } + } + } else { + bw->SetDuplicateReadFlag(false); } if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent // cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl; + isInDuplicateSet = true; + representativeReadIndexInFile = repIdx.repIdx; + duplicateSetSize = repIdx.dupSet; while (repIdx == bamIdx || repIdx.dupSet == 0) { if (repIdxQue.Size() > 0) repIdx = repIdxQue.Pop(); @@ -186,19 +342,82 @@ int MarkDuplicates(int argc, char *argv[]) { } } } - if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) { - Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str()); + + if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) { + // cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl; + uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str()); + if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); + bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str(), 'i', + sizeof(representativeReadIndexInFile), (uint8_t *)&representativeReadIndexInFile); + oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str()); + if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); + bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize), + (const uint8_t *)&duplicateSetSize); + } + // 每个read都要写到output,只是添加标识 + // cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl; + // if (bamIdx == 922) break; + ++bamIdx; + if (isDup && g_mdArg.REMOVE_DUPLICATES) { + continue; + } + if (isOpticalDup && g_mdArg.REMOVE_SEQUENCING_DUPLICATES) { + continue; + } + if (!g_mdArg.PROGRAM_RECORD_ID.empty() && g_mdArg.ADD_PG_TAG_TO_READS) { + uint8_t *oldTagVal = bam_aux_get(bw->b, "PG"); + if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); + bam_aux_append(bw->b, "PG", 'Z', g_mdArg.PROGRAM_RECORD_ID.size() + 1, + (const uint8_t *)g_mdArg.PROGRAM_RECORD_ID.c_str()); + } + if (sam_write1(g_outBamFp, g_outBamHeader, bw->b) < 0) { + Error("failed writing sam record to \"%s\"", g_gArg.out_fn.c_str()); sam_close(g_outBamFp); sam_close(g_inBamFp); return -1; } - ++bamIdx; } inBuf.ClearAll(); cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl; } - cout << "dupsize: " << dupIdxQue.Size() << endl; - if (sam_idx_save(g_outBamFp) < 0) { + bam_destroy1(bp); + cout << "bam_Num: " << bam_num1 << " : " << bam_num2 << endl; + + // 计算统计信息 + gMetrics.READ_PAIRS_EXAMINED /= 2; + gMetrics.READ_PAIR_DUPLICATES /= 2; + for (auto &arr : gData.opticalDupIdxArr) gMetrics.READ_PAIR_OPTICAL_DUPLICATES += arr.size(); + gMetrics.READ_PAIR_OPTICAL_DUPLICATES = gMetrics.READ_PAIR_OPTICAL_DUPLICATES / 2; + gMetrics.ESTIMATED_LIBRARY_SIZE = + estimateLibrarySize(gMetrics.READ_PAIRS_EXAMINED - gMetrics.READ_PAIR_OPTICAL_DUPLICATES, + gMetrics.READ_PAIRS_EXAMINED - gMetrics.READ_PAIR_DUPLICATES); + if (gMetrics.UNPAIRED_READS_EXAMINED + gMetrics.READ_PAIRS_EXAMINED != 0) { + gMetrics.PERCENT_DUPLICATION = (gMetrics.UNPAIRED_READ_DUPLICATES + gMetrics.READ_PAIR_DUPLICATES * 2) / + (double)(gMetrics.UNPAIRED_READS_EXAMINED + gMetrics.READ_PAIRS_EXAMINED * 2); + } else { + gMetrics.PERCENT_DUPLICATION = 0; + } + + cout << "metrics: \n" + << "LIBRARY: " << gMetrics.LIBRARY << "\n" + << "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n" + << "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n" + << "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n" + << "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n" + << "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n" + << "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n" + << "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n" + << "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n" + << "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl; + + cout << "singleton pos: " << gMetrics.singletonReads.size() << endl; + for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl; + for (auto &itr : gMetrics.NonOpticalDuplicateCountHist) + cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl; + for (auto &itr : gMetrics.OpticalDuplicatesCountHist) + cout << "optical dup counts: " << itr.first << ": " << itr.second << endl; + cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl; + if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) { Error("writing index failed"); sam_close(g_outBamFp); sam_close(g_inBamFp); diff --git a/src/sam/markdups/markdups_arg.cpp b/src/sam/markdups/markdups_arg.cpp index f00eb51..e6d4652 100644 --- a/src/sam/markdups/markdups_arg.cpp +++ b/src/sam/markdups/markdups_arg.cpp @@ -9,6 +9,7 @@ Date : 2023/10/27 #include "markdups_arg.h" #include "common/utils/global_arg.h" +#include "common/utils/util.h" #include #include @@ -65,7 +66,8 @@ const static struct option kMdOpts[] = { {"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM}, {"CREATE_INDEX", required_argument, NULL, CREATE_INDEX}, {"INDEX_FORMAT", required_argument, NULL, INDEX_FORMAT}, - {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}}; + {"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}, + {"ADD_PG_TAG_TO_READS", required_argument, NULL, ADD_PG_TAG_TO_READS}}; // 判断bool类型的参数 void setBoolArg(bool *arg) { @@ -193,7 +195,8 @@ void MarkDupsArg::parseArgument(int argc, COMMENT.push_back(optarg); break; case ns_md::READ_NAME_REGEX: - READ_NAME_REGEX = optarg; + if (strcmp("null", optarg) == 0) READ_NAME_REGEX=""; + else READ_NAME_REGEX = optarg; break; case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE: OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg); @@ -229,6 +232,9 @@ void MarkDupsArg::parseArgument(int argc, case ns_md::CREATE_MD5_FILE: setBoolArg(&CREATE_MD5_FILE); break; + case ns_md::ADD_PG_TAG_TO_READS: + setBoolArg(&ADD_PG_TAG_TO_READS); + break; default: break; } @@ -245,6 +251,18 @@ void MarkDupsArg::printArgValue() printf("--INDEX_FORMAT = %s\n", this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi"); } +string MarkDupsArg::GetArgValueString() { + vector argVals; + argVals.push_back("--READ_NAME_REGEX"); + if (this->READ_NAME_REGEX.empty()) argVals.push_back("null"); + else argVals.push_back(this->READ_NAME_REGEX.c_str()); + argVals.push_back("--INDEX_FORMAT"); argVals.push_back(this->INDEX_FORMAT == ns_md::IndexFormat::BAI ? "bai" : "csi"); + string argValStr; + StringUtil::Join(argVals, argValStr, ' '); + cout << argValStr << endl; + return argValStr; +} + // 打印版本信息 void MarkDupsArg::PrintVersion() { diff --git a/src/sam/markdups/markdups_arg.h b/src/sam/markdups/markdups_arg.h index 1d167ad..d72a98c 100644 --- a/src/sam/markdups/markdups_arg.h +++ b/src/sam/markdups/markdups_arg.h @@ -56,6 +56,7 @@ enum MarkDupsArgEnum { CREATE_INDEX, INDEX_FORMAT, CREATE_MD5_FILE, + ADD_PG_TAG_TO_READS, _END_NUM }; @@ -294,6 +295,9 @@ struct MarkDupsArg /* "Whether to create an MD5 digest for any BAM or FASTQ files created. ", common = true */ bool CREATE_MD5_FILE = false; + /* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/ + bool ADD_PG_TAG_TO_READS = true; + // mark duplicate参数个数 const static int ARG_COUNT = ns_md::MarkDupsArgEnum::_END_NUM - ns_md::MarkDupsArgEnum::_START_NUM - 1; // 解析参数 @@ -302,6 +306,7 @@ struct MarkDupsArg GlobalArg *pGArg); void printArgValue(); + string GetArgValueString(); static void PrintHelp(); diff --git a/src/sam/markdups/md_funcs.cpp b/src/sam/markdups/md_funcs.cpp index 4fed041..12b5075 100644 --- a/src/sam/markdups/md_funcs.cpp +++ b/src/sam/markdups/md_funcs.cpp @@ -461,18 +461,9 @@ static int checkOpticalDuplicates(vector &readEndsArr, const R ReadEnds *pRe = const_cast(readEndsArr[i]); if (opticalDuplicateFlags[i]) { ++opticalDuplicates; - // if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) { - // cout << "val: " << pRe->isOpticalDuplicate << endl; - // } pRe->isOpticalDuplicate = true; - zzhopticalSet.insert(pRe->read1IndexInFile); - zzhopticalSet.insert(pRe->read2IndexInFile); - zzhopticalArr.push_back(pRe->read1IndexInFile); - zzhopticalArr.push_back(pRe->read2IndexInFile); } else { pRe->isOpticalDuplicate = false; - zzhopticalSet.erase(pRe->read1IndexInFile); - zzhopticalSet.erase(pRe->read2IndexInFile); } } return opticalDuplicates; @@ -483,11 +474,8 @@ static int checkOpticalDuplicates(vector &readEndsArr, const R */ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { bool hasFR = false, hasRF = false; - int prevOpticalDupNum = 0; // Check to see if we have a mixture of FR/RF for (auto pRe : readEndsArr) { - if (pRe->isOpticalDuplicate) - ++prevOpticalDupNum; if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) hasFR = true; else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) @@ -517,17 +505,8 @@ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnd nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe); } - // trackDuplicateCounts - gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup; - if (readEndsArr.size() > nOpticalDup) - gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup; - if (nOpticalDup) - gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1; - - - gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum; - //gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup; - - // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl; - // cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl; + // 统计信息,trackDuplicateCounts + ++gMetrics.DuplicateCountHist[readEndsArr.size()]; + if (readEndsArr.size() > nOpticalDup) ++gMetrics.NonOpticalDuplicateCountHist[readEndsArr.size() - nOpticalDup]; + if (nOpticalDup) ++gMetrics.OpticalDuplicatesCountHist[nOpticalDup + 1]; } \ No newline at end of file diff --git a/src/sam/markdups/md_types.h b/src/sam/markdups/md_types.h new file mode 100644 index 0000000..befc8d9 --- /dev/null +++ b/src/sam/markdups/md_types.h @@ -0,0 +1,148 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +using std::set; +using std::string; +using std::unordered_set; +using std::vector; + +/* 存放未匹配readend相同位点的所有readend */ +struct UnpairedREInfo { + int64_t taskSeq; + ReadEnds unpairedRE; +}; + +/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */ +struct CalcKey { + int64_t read1Pos; + int64_t read2Pos; + bool operator<(const CalcKey &o) const { + int comp = (int)(read1Pos - o.read1Pos); + if (comp == 0) + comp = (int)(read2Pos - o.read2Pos); + return comp < 0; + } + bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; } + std::size_t operator()(const CalcKey &o) const { + return std::hash()(read1Pos) ^ std::hash()(read2Pos); + } +}; + +struct CalcKeyHash { + std::size_t operator()(const CalcKey &o) const { + return std::hash()(o.read1Pos) ^ std::hash()(o.read2Pos); + } +}; + +/* 用来记录冗余索引相关的信息 */ +struct DupInfo { + int64_t idx; + int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引 + int16_t dupSet = 0; // dup set size + + DupInfo() : DupInfo(-1, 0, 0) {} + DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {} + DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {} + bool operator<(const DupInfo &o) const { return idx < o.idx; } + bool operator>(const DupInfo &o) const { return idx > o.idx; } + operator int64_t() const { return idx; } +}; + +struct DupInfoHash { + std::size_t operator()(const DupInfo &o) const { return std::hash()(o.idx); } +}; + +struct DupInfoEqual { + bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; } + bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; } + bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; } +}; + +using MDMap = tsl::robin_map; + +template +// using MDSet = set; +// using MDSet = unordered_set; +using MDSet = tsl::robin_set; + +template +// using DPSet = set; +// using DPSet = unordered_set; +using DPSet = tsl::robin_set; + +template +// using CalcSet = set; +using CalcSet = tsl::robin_set; + +using ReadEndsSet = tsl::robin_set; + +/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */ +struct TaskSeqDupInfo { + DPSet dupIdx; + MDSet opticalDupIdx; + DPSet repIdx; + MDSet notDupIdx; + MDSet notOpticalDupIdx; + MDSet notRepIdx; +}; + +/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */ +struct UnpairedPosInfo { + int unpairedNum = 0; + int64_t taskSeq; + vector pairArr; + MDSet readNameSet; +}; +// typedef unordered_map UnpairedNameMap; +// typedef unordered_map UnpairedPositionMap; + +typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read +typedef tsl::robin_map + UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 + +/* 单线程处理冗余参数结构体 */ +struct MarkDupDataArg { + int64_t taskSeq; // 任务序号 + int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 + vector bams; // 存放待处理的bam read + vector pairs; // 成对的reads + vector frags; // 暂未找到配对的reads + DPSet pairDupIdx; // pair的冗余read的索引 + MDSet pairOpticalDupIdx; // optical冗余read的索引 + DPSet fragDupIdx; // frag的冗余read的索引 + DPSet pairRepIdx; // pair的dupset代表read的索引 + MDSet pairSingletonIdxArr; // 某位置只有一对read的所有read pair个数 + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 +}; + +/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */ +struct GlobalDataArg { + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; + + // 每个task对应一个vector + vector> dupIdxArr; + vector> opticalDupIdxArr; + vector> repIdxArr; + vector> singletonIdxArr; + + // 用来存放后续计算的数据 + vector> latterDupIdxArr; + vector> latterOpticalDupIdxArr; + vector> latterRepIdxArr; + vector> latterSingletonIdxArr; + vector> latterNotDupIdxArr; + vector> latterNotOpticalDupIdxArr; + vector> latterNotRepIdxArr; + vector> latterNotSingletonIdxArr; +}; \ No newline at end of file diff --git a/src/sam/markdups/serial_md.cpp b/src/sam/markdups/serial_md.cpp index a82b50d..c7ee1e8 100644 --- a/src/sam/markdups/serial_md.cpp +++ b/src/sam/markdups/serial_md.cpp @@ -68,13 +68,13 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup MDSet *notOpticalDupIdx = nullptr, MDSet *notRepIdx = nullptr) { if (vpRe.size() < 2) { if (vpRe.size() == 1) { - // addSingletonToCount(libraryIdGenerator); // 这个统计可能会有误差,因为当前位点可能还有没匹配上的read,导致当前位点的read(paired)数量为1 - // 可以通过后续的补充计算来解决这个问题,有必要么? - gMetrics.AddSingletonToCount(); + // 可以通过后续的补充计算来解决这个问题,有必要么?好像没必要 + gMetrics.singletonReads.insert(vpRe[0]->read1IndexInFile); } return; } + int maxScore = 0; const ReadEnds *pBest = nullptr; // bool print = false; @@ -109,6 +109,7 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup notDupIdx->insert(pBest->read2IndexInFile); } // if (false) { + // if (true) { if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 // trackOpticalDuplicates vector prevOpticalRe; @@ -196,7 +197,7 @@ static void getEqualRE(const ReadEnds &re, vector &src, vector reSet; // ReadEnds lastRe; - for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read + for (size_t i = 0; i < p.bams.size(); ++i) { // 循环处理每个read BamWrap *bw = p.bams[i]; + // ++bam_num1; const int64_t bamIdx = p.bamStartIdx + i; if (bw->GetReadUnmappedFlag()) { if (bw->b->core.tid == -1) @@ -308,7 +311,7 @@ static void processFrags(vector &readEnds, DPSet *dupIdx, MDS } /* 单线程markdup (第二步)*/ -static void markdups(SerailMarkDupArg *arg) { +static void markdups(MarkDupDataArg *arg) { auto &p = *arg; p.pairDupIdx.clear(); p.pairOpticalDupIdx.clear(); @@ -354,8 +357,8 @@ static inline void getIntersectData(vector &leftArr, vector } /* 将frags重叠部分的dup idx放进数据中 */ -static inline void refreshFragDupIdx(DPSet &dupIdx, MDSet ¬DupIdx, SerailMarkDupArg *lastArg, - SerailMarkDupArg *curArg) { +static inline void refreshFragDupIdx(DPSet &dupIdx, MDSet ¬DupIdx, MarkDupDataArg *lastArg, + MarkDupDataArg *curArg) { auto &lp = *lastArg; auto &p = *curArg; for (auto idx : dupIdx) { @@ -371,7 +374,7 @@ static inline void refreshFragDupIdx(DPSet &dupIdx, MDSet ¬ /* 将pairs重叠部分的dup idx放进数据中 */ static inline void refreshPairDupIdx(DPSet &dupIdx, MDSet &opticalDupIdx, DPSet &repIdx, MDSet ¬DupIdx, MDSet ¬OpticalDupIdx, - MDSet ¬RepIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { + MDSet ¬RepIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) { auto &lp = *lastArg; auto &p = *curArg; for (auto idx : dupIdx) { @@ -466,7 +469,7 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet ¬DupI } /* 处理相邻的两个任务,有相交叉的数据 */ -static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) { +static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg, GlobalDataArg *gDataArg) { auto &lp = *lastArg; auto &p = *curArg; auto &g = *gDataArg; @@ -745,7 +748,7 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur } /* 当所有任务结束后,global data里还有未处理的数据 */ -static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { +static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) { // cout << "last task start" << endl; auto &lp = *task; auto &g = *gDataArg; @@ -768,6 +771,7 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { g.unpairedDic.insert(prevUnpair); // 用来记录没有匹配的read个数 } } + map taskChanged; for (auto &e : g.unpairedPosArr) { auto posKey = e.first; @@ -779,6 +783,8 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { std::sort(arr.begin(), arr.end()); // cout << "last task before mark pair" << endl; processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notRepIdx); + } else if (arr.size() == 1) { + gMetrics.singletonReads.insert(arr[0].read1IndexInFile); } } // 更新结果 @@ -796,7 +802,10 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { // cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; g.unpairedPosArr.clear(); - g.unpairedDic.clear(); + + // for (auto &rn : g.unpairedDic) { + // cout << rn.first << endl; + // } /* int taskSeq = 0; @@ -888,12 +897,15 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { std::sort(vRepIdx.begin(), vRepIdx.end()); } -void calculateMetrics(SerailMarkDupArg &lp, SerailMarkDupArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) { +void calculateMetrics(MarkDupDataArg &lp, MarkDupDataArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) { DuplicationMetrics &gMetrics = *pgMetrics; + + // gMetrics.READ_PAIRS_EXAMINED /= 2; + cout << "calculateMetrics start: " << endl; cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl; - + cout << "all: " << (lp.unpairedDic.size() + p.unpairedDic.size() + g.unpairedDic.size()) << endl; cout << "calculateMetrics end" << endl; } @@ -908,9 +920,9 @@ void serialMarkDups() { // inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024); int64_t processedBamNum = 0; - SerailMarkDupArg smdArg1, smdArg2; - SerailMarkDupArg *lastArgP = &smdArg1; - SerailMarkDupArg *curArgP = &smdArg2; + MarkDupDataArg smdArg1, smdArg2; + MarkDupDataArg *lastArgP = &smdArg1; + MarkDupDataArg *curArgP = &smdArg2; bool isFirstRound = true; int roundNum = 0; @@ -921,6 +933,7 @@ void serialMarkDups() { tm_arr[4].acc_start(); size_t readNum = inBamBuf.ReadBam(); readNumSum += readNum; + // readNumSum += inBamBuf.GetBamArr().size(); tm_arr[4].acc_end(); cout << "read num: " << readNum << '\t' << roundNum << endl; // lastArgP = curArgP; @@ -974,7 +987,7 @@ void serialMarkDups() { handleLastTask(lastArgP, &gData); // 计算各种统计指标 - calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics); + // calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics); // cout << "here 2" << endl; tm_arr[3].acc_end(); @@ -987,7 +1000,7 @@ void serialMarkDups() { map dup; int taskSeq = 0; - for (auto &arr : gData.dupIdxArr) { + /* for (auto &arr : gData.dupIdxArr) { for (auto idx : arr) { if (dup.find(idx.idx) != dup.end()) { // cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t' @@ -998,7 +1011,7 @@ void serialMarkDups() { // cout << taskSeq << "\t" << arr.size() << endl; taskSeq++; } - +*/ // #include // ofstream out("tumor_dup.txt"); // for (auto idx : dup) @@ -1023,23 +1036,11 @@ void serialMarkDups() { cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; - cout << "optical dup: " << zzhopticalSet.size() << endl; - cout << "optical arr dup: " << zzhopticalArr.size() << endl; cout << "optical size: " << opticalDupNum << endl; - - cout << "metrics: \n" - << "LIBRARY: " << gMetrics.LIBRARY << "\n" - << "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n" - << "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n" - << "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n" - << "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n" - << "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n" - << "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n" - << "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n" - << "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n" - << "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl; + cout << "singleton size: " << gMetrics.singletonReads.size() << endl; Timer::log_time("serial end "); + cout << "read num sum: " << readNumSum << endl; // for (auto i : gData.dupArr) // cout << i << endl; diff --git a/src/sam/markdups/serial_md.h b/src/sam/markdups/serial_md.h index de2a04e..d0f5bde 100644 --- a/src/sam/markdups/serial_md.h +++ b/src/sam/markdups/serial_md.h @@ -1,143 +1,6 @@ #pragma once -#include -#include -#include -#include - -#include -#include -#include -#include - -using std::set; -using std::string; -using std::unordered_set; -using std::vector; - -/* 存放未匹配readend相同位点的所有readend */ -struct UnpairedREInfo { - int64_t taskSeq; - ReadEnds unpairedRE; -}; - -/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */ -struct CalcKey { - int64_t read1Pos; - int64_t read2Pos; - bool operator<(const CalcKey &o) const { - int comp = (int)(read1Pos - o.read1Pos); - if (comp == 0) - comp = (int)(read2Pos - o.read2Pos); - return comp < 0; - } - bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; } - std::size_t operator()(const CalcKey &o) const { - return std::hash()(read1Pos) ^ std::hash()(read2Pos); - } -}; - -struct CalcKeyHash { - std::size_t operator()(const CalcKey &o) const { - return std::hash()(o.read1Pos) ^ std::hash()(o.read2Pos); - } -}; - -/* 用来记录冗余索引相关的信息 */ -struct DupInfo { - int64_t idx; - int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引 - int16_t dupSet = 0; // dup set size - - DupInfo() : DupInfo(-1, 0, 0) {} - DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {} - DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {} - bool operator<(const DupInfo &o) const { return idx < o.idx; } - bool operator>(const DupInfo &o) const { return idx > o.idx; } - operator int64_t() const { return idx; } -}; - -struct DupInfoHash { - std::size_t operator()(const DupInfo &o) const { return std::hash()(o.idx); } -}; - -struct DupInfoEqual { - bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; } - bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; } - bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; } -}; - -template -// using MDSet = set; -// using MDSet = unordered_set; -using MDSet = tsl::robin_set; - -template -// using DPSet = set; -// using DPSet = unordered_set; -using DPSet = tsl::robin_set; - -template -// using CalcSet = set; -using CalcSet = tsl::robin_set; - -/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */ -struct TaskSeqDupInfo { - DPSet dupIdx; - MDSet opticalDupIdx; - DPSet repIdx; - MDSet notDupIdx; - MDSet notOpticalDupIdx; - MDSet notRepIdx; -}; - -/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */ -struct UnpairedPosInfo { - int unpairedNum = 0; - int64_t taskSeq; - vector pairArr; - MDSet readNameSet; -}; -// typedef unordered_map UnpairedNameMap; -// typedef unordered_map UnpairedPositionMap; - -typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read -typedef tsl::robin_map - UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 - -/* 单线程处理冗余参数结构体 */ -struct SerailMarkDupArg { - int64_t taskSeq; // 任务序号 - int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 - vector bams; // 存放待处理的bam read - vector pairs; // 成对的reads - vector frags; // 暂未找到配对的reads - DPSet pairDupIdx; // pair的冗余read的索引 - MDSet pairOpticalDupIdx; // optical冗余read的索引 - DPSet fragDupIdx; // frag的冗余read的索引 - DPSet pairRepIdx; // pair的dupset代表read的索引 - UnpairedNameMap unpairedDic; // 用来寻找pair end - UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 -}; - -/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */ -struct GlobalDataArg { - UnpairedNameMap unpairedDic; // 用来寻找pair end - UnpairedPositionMap unpairedPosArr; - - // 每个task对应一个vector - vector> dupIdxArr; - vector> opticalDupIdxArr; - vector> repIdxArr; - - // 用来存放后续计算的数据 - vector> latterDupIdxArr; - vector> latterOpticalDupIdxArr; - vector> latterRepIdxArr; - vector> latterNotDupIdxArr; - vector> latterNotOpticalDupIdxArr; - vector> latterNotRepIdxArr; -}; +#include "md_types.h" // 串行运行mark duplicate void serialMarkDups(); \ No newline at end of file diff --git a/src/sam/markdups/shared_args.h b/src/sam/markdups/shared_args.h index d239711..d0c51cc 100644 --- a/src/sam/markdups/shared_args.h +++ b/src/sam/markdups/shared_args.h @@ -6,6 +6,8 @@ #include #include #include +//#include "md_types.h" + #include using std::set; @@ -19,14 +21,15 @@ extern sam_hdr_t *g_outBamHeader; // 输出文件的header /* 参数对象作为全局对象,免得多次作为参数传入函数中 */ class GlobalArg; -extern GlobalArg &g_gArg; +extern GlobalArg &g_gArg; // 全局参数 class MarkDupsArg; -extern MarkDupsArg &g_mdArg; +extern MarkDupsArg &g_mdArg; // markduplicate程序相关的参数 +// 上边加下划线表示所有模块共享,不加下划线表示markduplicate计算时候内部使用 + class GlobalDataArg; -extern GlobalDataArg &gData; +extern GlobalDataArg &gData; // 计算冗余过程中保存在全局的数据 class DuplicationMetrics; -extern DuplicationMetrics &gMetrics; +extern DuplicationMetrics &gMetrics; // 用来计算统计信息 extern int zzhtestnum; -extern set zzhopticalSet; -extern vector zzhopticalArr; \ No newline at end of file +extern int64_t bam_num1, bam_num2; \ No newline at end of file diff --git a/src/sam/utils/read_ends.h b/src/sam/utils/read_ends.h index 22dbd24..638a074 100644 --- a/src/sam/utils/read_ends.h +++ b/src/sam/utils/read_ends.h @@ -167,4 +167,12 @@ struct ReadEnds : PhysicalLocation { } }; +struct ReadEndsHash { + std::size_t operator()(const ReadEnds &o) const { return std::hash()(o.read1IndexInFile); } +}; + +struct ReadEndsEqual { + bool operator()(const ReadEnds &o1, const ReadEnds &o2) const { return o1.read1IndexInFile == o2.read1IndexInFile; } +}; + #endif \ No newline at end of file