diff --git a/run.sh b/run.sh index 14a8e05..a83e99e 100755 --- a/run.sh +++ b/run.sh @@ -1,7 +1,7 @@ #input=~/data/bam/zy_normal.bam -input=~/data/bam/zy_tumor.bam +#input=~/data/bam/zy_tumor.bam #input=~/data/bam/100w.bam -#input=~/data/bam/1kw.sam +input=~/data/bam/1kw.sam #input=~/data/bam/n1kw.sam time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ diff --git a/src/main.cpp b/src/main.cpp index 5b190e6..4165928 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,17 +1,15 @@ +#include + #include #include -#include + #include "module.h" /* 版本信息 */ -const char *version() -{ - return PICARD_CPP_VERSION; -} +const char *version() { return PICARD_CPP_VERSION; } /* 使用说明 */ -static void usage(FILE *fp) -{ +static void usage(FILE *fp) { fprintf(fp, "\n" "Program: picard_cpp (A cpp implementation for picard.)\n" @@ -25,19 +23,14 @@ static void usage(FILE *fp) "\n"); } -int main(int argc, char *argv[]) -{ - - if (argc < 2) - { +int main(int argc, char *argv[]) { + if (argc < 2) { usage(stderr); return 1; } - if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) - { - if (argc == 2) - { + if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) { + if (argc == 2) { usage(stdout); return 0; } @@ -49,8 +42,7 @@ int main(int argc, char *argv[]) if (strcmp(argv[1], "MarkDuplicates") == 0) ret = MarkDuplicates(argc - 1, argv + 1); - else - { + else { fprintf(stderr, "\n[Error]: unrecognized command '%s'\n\n", argv[1]); usage(stdout); return 1; diff --git a/src/sam/markdups/markdups.cpp b/src/sam/markdups/markdups.cpp index 2d0e2d6..086c36e 100644 --- a/src/sam/markdups/markdups.cpp +++ b/src/sam/markdups/markdups.cpp @@ -26,12 +26,12 @@ Date : 2023/10/23 #include #include +#include "dup_metrics.h" #include "markdups_arg.h" #include "md_funcs.h" #include "parallel_md.h" #include "serial_md.h" #include "shared_args.h" -#include "dup_metrics.h" using namespace std; using std::cout; @@ -93,8 +93,8 @@ int MarkDuplicates(int argc, char *argv[]) { /* 利用线程池对输入输出文件进行读写 */ htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 - //htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); - //htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); + // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); + // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); htsPoolRead.pool = hts_tpool_init(32); htsPoolWrite.pool = hts_tpool_init(32); if (!htsPoolRead.pool || !htsPoolWrite.pool) { @@ -166,14 +166,14 @@ int MarkDuplicates(int argc, char *argv[]) { while (inBuf.ReadStat() >= 0) { Timer tw1; size_t readNum = inBuf.ReadBam(); - // cout << "read: " << readNum << endl; + // cout << "read: " << readNum << endl; for (size_t i = 0; i < inBuf.Size(); ++i) { /* 判断是否冗余 */ if (bamIdx == dupIdx) { - //if (dupIdx.dupSet != 0) - // cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl; - // 为了防止小内存运行的时候,有重复的dupidx,这时候dup的repIdx和dupSetSize可能会有不同 - while((dupIdx = dupIdxQue.Pop()) == bamIdx); + // if (dupIdx.dupSet != 0) + // cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl; + // 为了防止小内存运行的时候,有重复的dupidx,这时候dup的repIdx和dupSetSize可能会有不同 + while ((dupIdx = dupIdxQue.Pop()) == bamIdx); } if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent // cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl; @@ -213,4 +213,4 @@ int MarkDuplicates(int argc, char *argv[]) { // cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl; Timer::log_time("程序结束"); return 0; - } \ No newline at end of file +} \ No newline at end of file diff --git a/src/sam/markdups/serial_md.cpp b/src/sam/markdups/serial_md.cpp index a9283bb..a82b50d 100644 --- a/src/sam/markdups/serial_md.cpp +++ b/src/sam/markdups/serial_md.cpp @@ -64,7 +64,7 @@ static inline void sortReadEndsArr(vector &arr) { /* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ static void markDupsForPairs(vector &vpRe, DPSet *dupIdx, MDSet *opticalDupIdx, - DPSet *repIdx, MDSet *notDupIdx = nullptr, + DPSet *repIdx, MDSet *notDupIdx = nullptr, MDSet *notOpticalDupIdx = nullptr, MDSet *notRepIdx = nullptr) { if (vpRe.size() < 2) { if (vpRe.size() == 1) { @@ -81,8 +81,8 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup // int maxOperateTime = 0; /** All read ends should have orientation FF, FR, RF, or RR **/ for (auto pe : vpRe) { // 找分数最高的readend -// maxOperateTime = max(maxOperateTime, pe->oprateTime); -// (const_cast(pe))->oprateTime ++; + // maxOperateTime = max(maxOperateTime, pe->oprateTime); + // (const_cast(pe))->oprateTime ++; if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; @@ -102,13 +102,13 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup cout << "mark pair end: " << endl; } */ - // cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl; - // if (maxOperateTime == 0) ++zzhtestnum; + // cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << + // pBest->read2IndexInFile << endl; if (maxOperateTime == 0) ++zzhtestnum; if (notDupIdx != nullptr) { notDupIdx->insert(pBest->read1IndexInFile); notDupIdx->insert(pBest->read2IndexInFile); } - //if (false) { + // if (false) { if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 // trackOpticalDuplicates vector prevOpticalRe; @@ -131,10 +131,10 @@ static void markDupsForPairs(vector &vpRe, DPSet *dup } } for (auto pe : vpRe) { // 对非best read标记冗余 - if (pe != pBest) { // 非best + if (pe != pBest) { // 非best dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1 if (pe->read2IndexInFile != pe->read1IndexInFile) - dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); //read2, + dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2, // 注意这里代表是read1的索引 // 检查是否optical dup if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) { @@ -253,7 +253,7 @@ static void generateReadEnds(SerailMarkDupArg *arg) { } if (p.taskSeq == 98) { for (auto &re : p.unpairedPosArr[14293757783047].pairArr) { - cout << "task 99 unpair reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl; + cout << "task 99 unpair reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl; } } // cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << @@ -370,13 +370,22 @@ static inline void refreshFragDupIdx(DPSet &dupIdx, MDSet ¬ /* 将pairs重叠部分的dup idx放进数据中 */ static inline void refreshPairDupIdx(DPSet &dupIdx, MDSet &opticalDupIdx, DPSet &repIdx, - MDSet ¬DupIdx, MDSet ¬OpticalDupIdx, MDSet ¬RepIdx, - SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { + MDSet ¬DupIdx, MDSet ¬OpticalDupIdx, + MDSet ¬RepIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { auto &lp = *lastArg; auto &p = *curArg; - for (auto idx : dupIdx) { lp.pairDupIdx.insert(idx); p.pairDupIdx.erase(idx); } - for (auto idx : opticalDupIdx) { lp.pairOpticalDupIdx.insert(idx); p.pairOpticalDupIdx.erase(idx); } - for (auto idx : repIdx) { lp.pairRepIdx.insert(idx); p.pairRepIdx.erase(idx); } + for (auto idx : dupIdx) { + lp.pairDupIdx.insert(idx); + p.pairDupIdx.erase(idx); + } + for (auto idx : opticalDupIdx) { + lp.pairOpticalDupIdx.insert(idx); + p.pairOpticalDupIdx.erase(idx); + } + for (auto idx : repIdx) { + lp.pairRepIdx.insert(idx); + p.pairRepIdx.erase(idx); + } // for (auto idx : notDupIdx) { // if (lp.pairDupIdx.find(idx) != lp.pairDupIdx.end()) cout << "find-1: " << idx << endl; // if (lp.pairDupIdx.find({idx}) != lp.pairDupIdx.end()) cout << "find-2: " << idx << endl; @@ -384,15 +393,22 @@ static inline void refreshPairDupIdx(DPSet &dupIdx, MDSet &opt // if (p.pairDupIdx.find({idx}) != p.pairDupIdx.end()) cout << "find-4: " << idx << endl; // lp.pairDupIdx.erase(idx); p.pairDupIdx.erase(idx); // } - for (auto idx : notOpticalDupIdx) { lp.pairOpticalDupIdx.erase(idx); p.pairOpticalDupIdx.erase(idx); } - for (auto idx : notRepIdx) { lp.pairRepIdx.erase(idx); p.pairRepIdx.erase(idx); } + for (auto idx : notOpticalDupIdx) { + lp.pairOpticalDupIdx.erase(idx); + p.pairOpticalDupIdx.erase(idx); + } + for (auto idx : notRepIdx) { + lp.pairRepIdx.erase(idx); + p.pairRepIdx.erase(idx); + } } // 用来分别处理dup和optical dup static void refeshTaskDupInfo(DPSet &dupIdx, MDSet &opticalDupIdx, DPSet &repIdx, MDSet ¬DupIdx, MDSet ¬OpticalDupIdx, MDSet ¬RepIdx, - DPSet &latterDupIdx, MDSet &latterOpticalDupIdx, DPSet &latterRepIdx, - MDSet &latterNotDupIdx, MDSet &latterNotOpticalDupIdx, MDSet &latterNotRepIdx) { + DPSet &latterDupIdx, MDSet &latterOpticalDupIdx, + DPSet &latterRepIdx, MDSet &latterNotDupIdx, + MDSet &latterNotOpticalDupIdx, MDSet &latterNotRepIdx) { for (auto idx : dupIdx) { latterDupIdx.insert(idx); // latterNotDupIdx.erase(idx); // 后来的更新为准 @@ -405,7 +421,7 @@ static void refeshTaskDupInfo(DPSet &dupIdx, MDSet &opticalDup } /* 最后合并数据并排序 */ -template +template static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet ¬DupIdx, vector &dupArr, vector &cacheDupIdx, vector &midArr) { midArr.resize(0); @@ -417,8 +433,8 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet ¬DupI auto ae = dupArr.end(); auto bi = cacheDupIdx.begin(); auto be = cacheDupIdx.end(); - //auto bi = dupIdx.begin(); - //auto be = dupIdx.end(); + // auto bi = dupIdx.begin(); + // auto be = dupIdx.end(); T val = 0; while (ai != ae && bi != be) { @@ -427,7 +443,7 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet ¬DupI } else if (*bi < *ai) { val = *bi++; } else { - val = *bi++; // 相等的时候取后面的作为结果 + val = *bi++; // 相等的时候取后面的作为结果 ai++; } if (notDupIdx.find(val) == notDupIdx.end()) { @@ -475,9 +491,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, ¬DupIdx, ¬OpticalDupIdx, ¬RepIdx); refreshPairDupIdx(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, &lp, &p); - //cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl; - //cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl; - // 处理之前未匹配的部分 + // cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl; + // cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl; + // 处理之前未匹配的部分 map recalcPos; CalcSet alreadyAdd; // 与该位点相同的pair都添加到数组里了 MDSet addToGlobal; @@ -516,8 +532,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur // if (prevFragEnd.read1IndexInFile == 255830545 || prevFragEnd.read1IndexInFile == 255830546 || // prevFragEnd.read1IndexInFile == 255832599 || prevFragEnd.read1IndexInFile == 255832601) { - // cout << "find in p: " << lp.taskSeq << "\t" << prevFragEnd.read1IndexInFile << "\t" << readName << endl; - // if (nextUnpairInfoP != nullptr) + // cout << "find in p: " << lp.taskSeq << "\t" << prevFragEnd.read1IndexInFile << "\t" << readName << + // endl; if (nextUnpairInfoP != nullptr) // cout << "next p: " << nextUnpairInfoP->unpairedNum << endl; // if (prevUnpairInfoP != nullptr) // cout << "prev p: " << prevUnpairInfoP->unpairedNum << endl; @@ -657,7 +673,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur // if (p.taskSeq == 163) { // cout << "final" << endl; // } - processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx); + processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, + &t.notRepIdx); if (taskSeq < lp.taskSeq) g.unpairedPosArr.erase(posKey); } @@ -673,7 +690,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur // for (auto &re: lp.unpairedPosArr[posKey].pairArr) { // cout << "lp reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl; // } - // cout << "found in g: " << lp.taskSeq << "\t" << lp.unpairedPosArr[posKey].unpairedNum << "\t" << lp.unpairedPosArr[posKey].pairArr.size() << endl; + // cout << "found in g: " << lp.taskSeq << "\t" << lp.unpairedPosArr[posKey].unpairedNum << "\t" << + // lp.unpairedPosArr[posKey].pairArr.size() << endl; // } g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; } @@ -686,11 +704,14 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur if (taskSeq < lp.taskSeq) { refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], - g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]); + g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], + g.latterNotRepIdxArr[taskSeq]); } else if (taskSeq == lp.taskSeq) { - refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp, &p); + refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp, + &p); } else { - refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p, &lp); // 把结果放到p中 + refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p, + &lp); // 把结果放到p中 } } @@ -769,35 +790,35 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { // cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl; refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], - g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]); + g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], + g.latterNotRepIdxArr[taskSeq]); } // cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; g.unpairedPosArr.clear(); g.unpairedDic.clear(); -/* - int taskSeq = 0; - for (auto &arr : g.dupIdxArr) { - cout << taskSeq << "\t" << arr.size(); - if (taskSeq < (int)g.dupIdxArr.size() - 1) - cout << "\t" << g.latterDupIdxArr[taskSeq].size() << "\t" << g.latterNotDupIdxArr[taskSeq].size() << endl; - else - cout << endl; - // if (taskSeq == 98) { - // vector v; - // v.insert(v.end(), g.latterDupIdxArr[taskSeq].begin(), g.latterDupIdxArr[taskSeq].end()); - // std::sort(v.begin(), v.end()); - // for (auto &idx : v) - // cout << idx.idx << " " << idx.repIdx << " " << idx.dupSet << endl; - // v.clear(); - // v.insert(v.end(), g.latterNotDupIdxArr[taskSeq].begin(), g.latterNotDupIdxArr[taskSeq].end()); - // std::sort(v.begin(), v.end()); - // for (auto &idx : v) cout << idx << endl; - // } - taskSeq++; - } -*/ + /* + int taskSeq = 0; + for (auto &arr : g.dupIdxArr) { + cout << taskSeq << "\t" << arr.size(); + if (taskSeq < (int)g.dupIdxArr.size() - 1) + cout << "\t" << g.latterDupIdxArr[taskSeq].size() << "\t" << g.latterNotDupIdxArr[taskSeq].size() << + endl; else cout << endl; + // if (taskSeq == 98) { + // vector v; + // v.insert(v.end(), g.latterDupIdxArr[taskSeq].begin(), g.latterDupIdxArr[taskSeq].end()); + // std::sort(v.begin(), v.end()); + // for (auto &idx : v) + // cout << idx.idx << " " << idx.repIdx << " " << idx.dupSet << endl; + // v.clear(); + // v.insert(v.end(), g.latterNotDupIdxArr[taskSeq].begin(), g.latterNotDupIdxArr[taskSeq].end()); + // std::sort(v.begin(), v.end()); + // for (auto &idx : v) cout << idx << endl; + // } + taskSeq++; + } + */ // 将dupidx放进全局数据 vector cacheDupIdx; vector midArr; @@ -837,7 +858,8 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { */ } for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i) - refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i], intCacheDupIdx, intMidArr); + refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i], + intCacheDupIdx, intMidArr); for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i) refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr); @@ -866,6 +888,15 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { std::sort(vRepIdx.begin(), vRepIdx.end()); } +void calculateMetrics(SerailMarkDupArg &lp, SerailMarkDupArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) { + DuplicationMetrics &gMetrics = *pgMetrics; + cout << "calculateMetrics start: " << endl; + + cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl; + + cout << "calculateMetrics end" << endl; +} + /* 串行处理数据,标记冗余 */ void serialMarkDups() { tm_arr[5].acc_start(); @@ -933,14 +964,18 @@ void serialMarkDups() { // cout << "round time: " << t_round.seconds_elapsed() << endl; roundNum++; if (roundNum % 100 == 0) { - //cout << "read sum: " << readNumSum << endl; - //cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; + // cout << "read sum: " << readNumSum << endl; + // cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; } } // cout << "here" << endl; tm_arr[3].acc_start(); // 处理剩下的全局数据 handleLastTask(lastArgP, &gData); + + // 计算各种统计指标 + calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics); + // cout << "here 2" << endl; tm_arr[3].acc_end(); @@ -950,7 +985,7 @@ void serialMarkDups() { int64_t opticalDupNum = 0; map dup; - + int taskSeq = 0; for (auto &arr : gData.dupIdxArr) { for (auto idx : arr) { @@ -963,7 +998,6 @@ void serialMarkDups() { // cout << taskSeq << "\t" << arr.size() << endl; taskSeq++; } - // #include // ofstream out("tumor_dup.txt"); @@ -989,12 +1023,22 @@ void serialMarkDups() { cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; - cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t" - << gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl; cout << "optical dup: " << zzhopticalSet.size() << endl; cout << "optical arr dup: " << zzhopticalArr.size() << endl; cout << "optical size: " << opticalDupNum << endl; + cout << "metrics: \n" + << "LIBRARY: " << gMetrics.LIBRARY << "\n" + << "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n" + << "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n" + << "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n" + << "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n" + << "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n" + << "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n" + << "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n" + << "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n" + << "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl; + Timer::log_time("serial end "); // for (auto i : gData.dupArr) diff --git a/src/sam/markdups/serial_md.h b/src/sam/markdups/serial_md.h index f4782bb..de2a04e 100644 --- a/src/sam/markdups/serial_md.h +++ b/src/sam/markdups/serial_md.h @@ -7,12 +7,12 @@ #include #include -#include #include +#include using std::set; -using std::unordered_set; using std::string; +using std::unordered_set; using std::vector; /* 存放未匹配readend相同位点的所有readend */ @@ -38,29 +38,23 @@ struct CalcKey { }; struct CalcKeyHash { - std::size_t operator()(const CalcKey &o) const { - return std::hash()(o.read1Pos) ^ std::hash()(o.read2Pos); + std::size_t operator()(const CalcKey &o) const { + return std::hash()(o.read1Pos) ^ std::hash()(o.read2Pos); } }; /* 用来记录冗余索引相关的信息 */ struct DupInfo { int64_t idx; - int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引 + int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引 int16_t dupSet = 0; // dup set size - DupInfo() : DupInfo(-1, 0, 0) { } - DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) { } + DupInfo() : DupInfo(-1, 0, 0) {} + DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {} DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {} - bool operator<(const DupInfo &o) const { - return idx < o.idx; - } - bool operator>(const DupInfo &o) const { - return idx > o.idx; - } - operator int64_t() const { - return idx; - } + bool operator<(const DupInfo &o) const { return idx < o.idx; } + bool operator>(const DupInfo &o) const { return idx > o.idx; } + operator int64_t() const { return idx; } }; struct DupInfoHash { @@ -70,10 +64,10 @@ struct DupInfoHash { struct DupInfoEqual { bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; } bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; } - bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; } + bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; } }; -template +template // using MDSet = set; // using MDSet = unordered_set; using MDSet = tsl::robin_set; @@ -84,7 +78,7 @@ template using DPSet = tsl::robin_set; template -//using CalcSet = set; +// using CalcSet = set; using CalcSet = tsl::robin_set; /* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */ @@ -108,21 +102,22 @@ struct UnpairedPosInfo { // typedef unordered_map UnpairedPositionMap; typedef tsl::robin_map UnpairedNameMap; // 以read name为索引,保存未匹配的pair read -typedef tsl::robin_map UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 +typedef tsl::robin_map + UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 /* 单线程处理冗余参数结构体 */ struct SerailMarkDupArg { - int64_t taskSeq; // 任务序号 - int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 - vector bams; // 存放待处理的bam read - vector pairs; // 成对的reads - vector frags; // 暂未找到配对的reads + int64_t taskSeq; // 任务序号 + int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 + vector bams; // 存放待处理的bam read + vector pairs; // 成对的reads + vector frags; // 暂未找到配对的reads DPSet pairDupIdx; // pair的冗余read的索引 MDSet pairOpticalDupIdx; // optical冗余read的索引 DPSet fragDupIdx; // frag的冗余read的索引 DPSet pairRepIdx; // pair的dupset代表read的索引 - UnpairedNameMap unpairedDic; // 用来寻找pair end - UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 + UnpairedNameMap unpairedDic; // 用来寻找pair end + UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 }; /* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */ diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..361cb65 --- /dev/null +++ b/todo.md @@ -0,0 +1,48 @@ +# 各种统计信息的计算,metrics文件中记录的信息 +- htsjdk.samtools.metrics.StringHeader + - 命令行参数 + - 执行命令的时间 +- METRICS CLASS picard.sam.DuplicationMetrics +``` +LIBRARY 样本ID(建库制备的样本id) +UNPAIRED_READS_EXAMINED 未匹配的reads数量 +READ_PAIRS_EXAMINED 匹配的reads数量 +SECONDARY_OR_SUPPLEMENTARY_RDS 非主要匹配的reads个数 +UNMAPPED_READS 未匹配的reads个数 +UNPAIRED_READ_DUPLICATES 未匹配的reads,冗余的个数 +READ_PAIR_DUPLICATES 匹配的reads,冗余个数 +READ_PAIR_OPTICAL_DUPLICATES 匹配的reads,光学原因造成的冗余个数 +PERCENT_DUPLICATION 冗余reads占比 +ESTIMATED_LIBRARY_SIZE 估计的样本reads总量 + +normal 1205 498430 729 1205 763 117212 6877 0.235643 924111 +``` +- HISTOGRAM java.lang.Double +``` +BIN 坐标值 +CoverageMult +all_sets +optical_sets +non_optical_sets + +1.0 1.010558 287416 0 291706 +2.0 1.599836 72561 6664 70093 +3.0 1.943455 15542 105 14299 +4.0 2.143827 3274 1 2831 +5.0 2.260667 708 0 607 +6.0 2.3288 136 0 114 +7.0 2.368529 28 0 17 +8.0 2.391696 7 0 5 +9.0 2.405205 2 0 2 +10.0 2.413082 0 0 0 +11.0 2.417676 0 0 0 +12.0 2.420354 0 0 0 +13.0 2.421916 0 0 0 +14.0 2.422827 0 0 0 +15.0 2.423358 0 0 0 +... +... +98.0 2.424101 0 0 0 +99.0 2.424101 0 0 0 +100.0 2.424101 0 0 0 +``` \ No newline at end of file