diff --git a/.gitignore b/.gitignore index e32c3fb..dd1fa75 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # for fast-markdup *.sam +*.bam *.log # ---> C++ # Prerequisites @@ -36,4 +37,4 @@ *.app lib/ -build/ \ No newline at end of file +build/ diff --git a/.vscode/launch.json b/.vscode/launch.json index 08f219a..fcd3c60 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,11 +13,11 @@ "program": "${workspaceRoot}/build/bin/picard_cpp", "args": [ "MarkDuplicates", - "--INPUT", "~/data/bam/100w.bam", + "--INPUT", "~/data/bam/zy_normal.bam", "--OUTPUT", "out.bam", "--METRICS_FILE", "metrics.txt", "--num_threads", "1", - "--max_mem", "4G", + "--max_mem", "100G", "--verbosity", "DEBUG", "--asyncio", "true", ], diff --git a/run.sh b/run.sh index 475ad64..b0f8145 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,8 @@ -#input=~/data/bam/zy_normal.bam -input=~/data/bam/zy_tumor.bam +input=~/data/bam/zy_normal.bam +#input=~/data/bam/zy_tumor.bam #input=~/data/bam/100w.bam +#input=~/data/bam/1kw.sam +#input=~/data/bam/n1kw.sam time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ MarkDuplicates \ @@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ --OUTPUT ~/data/bam/out.bam \ --INDEX_FORMAT BAI \ --num_threads 1 \ - --max_mem 2G \ + --max_mem 1G \ --verbosity DEBUG \ --asyncio true #\ #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" diff --git a/src/common/utils/global_arg.cpp b/src/common/utils/global_arg.cpp index d707dd6..f147eaa 100644 --- a/src/common/utils/global_arg.cpp +++ b/src/common/utils/global_arg.cpp @@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) { mem_arg <<= 20; else if (*q == 'g' || *q == 'G') mem_arg <<= 30; - if (mem_arg >= max_mem) + //if (mem_arg >= max_mem) + if (true) max_mem = mem_arg; else { std::cerr << "[Warn] Too small mem size, use default" << std::endl; diff --git a/src/sam/markdups/markdups.cpp b/src/sam/markdups/markdups.cpp index 7860205..32b48a2 100644 --- a/src/sam/markdups/markdups.cpp +++ b/src/sam/markdups/markdups.cpp @@ -57,6 +57,10 @@ static GlobalDataArg gData_; GlobalDataArg &gData = gData_; DuplicationMetrics gMetrics_; DuplicationMetrics &gMetrics = gMetrics_; + +int zzhtestnum = 0; +set zzhopticalSet; +vector zzhopticalArr; /* * mark duplicate * 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题 @@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) { while (inBuf.ReadStat() >= 0) { Timer tw1; size_t readNum = inBuf.ReadBam(); - cout << "read: " << readNum << endl; + // cout << "read: " << readNum << endl; for (size_t i = 0; i < inBuf.Size(); ++i) { /* 判断是否冗余 */ if (bamIdx == dupIdx) { - // cout << "冗余" << bamIdx << endl; + // cerr << bamIdx << endl; dupIdx = idxQue.Pop(); } if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) { diff --git a/src/sam/markdups/markdups_arg.h b/src/sam/markdups/markdups_arg.h index af5fb41..a308d9e 100644 --- a/src/sam/markdups/markdups_arg.h +++ b/src/sam/markdups/markdups_arg.h @@ -213,7 +213,7 @@ struct MarkDupsArg ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted; /* "The scoring strategy for choosing the non-duplicate among candidates." */ - ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH; + ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES; /* "The program record ID for the @PG record(s) created by this program. Set to null to disable " + "PG record creation. This string may have a suffix appended to avoid collision with other " + diff --git a/src/sam/markdups/md_funcs.cpp b/src/sam/markdups/md_funcs.cpp index 91a5de4..4fed041 100644 --- a/src/sam/markdups/md_funcs.cpp +++ b/src/sam/markdups/md_funcs.cpp @@ -28,8 +28,6 @@ using std::set; using std::unordered_map; using std::vector; -static int zzhtestnum = 0; - /* 清除key位置的数据 */ void clearIdxAtPos(int64_t key, map> *pmsIdx) { auto &msIdx = *pmsIdx; @@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector &readEnds, /* 对找到的pairend read end添加一些信息 */ void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { auto &pairedEnds = *pPairedEnds; + int64_t bamIdx = fragEnd.read1IndexInFile; const int matesRefIndex = fragEnd.read1ReferenceIndex; const int matesCoordinate = fragEnd.read1Coordinate; @@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector &readEndsArr, const R findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags); int opticalDuplicates = 0; for (int i = 0; i < opticalDuplicateFlags.size(); ++i) { + ReadEnds *pRe = const_cast(readEndsArr[i]); if (opticalDuplicateFlags[i]) { ++opticalDuplicates; - ReadEnds *pRe = const_cast(readEndsArr[i]); + // if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) { + // cout << "val: " << pRe->isOpticalDuplicate << endl; + // } pRe->isOpticalDuplicate = true; + zzhopticalSet.insert(pRe->read1IndexInFile); + zzhopticalSet.insert(pRe->read2IndexInFile); + zzhopticalArr.push_back(pRe->read1IndexInFile); + zzhopticalArr.push_back(pRe->read2IndexInFile); + } else { + pRe->isOpticalDuplicate = false; + zzhopticalSet.erase(pRe->read1IndexInFile); + zzhopticalSet.erase(pRe->read2IndexInFile); } } - if (opticalDuplicates > 0) - gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates; return opticalDuplicates; } @@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector &readEndsArr, const R */ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { bool hasFR = false, hasRF = false; + int prevOpticalDupNum = 0; // Check to see if we have a mixture of FR/RF for (auto pRe : readEndsArr) { + if (pRe->isOpticalDuplicate) + ++prevOpticalDupNum; if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) hasFR = true; else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) @@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnd if (nOpticalDup) gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1; + + gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum; + //gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup; + // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl; + // cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl; } \ No newline at end of file diff --git a/src/sam/markdups/serial_md.cpp b/src/sam/markdups/serial_md.cpp index b173c0d..b1895b5 100644 --- a/src/sam/markdups/serial_md.cpp +++ b/src/sam/markdups/serial_md.cpp @@ -13,10 +13,10 @@ #include #include +#include "dup_metrics.h" #include "markdups_arg.h" #include "md_funcs.h" #include "shared_args.h" -#include "dup_metrics.h" using std::cout; using std::set; @@ -78,25 +78,28 @@ static void markDupsForPairs(vector &vpRe, set *dupId } int maxScore = 0; const ReadEnds *pBest = nullptr; + int maxOperateTime = 0; /** All read ends should have orientation FF, FR, RF, or RR **/ - for (auto pe : vpRe) { // 找分数最高的readend + for (auto pe : vpRe) { // 找分数最高的readend + maxOperateTime = max(maxOperateTime, pe->oprateTime); + (const_cast(pe))->oprateTime ++; if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; } } + // cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl; + // if (maxOperateTime == 0) ++zzhtestnum; if (notDupIdx != nullptr) { notDupIdx->insert(pBest->read1IndexInFile); notDupIdx->insert(pBest->read2IndexInFile); } - if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 + if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 // trackOpticalDuplicates trackOpticalDuplicates(vpRe, pBest); } - for (auto pe : vpRe) // 对非best read标记冗余 - { - if (pe != pBest) // 非best - { + for (auto pe : vpRe) { // 对非best read标记冗余 + if (pe != pBest) { // 非best dupIdx->insert(pe->read1IndexInFile); // 添加read1 if (pe->read2IndexInFile != pe->read1IndexInFile) dupIdx->insert(pe->read2IndexInFile); // 添加read2 @@ -139,8 +142,7 @@ static void markDupsForFrags(vector &vpRe, bool containsPairs, /* 找到与readend pos相等的所有readend */ static void getEqualRE(const ReadEnds &re, vector &src, vector *dst) { - auto range = std::equal_range(src.begin(), src.end(), re, - ReadEnds::PairsLittleThan); // 只比对位点 + auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点 dst->insert(dst->end(), range.first, range.second); } @@ -158,24 +160,24 @@ static void generateReadEnds(SerailMarkDupArg *arg) { // set reSet; // ReadEnds lastRe; - for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read + for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read BamWrap *bw = p.bams[i]; const int64_t bamIdx = p.bamStartIdx + i; if (bw->GetReadUnmappedFlag()) { if (bw->b->core.tid == -1) // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). break; - } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对 + } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对 ReadEnds fragEnd; tm_arr[8].acc_start(); buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); tm_arr[8].acc_end(); - p.frags.push_back(fragEnd); // 添加进frag集合 - if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了 + p.frags.push_back(fragEnd); // 添加进frag集合 + if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了 string key = bw->query_name(); if (p.unpairedDic.find(key) == p.unpairedDic.end()) { p.unpairedDic[key] = {p.taskSeq, fragEnd}; - } else { // 找到了pairend + } else { // 找到了pairend auto &pairedEnds = p.unpairedDic.at(key).unpairedRE; modifyPairedEnds(fragEnd, &pairedEnds); p.pairs.push_back(pairedEnds); @@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) { } } tm_arr[9].acc_start(); - //sortReadEndsArr(p.frags); - sort(p.frags.begin(), p.frags.end()); + sortReadEndsArr(p.frags); + // sort(p.frags.begin(), p.frags.end()); tm_arr[9].acc_end(); // cout << "sort pairs" << endl; tm_arr[10].acc_start(); @@ -277,7 +279,7 @@ static inline void getIntersectData(vector &leftArr, vector while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) { leftSpan += 1; - if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了(可能会有bug,上上个也与下一个有交集呢?) + if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了(可能会有bug,上上个也与下一个有交集呢?) leftSpan = leftArr.size() - 1; break; } @@ -285,7 +287,7 @@ static inline void getIntersectData(vector &leftArr, vector while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) { rightSpan += 1; - if (rightSpan == rightArr.size() - 1) // 同上,可能会有bug + if (rightSpan == rightArr.size() - 1) // 同上,可能会有bug break; } dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); @@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur // 1. // prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据; // 2. - // prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read - // pair进行重新计算 - // 复杂情况1. - // g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去 + // prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算 + // 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去 // 3. - // prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read - // pair进行重新计算 - // 复杂情况2. p中是否包含prevPosKey对应的unpair + // prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算 + // 复杂情况2. p中是否包含prevPosKey对应的unpair // 4. - // prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read - // pair进行重新计算 + // prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算 bool addDataToPos = true; if (alreadyAdd.find(ck) != alreadyAdd.end()) { - addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了 + // 之前已经添加过了,后面就不用再添加数据了,因为同一个位置可能找到两个及以上的unpair数据,处理之前的数据时候可能已经添加了这些数据 + addDataToPos = false; } else alreadyAdd.insert(ck); @@ -478,9 +477,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur } recalcPos[ck] = prevPosInfo.taskSeq; std::sort(prevPairArr.begin(), prevPairArr.end()); - } else { // prevpos在交叉部分 - if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况 - if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里 + } else { // prevpos在交叉部分 + if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况 + if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里 auto &nextPairArr = nextUnpairInfoP->pairArr; nextPairArr.push_back(prevFragEnd); auto &prevPairArr = prevUnpairInfoP->pairArr; @@ -490,9 +489,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur } // 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除) recalcPos[ck] = nextPosInfo.taskSeq; - + std::sort(prevPairArr.begin(), prevPairArr.end()); - } else { // next task在该位点没有unpair,那就把数据放到prev task里 + } else { // next task在该位点没有unpair,那就把数据放到prev task里 auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr prevPairArr.push_back(prevFragEnd); if (addDataToPos) // 第二种情况 @@ -515,8 +514,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur std::sort(prevPairArr.begin(), prevPairArr.end()); } } - p.unpairedDic.erase(readName); // 在next task里删除该read - } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read + p.unpairedDic.erase(readName); // 在next task里删除该read + } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read auto &remainPosInfo = g.unpairedDic[readName]; auto remainFragEnd = remainPosInfo.unpairedRE; int64_t remainPosKey = remainFragEnd.posKey; @@ -529,14 +528,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur std::sort(remainPairArr.begin(), remainPairArr.end()); g.unpairedDic.erase(readName); - } else { // 都没找到,那就保存到遗留数据里 + } else { // 都没找到,那就保存到遗留数据里 int64_t prevPosKey = prevFragEnd.posKey; g.unpairedDic.insert(prevUnpair); addToGlobal.insert(prevPosKey); } } - // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据 - for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; map taskChanged; set posProcessed; @@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur if (taskSeq < lp.taskSeq) g.unpairedPosArr.erase(posKey); } - // 更新结果 + // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据 + // 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改(比如optical duplicate) + for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; + + // 更新结果 for (auto &e : taskChanged) { auto taskSeq = e.first; auto &t = e.second; @@ -594,12 +595,12 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { auto &lp = *task; auto &g = *gDataArg; // 遗留的未匹配的pair - for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read + for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read auto &readName = prevUnpair.first; auto &prevPosInfo = prevUnpair.second; auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end - if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read + if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read auto &remainPosInfo = g.unpairedDic[readName]; auto remainFragEnd = remainPosInfo.unpairedRE; int64_t remainPosKey = remainFragEnd.posKey; @@ -719,8 +720,8 @@ void serialMarkDups() { // cout << "round time: " << t_round.seconds_elapsed() << endl; roundNum++; if (roundNum % 100 == 0) { - cout << "read sum: " << readNumSum << endl; - cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; + //cout << "read sum: " << readNumSum << endl; + //cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; } } // cout << "here" << endl; @@ -769,10 +770,10 @@ void serialMarkDups() { cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; - cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" - << gMetrics.NonOpticalDuplicateCountHist << "\t" - << gMetrics.OpticalDuplicatesCountHist << "\t" - << gMetrics.OpticalDuplicatesByLibraryId << endl; + cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t" + << gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl; + cout << "optical dup: " << zzhopticalSet.size() << endl; + cout << "optical arr dup: " << zzhopticalArr.size() << endl; Timer::log_time("serial end "); diff --git a/src/sam/markdups/shared_args.h b/src/sam/markdups/shared_args.h index 3f739ec..d239711 100644 --- a/src/sam/markdups/shared_args.h +++ b/src/sam/markdups/shared_args.h @@ -6,6 +6,8 @@ #include #include #include +#include +using std::set; extern Timer tm_arr[20]; // 用来测试性能 /* 全局本地变量 */ @@ -23,4 +25,8 @@ extern MarkDupsArg &g_mdArg; class GlobalDataArg; extern GlobalDataArg &gData; class DuplicationMetrics; -extern DuplicationMetrics &gMetrics; \ No newline at end of file +extern DuplicationMetrics &gMetrics; + +extern int zzhtestnum; +extern set zzhopticalSet; +extern vector zzhopticalArr; \ No newline at end of file diff --git a/src/sam/utils/read_ends.h b/src/sam/utils/read_ends.h index 25ff0b8..22dbd24 100644 --- a/src/sam/utils/read_ends.h +++ b/src/sam/utils/read_ends.h @@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation { int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << // MAX_CONTIG_LEN_SHIFT | (int64_t)pos; + /* 用来做一些判断,因为一些readends会做多次操作,比如task之间有重叠等等 */ + int oprateTime = 0; + /* 根据pairend read的比对方向,来确定整体的比对方向 */ static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand) {