#include #include /* 存放未匹配readend相同位点的所有readend */ struct UnpairedREInfo { int64_t taskSeq; ReadEnds unpairedRE; }; struct GlobalUnpairedInfo { int64_t taskSeq; vector reArr; }; struct TaskSeqDupInfo { set dupIdx; set opticalDupIdx; set notDupIdx; }; // typedef unordered_map UnpairedNameMap; // typedef unordered_map> UnpairedPositionMap; typedef tsl::robin_map UnpairedNameMap; typedef tsl::robin_map> UnpairedPositionMap; /* 单线程处理冗余参数结构体 */ struct SerailMarkDupArg { int64_t taskSeq; int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置 vector bams; // 存放待处理的bam read vector pairs; vector frags; set pairDupIdx; // pair的冗余read的索引 set pairOpticalDupIdx; // optical冗余read的索引 set fragDupIdx; // frag的冗余read的索引 UnpairedNameMap unpairedDic; // 用来寻找pair end UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储 }; /* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */ struct GlobalDataArg { set pairDupIdx; // pair的冗余read的索引 set pairOpticalDupIdx; // optical冗余read的索引 set notDupIdx; // 不是冗余 //unordered_map unpairedDic; // 用来寻找pair end //unordered_map> unpairedPosArr; UnpairedNameMap unpairedDic; // 用来寻找pair end UnpairedPositionMap unpairedPosArr; // 每个task对应一个vector vector> dupIdxArr; vector> opticalDupIdxArr; }; static GlobalDataArg gData; /* 查找 */ template static inline Itr binaryFind(Itr first, Itr last, const T &val) { first = std::lower_bound(first, last, val); return (first != last && *first == val) ? first : last; } /* 排序 */ static inline void sortReadEndsArr(vector &arr) { size_t blockSize = 64 * 1024; blockSize = min(blockSize, arr.size()); size_t blockNum = (arr.size() + blockSize - 1) / blockSize; size_t crossNum = 1024; size_t start, end, i, left, right; std::sort(arr.begin(), arr.begin() + blockSize); for (i = 1; i < blockNum; ++i) { start = i * blockSize; end = min(start + blockSize, arr.size()); std::sort(arr.begin() + start, arr.begin() + end); left = crossNum; while (!(arr[start - left] < arr[start])) { left = left << 1; if (left >= blockSize) { std::sort(arr.begin(), arr.end()); // 退化到普通排序 return; } } right = min(crossNum, end - start - 1); while (!(arr[start - 1] < arr[start + right])) { right = min(right << 1, end - start - 1); if (right == end - start - 1) break; } // cout << "sort: " << left << ' ' << right << ' ' // << arr[start - left].posKey << ' ' << arr[start - 1].posKey << ' ' // << arr[start].posKey << ' ' << arr[start + right].posKey << endl; std::sort(arr.begin() + start - left, arr.begin() + start + right); } } /* 处理一组pairend的readends,标记冗余 */ static void markDupsForPairs(vector &vpRe, set *dupIdx, set *opticalDupIdx, set *notDupIdx = nullptr) { if (vpRe.size() < 2) { if (vpRe.size() == 1) { // addSingletonToCount(libraryIdGenerator); } return; } int maxScore = 0; const ReadEnds *pBest = nullptr; /** All read ends should have orientation FF, FR, RF, or RR **/ for (auto pe : vpRe) // 找分数最高的readend { if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; } } if (notDupIdx != nullptr) { notDupIdx->insert(pBest->read1IndexInFile); notDupIdx->insert(pBest->read2IndexInFile); } if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余 { // trackOpticalDuplicates } for (auto pe : vpRe) // 对非best read标记冗余 { if (pe != pBest) // 非best { dupIdx->insert(pe->read1IndexInFile); // 添加read1 if (pe->read2IndexInFile != pe->read1IndexInFile) dupIdx->insert(pe->read2IndexInFile); // 添加read2 } } // if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) // { // addRepresentativeReadIndex(vpRe); // } } /* 处理一组非paired的readends,标记冗余 */ static void markDupsForFrags(vector &vpRe, bool containsPairs, set *dupIdx, set *notDupIdx = nullptr) { if (containsPairs) { for (auto pe : vpRe) { if (!pe->IsPaired()) { dupIdx->insert(pe->read1IndexInFile); } } } else { int maxScore = 0; const ReadEnds *pBest = nullptr; for (auto pe : vpRe) { if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; } } if (notDupIdx != nullptr) { notDupIdx->insert(pBest->read1IndexInFile); } for (auto pe : vpRe) { if (pe != pBest) { dupIdx->insert(pe->read1IndexInFile); } } } } /* 找到与readend pos相等的所有readend */ static void getEqualRE(const ReadEnds &re, vector &src, vector *dst) { auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsCmp); dst->insert(dst->end(), range.first, range.second); } /* 单线程生成readends (第一步)*/ static void generateReadEnds(SerailMarkDupArg *arg) { auto &p = *arg; auto &rnParser = g_vRnParser[0]; p.pairs.clear(); p.frags.clear(); p.unpairedDic.clear(); p.unpairedPosArr.clear(); /* 处理每个read,创建ReadEnd,并放入frag和pair中 */ set reSet; ReadEnds lastRe; for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read { BamWrap *bw = p.bams[i]; const int64_t bamIdx = p.bamStartIdx + i; if (bw->GetReadUnmappedFlag()) { if (bw->b->core.tid == -1) // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). break; } else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对 { ReadEnds fragEnd; tm_arr[8].acc_start(); buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); tm_arr[8].acc_end(); p.frags.push_back(fragEnd); // 添加进frag集合 if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了 { string key = bw->query_name(); if (p.unpairedDic.find(key) == p.unpairedDic.end()) { p.unpairedDic[key] = {p.taskSeq, fragEnd}; } else // 找到了pairend { auto &pairedEnds = p.unpairedDic.at(key).unpairedRE; modifyPairedEnds(fragEnd, &pairedEnds); // if (pairedEnds.read1IndexInFile == 94 || pairedEnds.read1IndexInFile == 95) // { // cout << "pair score: " << pairedEnds.read1IndexInFile << ' ' << pairedEnds.score << endl; // } // if (pairedEnds.read1IndexInFile == 94) // { // lastRe = pairedEnds; // } // if (pairedEnds.read1IndexInFile == 95) // { // cout << "compare: " << (lastRe < pairedEnds) << ' ' << (pairedEnds < lastRe) << endl; // // } p.pairs.push_back(pairedEnds); p.unpairedDic.erase(key); // 删除找到的pairend } } } } // cout << "sort frags" << endl; sortReadEndsArr(p.frags); // sort(p.frags.begin(), p.frags.end()); // cout << "sort pairs" << endl; // sortReadEndsArr(p.pairs); sort(p.pairs.begin(), p.pairs.end()); // cout << "unpaired num: " << p.unpairedDic.size() << endl; // 把未匹配的pair对应的每个位点的pairs记录下来 for (auto &e : p.unpairedDic) { auto &unpair = e.second; auto posKey = unpair.unpairedRE.posKey; if (p.unpairedPosArr.find(posKey) == p.unpairedPosArr.end()) getEqualRE(unpair.unpairedRE, p.pairs, &p.unpairedPosArr[posKey]); } } /* 处理pairs */ static void processPairs(vector &readEnds, set *dupIdx, set *opticalDupIdx, set *notDupIdx = nullptr) { vector vpCache; // 有可能是冗余的reads const ReadEnds *pReadEnd = nullptr; for (auto &re : readEnds) { if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 vpCache.push_back(&re); // 处理一个潜在的冗余组 else { markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样 vpCache.clear(); vpCache.push_back(&re); pReadEnd = &re; } } markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); } /* 处理frags */ static void processFrags(vector &readEnds, set *dupIdx, set *notDupIdx = nullptr) { bool containsPairs = false; bool containsFrags = false; vector vpCache; // 有可能是冗余的reads const ReadEnds *pReadEnd = nullptr; for (auto &re : readEnds) { if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) { vpCache.push_back(&re); containsPairs = containsPairs || re.IsPaired(); containsFrags = containsFrags || !re.IsPaired(); } else { if (vpCache.size() > 1 && containsFrags) { markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); } vpCache.clear(); vpCache.push_back(&re); pReadEnd = &re; containsPairs = re.IsPaired(); containsFrags = !re.IsPaired(); } } if (vpCache.size() > 1 && containsFrags) { markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx); } } /* 单线程markdup (第二步)*/ static void markdups(SerailMarkDupArg *arg) { auto &p = *arg; p.pairDupIdx.clear(); p.pairOpticalDupIdx.clear(); p.fragDupIdx.clear(); /* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */ // 先处理 pair processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx); // 再处理frag processFrags(p.frags, &p.fragDupIdx); } /* 获取交叉部分的数据 */ static inline void getIntersectData(vector &leftArr, vector &rightArr, vector *dst) { const size_t leftEndIdx = leftArr.size() - 1; const size_t rightStartIdx = 0; size_t leftSpan = 0; size_t rightSpan = 0; while (!(leftArr[leftEndIdx - leftSpan] < rightArr[rightStartIdx])) { leftSpan += 1; if (leftSpan > leftEndIdx) { leftSpan = leftArr.size() - 1; break; } } while (!(leftArr[leftEndIdx] < rightArr[rightSpan])) { rightSpan += 1; if (rightSpan == rightArr.size() - 1) break; } dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan); std::sort(dst->begin(), dst->end()); } /* 将重叠部分的dup idx放进数据中 */ static inline void refreshFragDupIdx(set &dupIdx, set ¬DupIdx, SerailMarkDupArg * lastArg, SerailMarkDupArg *curArg) { auto &lp = *lastArg; auto &p = *curArg; for (auto idx : dupIdx) { lp.fragDupIdx.insert(idx); p.fragDupIdx.erase(idx); } for (auto idx : notDupIdx) { lp.fragDupIdx.erase(idx); p.fragDupIdx.erase(idx); } } static inline void refreshPairDupIdx(set &dupIdx, set &opticalDupIdx, set ¬DupIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { auto &lp = *lastArg; auto &p = *curArg; for (auto idx : dupIdx) { lp.pairDupIdx.insert(idx); p.pairDupIdx.erase(idx); } for (auto idx : opticalDupIdx) { lp.pairOpticalDupIdx.insert(idx); p.pairOpticalDupIdx.erase(idx); } for (auto idx : notDupIdx) { lp.pairDupIdx.erase(idx); lp.pairOpticalDupIdx.erase(idx); p.pairDupIdx.erase(idx); p.pairOpticalDupIdx.erase(idx); } } /* 处理未匹配的部分 */ static inline void processUnpairedPosForCalc(UnpairedNameMap &lpUnpairedDic, UnpairedPositionMap &lpUnpairedPosArr, UnpairedNameMap &pUnpairedDic, UnpairedPositionMap &pUnpairedPosArr, vector &pairs, map &recalcPos, bool addToLast = false) { recalcPos.clear(); for (auto itr = pUnpairedDic.begin(); itr != pUnpairedDic.end();) { auto &readName = itr->first; if (lpUnpairedDic.find(readName) != lpUnpairedDic.end()) { auto &posInfo = lpUnpairedDic[readName]; auto posKey = posInfo.unpairedRE.posKey; auto &posReArr = lpUnpairedPosArr[posKey]; modifyPairedEnds(itr->second.unpairedRE, &posInfo.unpairedRE); posKey = posInfo.unpairedRE.posKey; if (recalcPos.find(posKey) == recalcPos.end()) // 如果之前没有这个位点 getEqualRE(posInfo.unpairedRE, pairs, &posReArr); recalcPos[posKey] = posInfo.taskSeq; posReArr.push_back(posInfo.unpairedRE); std::sort(posReArr.begin(), posReArr.end()); lpUnpairedDic.erase(readName); itr = pUnpairedDic.erase(itr); } else { if (addToLast) // 将数据添加进遗留数据中 { auto posKey = itr->second.unpairedRE.posKey; lpUnpairedDic[readName] = itr->second; lpUnpairedPosArr[posKey] = pUnpairedPosArr[posKey]; } ++itr; } } } // 用来分别处理dup和optical dup static void refeshTaskDupInfo(vector &addDup, map &ndPosVal, set &dupIdx, set ¬DupIdx, vector &dupArr) { addDup.clear(); ndPosVal.clear(); // 去除之前有的,重复的 for (auto i = dupIdx.begin(); i != dupIdx.end();) { auto itr = binaryFind(dupArr.begin(), dupArr.end(), *i); if (itr != dupArr.end()) { i = dupIdx.erase(i); } else { ++i; } } // 添加现有的 auto di = dupIdx.begin(); for (auto nidx : notDupIdx) { auto itr = binaryFind(dupArr.begin(), dupArr.end(), nidx); if (itr != dupArr.end()) { ndPosVal[itr - dupArr.begin()] = *di++; } } while (di != dupIdx.end()) addDup.push_back(*di++); for (auto pos : ndPosVal) dupArr[pos.first] = pos.second; dupArr.insert(dupArr.end(), addDup.begin(), addDup.end()); std::sort(dupArr.begin(), dupArr.end()); } /* 将遗留的冗余信息添加进对应的任务数据中 */ static void addDupInfoToTask(map &seqTaskChanged, GlobalDataArg *gDataArg) { auto &g = *gDataArg; // 更新遗留的结果 vector addDup; map ndPosVal; for (auto &e : seqTaskChanged) { refeshTaskDupInfo(addDup, ndPosVal, e.second.dupIdx, e.second.notDupIdx, g.dupIdxArr[e.first]); refeshTaskDupInfo(addDup, ndPosVal, e.second.opticalDupIdx, e.second.notDupIdx, g.opticalDupIdxArr[e.first]); } } /* 处理相邻的两个任务,有相交叉的数据 */ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg) { auto &lp = *lastArg; auto &p = *curArg; auto &g = *gDataArg; vector reArr; set dupIdx; set notDupIdx; // 先处理重叠的frags getIntersectData(lp.frags, p.frags, &reArr); processFrags(reArr, &dupIdx, ¬DupIdx); refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p); // 再处理重叠的pairs reArr.clear(); dupIdx.clear(); notDupIdx.clear(); set opticalDupIdx; getIntersectData(lp.pairs, p.pairs, &reArr); processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx); refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p); // 处理之前未匹配的部分 map recalcPos; processUnpairedPosForCalc(lp.unpairedDic, lp.unpairedPosArr, p.unpairedDic, p.unpairedPosArr, p.pairs, recalcPos); for (auto &e : recalcPos) { auto posKey = e.first; dupIdx.clear(); notDupIdx.clear(); opticalDupIdx.clear(); processPairs(lp.unpairedPosArr[posKey], &dupIdx, &opticalDupIdx, ¬DupIdx); refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p); } // 遗留的未匹配的pair processUnpairedPosForCalc(g.unpairedDic, g.unpairedPosArr, lp.unpairedDic, lp.unpairedPosArr, lp.pairs, recalcPos, true); map seqTaskChanged; for (auto &e : recalcPos) { auto posKey = e.first; auto seqNum = e.second; auto &t = seqTaskChanged[seqNum]; // 在对应的任务包含的dup idx里修改结果数据 processPairs(g.unpairedPosArr[posKey], &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx); g.unpairedPosArr.erase(posKey); } addDupInfoToTask(seqTaskChanged, &g); cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; // 将dupidx放进全局数据 g.dupIdxArr.push_back(vector()); auto &vIdx = g.dupIdxArr.back(); lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); g.opticalDupIdxArr.push_back(vector()); auto &vOpticalIdx = g.opticalDupIdxArr.back(); vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); } /* 当所有任务结束后,global data里还有未处理的数据 */ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) { auto &lp = *task; auto &g = *gDataArg; map recalcPos; // 遗留的未匹配的pair processUnpairedPosForCalc(g.unpairedDic, g.unpairedPosArr, lp.unpairedDic, lp.unpairedPosArr, lp.pairs, recalcPos, true); map seqTaskChanged; for (auto &e : recalcPos) { auto posKey = e.first; auto seqNum = e.second; auto &t = seqTaskChanged[seqNum]; // 在对应的任务包含的dup idx里修改结果数据 processPairs(g.unpairedPosArr[posKey], &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx); g.unpairedPosArr.erase(posKey); } // 更新遗留的结果 addDupInfoToTask(seqTaskChanged, &g); cout << "last unpair info: " << g.unpairedPosArr.size() << '\t' << g.unpairedDic.size() << endl; // 将dupidx放进全局数据 g.dupIdxArr.push_back(vector()); auto &vIdx = g.dupIdxArr.back(); lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end()); vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end()); g.opticalDupIdxArr.push_back(vector()); auto &vOpticalIdx = g.opticalDupIdxArr.back(); vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end()); } /* 串行处理数据,标记冗余 */ static void serialMarkDups() { tm_arr[5].acc_start(); Timer::log_time("serial start"); // 读取缓存初始化 BamBufType inBamBuf(g_gArg.use_asyncio); inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem); // BamBufType inBamBuf(false); // inBamBuf.Init(g_inBamFp, g_inBamHeader, 20 * 1024 * 1024); int64_t processedBamNum = 0; SerailMarkDupArg smdArg1, smdArg2; SerailMarkDupArg *lastArgP = &smdArg1; SerailMarkDupArg *curArgP = &smdArg2; bool isFirstRound = true; int roundNum = 0; while (inBamBuf.ReadStat() >= 0) { Timer t_round; // 读取bam文件中的read tm_arr[4].acc_start(); size_t readNum = inBamBuf.ReadBam(); tm_arr[4].acc_end(); cout << "read num: " << readNum << endl; // lastArgP = curArgP; tm_arr[6].acc_start(); curArgP->taskSeq = roundNum; curArgP->bamStartIdx = processedBamNum; curArgP->bams = inBamBuf.GetBamArr(); tm_arr[6].acc_end(); tm_arr[0].acc_start(); Timer t1; generateReadEnds(curArgP); cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl; tm_arr[0].acc_end(); tm_arr[1].acc_start(); t1.reinit(); markdups(curArgP); cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl; tm_arr[1].acc_end(); if (!isFirstRound) { tm_arr[2].acc_start(); t1.reinit(); handleIntersectData(lastArgP, curArgP, &gData); cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl; // addTaskIdxToSet(lastArgP, &gData); tm_arr[2].acc_end(); } else { isFirstRound = false; } inBamBuf.ClearAll(); // 清理上一轮读入的数据 processedBamNum += readNum; // 交换 auto tmp = lastArgP; lastArgP = curArgP; curArgP = tmp; cout << "round time: " << t_round.seconds_elapsed() << endl; roundNum++; if (roundNum > 9){ // break; } } tm_arr[3].acc_start(); // 处理剩下的全局数据 handleLastTask(lastArgP, &gData); tm_arr[3].acc_end(); tm_arr[5].acc_end(); // 统计所有冗余index数量 int64_t dupNum = 0; unordered_set dup; for (auto &arr : gData.dupIdxArr) for (auto idx : arr) dup.insert(idx); dupNum += dup.size(); cout << "dup num : " << dupNum << endl; cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl; cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl; cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl; cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl; cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl; cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl; cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl; cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; Timer::log_time("serial end "); //for (auto i : gData.dupArr) // cout << i << endl; }