2023-12-04 18:02:07 +08:00
|
|
|
|
#include <algorithm>
|
|
|
|
|
|
#include <robin-map/include/tsl/robin_map.h>
|
|
|
|
|
|
|
|
|
|
|
|
/* 存放未匹配readend相同位点的所有readend */
|
|
|
|
|
|
struct UnpairedREInfo
|
|
|
|
|
|
{
|
|
|
|
|
|
int64_t taskSeq;
|
|
|
|
|
|
ReadEnds unpairedRE;
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct GlobalUnpairedInfo
|
|
|
|
|
|
{
|
|
|
|
|
|
int64_t taskSeq;
|
|
|
|
|
|
vector<ReadEnds> reArr;
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/* Per-task corrections collected while re-evaluating read ends that were matched late */
struct TaskSeqDupInfo
{
    std::set<int64_t> dupIdx;        // read indexes that must be marked as duplicates
    std::set<int64_t> opticalDupIdx; // read indexes that must be marked as optical duplicates
    std::set<int64_t> notDupIdx;     // read indexes that turned out not to be duplicates
};
|
|
|
|
|
|
|
|
|
|
|
|
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
|
|
|
|
|
// typedef unordered_map<int64_t, vector<ReadEnds>> UnpairedPositionMap;
|
|
|
|
|
|
|
|
|
|
|
|
// Unmatched read ends, looked up by read name.
using UnpairedNameMap = tsl::robin_map<string, UnpairedREInfo>;
// Read ends grouped under a position key (ReadEnds::posKey).
using UnpairedPositionMap = tsl::robin_map<int64_t, vector<ReadEnds>>;
|
|
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
/* 单线程处理冗余参数结构体 */
|
|
|
|
|
|
struct SerailMarkDupArg
|
|
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
int64_t taskSeq;
|
|
|
|
|
|
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
|
|
|
|
|
vector<BamWrap *> bams; // 存放待处理的bam read
|
|
|
|
|
|
vector<ReadEnds> pairs;
|
|
|
|
|
|
vector<ReadEnds> frags;
|
|
|
|
|
|
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
|
|
|
|
|
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
|
|
|
|
|
set<int64_t> fragDupIdx; // frag的冗余read的索引
|
|
|
|
|
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
|
|
|
|
|
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
2023-11-28 10:45:40 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
|
|
|
|
|
struct GlobalDataArg
|
|
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
|
|
|
|
|
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
|
|
|
|
|
set<int64_t> notDupIdx; // 不是冗余
|
|
|
|
|
|
//unordered_map<string, UnpairedREInfo> unpairedDic; // 用来寻找pair end
|
|
|
|
|
|
//unordered_map<int64_t, vector<ReadEnds>> unpairedPosArr;
|
|
|
|
|
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
|
|
|
|
|
UnpairedPositionMap unpairedPosArr;
|
|
|
|
|
|
|
|
|
|
|
|
// 每个task对应一个vector
|
|
|
|
|
|
vector<vector<int64_t>> dupIdxArr;
|
|
|
|
|
|
vector<vector<int64_t>> opticalDupIdxArr;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// File-local cross-task state; serialMarkDups passes its address to
// handleIntersectData and handleLastTask.
static GlobalDataArg gData;
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
/*
 * Binary search in the sorted range [first, last).
 * Returns an iterator to an element equal to val, or last when there is none.
 */
template<class Itr, class T>
static inline Itr binaryFind(Itr first, Itr last, const T &val)
{
    Itr pos = std::lower_bound(first, last, val);
    if (pos == last)
        return last; // every element is smaller than val
    if (*pos == val)
        return pos;
    return last;
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 排序 */
|
|
|
|
|
|
static inline void sortReadEndsArr(vector<ReadEnds> &arr)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
size_t blockSize = 64 * 1024;
|
|
|
|
|
|
blockSize = min(blockSize, arr.size());
|
|
|
|
|
|
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
|
|
|
|
|
|
size_t crossNum = 1024;
|
|
|
|
|
|
size_t start, end, i, left, right;
|
|
|
|
|
|
std::sort(arr.begin(), arr.begin() + blockSize);
|
|
|
|
|
|
for (i = 1; i < blockNum; ++i)
|
|
|
|
|
|
{
|
|
|
|
|
|
start = i * blockSize;
|
|
|
|
|
|
end = min(start + blockSize, arr.size());
|
|
|
|
|
|
std::sort(arr.begin() + start, arr.begin() + end);
|
|
|
|
|
|
left = crossNum;
|
|
|
|
|
|
while (!(arr[start - left] < arr[start]))
|
|
|
|
|
|
{
|
|
|
|
|
|
left = left << 1;
|
|
|
|
|
|
if (left >= blockSize)
|
|
|
|
|
|
{
|
|
|
|
|
|
std::sort(arr.begin(), arr.end()); // 退化到普通排序
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
right = min(crossNum, end - start - 1);
|
|
|
|
|
|
|
|
|
|
|
|
while (!(arr[start - 1] < arr[start + right]))
|
|
|
|
|
|
{
|
|
|
|
|
|
right = min(right << 1, end - start - 1);
|
|
|
|
|
|
if (right == end - start - 1)
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
// cout << "sort: " << left << ' ' << right << ' '
|
|
|
|
|
|
// << arr[start - left].posKey << ' ' << arr[start - 1].posKey << ' '
|
|
|
|
|
|
// << arr[start].posKey << ' ' << arr[start + right].posKey << endl;
|
|
|
|
|
|
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
|
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 处理一组pairend的readends,标记冗余 */
|
|
|
|
|
|
static void markDupsForPairs(vector<const ReadEnds *> &vpRe,
|
|
|
|
|
|
set<int64_t> *dupIdx,
|
|
|
|
|
|
set<int64_t> *opticalDupIdx,
|
|
|
|
|
|
set<int64_t> *notDupIdx = nullptr)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
if (vpRe.size() < 2)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (vpRe.size() == 1)
|
|
|
|
|
|
{
|
|
|
|
|
|
// addSingletonToCount(libraryIdGenerator);
|
|
|
|
|
|
}
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
int maxScore = 0;
|
|
|
|
|
|
const ReadEnds *pBest = nullptr;
|
|
|
|
|
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
|
|
|
|
|
for (auto pe : vpRe) // 找分数最高的readend
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe->score > maxScore || pBest == nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
maxScore = pe->score;
|
|
|
|
|
|
pBest = pe;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (notDupIdx != nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
notDupIdx->insert(pBest->read1IndexInFile);
|
|
|
|
|
|
notDupIdx->insert(pBest->read2IndexInFile);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
|
|
|
|
|
{
|
|
|
|
|
|
// trackOpticalDuplicates
|
|
|
|
|
|
}
|
|
|
|
|
|
for (auto pe : vpRe) // 对非best read标记冗余
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe != pBest) // 非best
|
|
|
|
|
|
{
|
|
|
|
|
|
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
|
|
|
|
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
|
|
|
|
|
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS)
|
|
|
|
|
|
// {
|
|
|
|
|
|
// addRepresentativeReadIndex(vpRe);
|
|
|
|
|
|
// }
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 处理一组非paired的readends,标记冗余 */
|
|
|
|
|
|
static void markDupsForFrags(vector<const ReadEnds *> &vpRe,
|
|
|
|
|
|
bool containsPairs,
|
|
|
|
|
|
set<int64_t> *dupIdx,
|
|
|
|
|
|
set<int64_t> *notDupIdx = nullptr)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
if (containsPairs)
|
|
|
|
|
|
{
|
|
|
|
|
|
for (auto pe : vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!pe->IsPaired())
|
|
|
|
|
|
{
|
|
|
|
|
|
dupIdx->insert(pe->read1IndexInFile);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
int maxScore = 0;
|
|
|
|
|
|
const ReadEnds *pBest = nullptr;
|
|
|
|
|
|
for (auto pe : vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe->score > maxScore || pBest == nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
maxScore = pe->score;
|
|
|
|
|
|
pBest = pe;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (notDupIdx != nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
notDupIdx->insert(pBest->read1IndexInFile);
|
|
|
|
|
|
}
|
|
|
|
|
|
for (auto pe : vpRe)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pe != pBest)
|
|
|
|
|
|
{
|
|
|
|
|
|
dupIdx->insert(pe->read1IndexInFile);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 找到与readend pos相等的所有readend */
|
|
|
|
|
|
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsCmp);
|
|
|
|
|
|
dst->insert(dst->end(), range.first, range.second);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 单线程生成readends (第一步)*/
|
|
|
|
|
|
static void generateReadEnds(SerailMarkDupArg *arg)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto &p = *arg;
|
|
|
|
|
|
auto &rnParser = g_vRnParser[0];
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
p.pairs.clear();
|
|
|
|
|
|
p.frags.clear();
|
|
|
|
|
|
p.unpairedDic.clear();
|
|
|
|
|
|
p.unpairedPosArr.clear();
|
|
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
/* 处理每个read,创建ReadEnd,并放入frag和pair中 */
|
2023-12-04 18:02:07 +08:00
|
|
|
|
set<ReadEnds> reSet;
|
|
|
|
|
|
|
|
|
|
|
|
ReadEnds lastRe;
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < p.bams.size(); ++i) // 循环处理每个read
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
BamWrap *bw = p.bams[i];
|
2023-11-28 10:45:40 +08:00
|
|
|
|
const int64_t bamIdx = p.bamStartIdx + i;
|
|
|
|
|
|
if (bw->GetReadUnmappedFlag())
|
|
|
|
|
|
{
|
|
|
|
|
|
if (bw->b->core.tid == -1)
|
|
|
|
|
|
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
else if (!bw->IsSecondaryOrSupplementary()) // 是主要比对
|
|
|
|
|
|
{
|
|
|
|
|
|
ReadEnds fragEnd;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
tm_arr[8].acc_start();
|
2023-11-28 10:45:40 +08:00
|
|
|
|
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
2023-12-04 18:02:07 +08:00
|
|
|
|
tm_arr[8].acc_end();
|
|
|
|
|
|
p.frags.push_back(fragEnd); // 添加进frag集合
|
2023-11-28 10:45:40 +08:00
|
|
|
|
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) // 是pairend而且互补的read也比对上了
|
|
|
|
|
|
{
|
|
|
|
|
|
string key = bw->query_name();
|
|
|
|
|
|
if (p.unpairedDic.find(key) == p.unpairedDic.end())
|
|
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
else // 找到了pairend
|
|
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
modifyPairedEnds(fragEnd, &pairedEnds);
|
2023-12-04 18:02:07 +08:00
|
|
|
|
// if (pairedEnds.read1IndexInFile == 94 || pairedEnds.read1IndexInFile == 95)
|
|
|
|
|
|
// {
|
|
|
|
|
|
// cout << "pair score: " << pairedEnds.read1IndexInFile << ' ' << pairedEnds.score << endl;
|
|
|
|
|
|
// }
|
|
|
|
|
|
// if (pairedEnds.read1IndexInFile == 94)
|
|
|
|
|
|
// {
|
|
|
|
|
|
// lastRe = pairedEnds;
|
|
|
|
|
|
// }
|
|
|
|
|
|
// if (pairedEnds.read1IndexInFile == 95)
|
|
|
|
|
|
// {
|
|
|
|
|
|
// cout << "compare: " << (lastRe < pairedEnds) << ' ' << (pairedEnds < lastRe) << endl;
|
|
|
|
|
|
//
|
|
|
|
|
|
// }
|
|
|
|
|
|
p.pairs.push_back(pairedEnds);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
p.unpairedDic.erase(key); // 删除找到的pairend
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2023-12-04 18:02:07 +08:00
|
|
|
|
// cout << "sort frags" << endl;
|
|
|
|
|
|
sortReadEndsArr(p.frags);
|
|
|
|
|
|
// sort(p.frags.begin(), p.frags.end());
|
|
|
|
|
|
// cout << "sort pairs" << endl;
|
|
|
|
|
|
// sortReadEndsArr(p.pairs);
|
|
|
|
|
|
sort(p.pairs.begin(), p.pairs.end());
|
|
|
|
|
|
// cout << "unpaired num: " << p.unpairedDic.size() << endl;
|
|
|
|
|
|
|
|
|
|
|
|
// 把未匹配的pair对应的每个位点的pairs记录下来
|
|
|
|
|
|
for (auto &e : p.unpairedDic) {
|
|
|
|
|
|
auto &unpair = e.second;
|
|
|
|
|
|
auto posKey = unpair.unpairedRE.posKey;
|
|
|
|
|
|
if (p.unpairedPosArr.find(posKey) == p.unpairedPosArr.end())
|
|
|
|
|
|
getEqualRE(unpair.unpairedRE, p.pairs, &p.unpairedPosArr[posKey]);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 处理pairs */
|
|
|
|
|
|
static void processPairs(vector<ReadEnds> &readEnds,
|
|
|
|
|
|
set<int64_t> *dupIdx,
|
|
|
|
|
|
set<int64_t> *opticalDupIdx,
|
|
|
|
|
|
set<int64_t> *notDupIdx = nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
|
|
|
|
|
const ReadEnds *pReadEnd = nullptr;
|
|
|
|
|
|
for (auto &re : readEnds)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
|
|
|
|
|
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx); // 不一样
|
|
|
|
|
|
vpCache.clear();
|
|
|
|
|
|
vpCache.push_back(&re);
|
|
|
|
|
|
pReadEnd = &re;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, notDupIdx);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 处理frags */
|
|
|
|
|
|
static void processFrags(vector<ReadEnds> &readEnds,
|
|
|
|
|
|
set<int64_t> *dupIdx,
|
|
|
|
|
|
set<int64_t> *notDupIdx = nullptr)
|
|
|
|
|
|
{
|
|
|
|
|
|
bool containsPairs = false;
|
|
|
|
|
|
bool containsFrags = false;
|
|
|
|
|
|
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
|
|
|
|
|
|
const ReadEnds *pReadEnd = nullptr;
|
|
|
|
|
|
for (auto &re : readEnds)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false))
|
|
|
|
|
|
{
|
|
|
|
|
|
vpCache.push_back(&re);
|
|
|
|
|
|
containsPairs = containsPairs || re.IsPaired();
|
|
|
|
|
|
containsFrags = containsFrags || !re.IsPaired();
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
if (vpCache.size() > 1 && containsFrags)
|
|
|
|
|
|
{
|
|
|
|
|
|
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
|
|
|
|
|
}
|
|
|
|
|
|
vpCache.clear();
|
|
|
|
|
|
vpCache.push_back(&re);
|
|
|
|
|
|
pReadEnd = &re;
|
|
|
|
|
|
containsPairs = re.IsPaired();
|
|
|
|
|
|
containsFrags = !re.IsPaired();
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (vpCache.size() > 1 && containsFrags)
|
|
|
|
|
|
{
|
|
|
|
|
|
markDupsForFrags(vpCache, containsPairs, dupIdx, notDupIdx);
|
|
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 单线程markdup (第二步)*/
|
|
|
|
|
|
static void markdups(SerailMarkDupArg *arg)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto &p = *arg;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
p.pairDupIdx.clear();
|
|
|
|
|
|
p.pairOpticalDupIdx.clear();
|
|
|
|
|
|
p.fragDupIdx.clear();
|
2023-11-28 10:45:40 +08:00
|
|
|
|
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
|
|
|
|
|
// 先处理 pair
|
2023-12-04 18:02:07 +08:00
|
|
|
|
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
|
|
|
|
|
|
// 再处理frag
|
2023-12-04 18:02:07 +08:00
|
|
|
|
processFrags(p.frags, &p.fragDupIdx);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 获取交叉部分的数据 */
|
|
|
|
|
|
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
const size_t leftEndIdx = leftArr.size() - 1;
|
|
|
|
|
|
const size_t rightStartIdx = 0;
|
|
|
|
|
|
size_t leftSpan = 0;
|
|
|
|
|
|
size_t rightSpan = 0;
|
|
|
|
|
|
|
|
|
|
|
|
while (!(leftArr[leftEndIdx - leftSpan] < rightArr[rightStartIdx]))
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
leftSpan += 1;
|
|
|
|
|
|
if (leftSpan > leftEndIdx)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
leftSpan = leftArr.size() - 1;
|
|
|
|
|
|
break;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
while (!(leftArr[leftEndIdx] < rightArr[rightSpan]))
|
|
|
|
|
|
{
|
|
|
|
|
|
rightSpan += 1;
|
|
|
|
|
|
if (rightSpan == rightArr.size() - 1)
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
|
|
|
|
|
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
|
|
|
|
|
std::sort(dst->begin(), dst->end());
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 将重叠部分的dup idx放进数据中 */
|
|
|
|
|
|
static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
|
|
|
|
|
|
set<int64_t> ¬DupIdx,
|
|
|
|
|
|
SerailMarkDupArg * lastArg,
|
|
|
|
|
|
SerailMarkDupArg *curArg)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
|
|
|
|
|
auto &lp = *lastArg;
|
|
|
|
|
|
auto &p = *curArg;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
for (auto idx : dupIdx)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
lp.fragDupIdx.insert(idx);
|
|
|
|
|
|
p.fragDupIdx.erase(idx);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
2023-12-04 18:02:07 +08:00
|
|
|
|
for (auto idx : notDupIdx)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
lp.fragDupIdx.erase(idx);
|
|
|
|
|
|
p.fragDupIdx.erase(idx);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
2023-12-04 18:02:07 +08:00
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
|
|
|
|
|
|
set<int64_t> &opticalDupIdx,
|
|
|
|
|
|
set<int64_t> ¬DupIdx,
|
|
|
|
|
|
SerailMarkDupArg *lastArg,
|
|
|
|
|
|
SerailMarkDupArg *curArg)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto &lp = *lastArg;
|
|
|
|
|
|
auto &p = *curArg;
|
|
|
|
|
|
for (auto idx : dupIdx)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
lp.pairDupIdx.insert(idx);
|
|
|
|
|
|
p.pairDupIdx.erase(idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
for (auto idx : opticalDupIdx)
|
|
|
|
|
|
{
|
|
|
|
|
|
lp.pairOpticalDupIdx.insert(idx);
|
|
|
|
|
|
p.pairOpticalDupIdx.erase(idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
for (auto idx : notDupIdx)
|
|
|
|
|
|
{
|
|
|
|
|
|
lp.pairDupIdx.erase(idx);
|
|
|
|
|
|
lp.pairOpticalDupIdx.erase(idx);
|
|
|
|
|
|
p.pairDupIdx.erase(idx);
|
|
|
|
|
|
p.pairOpticalDupIdx.erase(idx);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 处理未匹配的部分 */
|
|
|
|
|
|
static inline void processUnpairedPosForCalc(UnpairedNameMap &lpUnpairedDic,
|
|
|
|
|
|
UnpairedPositionMap &lpUnpairedPosArr,
|
|
|
|
|
|
UnpairedNameMap &pUnpairedDic,
|
|
|
|
|
|
UnpairedPositionMap &pUnpairedPosArr,
|
|
|
|
|
|
vector<ReadEnds> &pairs,
|
|
|
|
|
|
map<int64_t, int64_t> &recalcPos,
|
|
|
|
|
|
bool addToLast = false)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
recalcPos.clear();
|
|
|
|
|
|
for (auto itr = pUnpairedDic.begin(); itr != pUnpairedDic.end();)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
auto &readName = itr->first;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
if (lpUnpairedDic.find(readName) != lpUnpairedDic.end())
|
|
|
|
|
|
{
|
|
|
|
|
|
auto &posInfo = lpUnpairedDic[readName];
|
|
|
|
|
|
auto posKey = posInfo.unpairedRE.posKey;
|
|
|
|
|
|
auto &posReArr = lpUnpairedPosArr[posKey];
|
|
|
|
|
|
|
|
|
|
|
|
modifyPairedEnds(itr->second.unpairedRE, &posInfo.unpairedRE);
|
|
|
|
|
|
posKey = posInfo.unpairedRE.posKey;
|
|
|
|
|
|
if (recalcPos.find(posKey) == recalcPos.end()) // 如果之前没有这个位点
|
|
|
|
|
|
getEqualRE(posInfo.unpairedRE, pairs, &posReArr);
|
|
|
|
|
|
recalcPos[posKey] = posInfo.taskSeq;
|
|
|
|
|
|
posReArr.push_back(posInfo.unpairedRE);
|
|
|
|
|
|
std::sort(posReArr.begin(), posReArr.end());
|
|
|
|
|
|
|
|
|
|
|
|
lpUnpairedDic.erase(readName);
|
|
|
|
|
|
itr = pUnpairedDic.erase(itr);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
2023-12-04 18:02:07 +08:00
|
|
|
|
else
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
if (addToLast) // 将数据添加进遗留数据中
|
|
|
|
|
|
{
|
|
|
|
|
|
auto posKey = itr->second.unpairedRE.posKey;
|
|
|
|
|
|
lpUnpairedDic[readName] = itr->second;
|
|
|
|
|
|
lpUnpairedPosArr[posKey] = pUnpairedPosArr[posKey];
|
|
|
|
|
|
}
|
|
|
|
|
|
++itr;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/*
 * Apply late corrections to the sorted duplicate-index array of one task. Used once
 * for ordinary duplicates and once for optical duplicates.
 *
 * addDup     scratch buffer; on return holds the indexes that were appended
 * ndPosVal   scratch buffer; on return maps a slot of dupArr to the new value written into it
 * dupIdx     indexes to add; entries already present in dupArr are removed from this set
 * notDupIdx  indexes that are no longer duplicates; they are removed from dupArr
 * dupArr     sorted array to update; sorted again on return
 *
 * Slots freed by notDupIdx are reused for new duplicates where possible. Freed slots
 * that cannot be refilled are erased, and new duplicates that find no free slot are
 * appended.
 */
static void refeshTaskDupInfo(std::vector<int64_t> &addDup,
                              std::map<int64_t, int64_t> &ndPosVal,
                              std::set<int64_t> &dupIdx,
                              std::set<int64_t> &notDupIdx,
                              std::vector<int64_t> &dupArr)
{
    addDup.clear();
    ndPosVal.clear();

    // Drop the indexes that are recorded already.
    for (auto it = dupIdx.begin(); it != dupIdx.end();)
    {
        if (std::binary_search(dupArr.begin(), dupArr.end(), *it))
            it = dupIdx.erase(it);
        else
            ++it;
    }

    // Pair every freed slot with a new duplicate while new duplicates are left.
    auto di = dupIdx.begin();
    std::vector<int64_t> staleSlots; // freed slots without a replacement, in ascending order
    for (auto nidx : notDupIdx)
    {
        auto slot = std::lower_bound(dupArr.begin(), dupArr.end(), nidx);
        if (slot == dupArr.end() || *slot != nidx)
            continue;
        const int64_t pos = slot - dupArr.begin();
        if (di != dupIdx.end())
            ndPosVal[pos] = *di++;
        else
            staleSlots.push_back(pos);
    }

    // New duplicates that found no slot are appended.
    while (di != dupIdx.end())
        addDup.push_back(*di++);

    for (const auto &slot : ndPosVal)
        dupArr[slot.first] = slot.second;
    // Erase from the back so that the remaining slot positions stay valid.
    for (auto r = staleSlots.rbegin(); r != staleSlots.rend(); ++r)
        dupArr.erase(dupArr.begin() + *r);
    dupArr.insert(dupArr.end(), addDup.begin(), addDup.end());
    std::sort(dupArr.begin(), dupArr.end());
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 将遗留的冗余信息添加进对应的任务数据中 */
|
|
|
|
|
|
static void addDupInfoToTask(map<int64_t, TaskSeqDupInfo> &seqTaskChanged, GlobalDataArg *gDataArg)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
auto &g = *gDataArg;
|
|
|
|
|
|
// 更新遗留的结果
|
|
|
|
|
|
vector<int64_t> addDup;
|
|
|
|
|
|
map<int64_t, int64_t> ndPosVal;
|
|
|
|
|
|
for (auto &e : seqTaskChanged)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
refeshTaskDupInfo(addDup, ndPosVal, e.second.dupIdx, e.second.notDupIdx, g.dupIdxArr[e.first]);
|
|
|
|
|
|
refeshTaskDupInfo(addDup, ndPosVal, e.second.opticalDupIdx, e.second.notDupIdx, g.opticalDupIdxArr[e.first]);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 处理相邻的两个任务,有相交叉的数据 */
|
|
|
|
|
|
static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg, GlobalDataArg *gDataArg)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
auto &lp = *lastArg;
|
|
|
|
|
|
auto &p = *curArg;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
auto &g = *gDataArg;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
vector<ReadEnds> reArr;
|
|
|
|
|
|
set<int64_t> dupIdx;
|
|
|
|
|
|
set<int64_t> notDupIdx;
|
|
|
|
|
|
// 先处理重叠的frags
|
|
|
|
|
|
getIntersectData(lp.frags, p.frags, &reArr);
|
|
|
|
|
|
processFrags(reArr, &dupIdx, ¬DupIdx);
|
|
|
|
|
|
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
|
|
|
|
|
|
|
|
|
|
|
|
// 再处理重叠的pairs
|
|
|
|
|
|
reArr.clear();
|
|
|
|
|
|
dupIdx.clear();
|
|
|
|
|
|
notDupIdx.clear();
|
|
|
|
|
|
set<int64_t> opticalDupIdx;
|
|
|
|
|
|
getIntersectData(lp.pairs, p.pairs, &reArr);
|
|
|
|
|
|
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
|
|
|
|
|
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
|
|
|
|
|
|
|
|
|
|
|
// 处理之前未匹配的部分
|
|
|
|
|
|
map<int64_t, int64_t> recalcPos;
|
|
|
|
|
|
processUnpairedPosForCalc(lp.unpairedDic,
|
|
|
|
|
|
lp.unpairedPosArr,
|
|
|
|
|
|
p.unpairedDic,
|
|
|
|
|
|
p.unpairedPosArr,
|
|
|
|
|
|
p.pairs,
|
|
|
|
|
|
recalcPos);
|
|
|
|
|
|
for (auto &e : recalcPos)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto posKey = e.first;
|
|
|
|
|
|
dupIdx.clear();
|
|
|
|
|
|
notDupIdx.clear();
|
|
|
|
|
|
opticalDupIdx.clear();
|
|
|
|
|
|
processPairs(lp.unpairedPosArr[posKey], &dupIdx, &opticalDupIdx, ¬DupIdx);
|
|
|
|
|
|
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 遗留的未匹配的pair
|
|
|
|
|
|
processUnpairedPosForCalc(g.unpairedDic,
|
|
|
|
|
|
g.unpairedPosArr,
|
|
|
|
|
|
lp.unpairedDic,
|
|
|
|
|
|
lp.unpairedPosArr,
|
|
|
|
|
|
lp.pairs,
|
|
|
|
|
|
recalcPos,
|
|
|
|
|
|
true);
|
|
|
|
|
|
map<int64_t, TaskSeqDupInfo> seqTaskChanged;
|
|
|
|
|
|
for (auto &e : recalcPos)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto posKey = e.first;
|
|
|
|
|
|
auto seqNum = e.second;
|
|
|
|
|
|
auto &t = seqTaskChanged[seqNum];
|
|
|
|
|
|
// 在对应的任务包含的dup idx里修改结果数据
|
|
|
|
|
|
processPairs(g.unpairedPosArr[posKey], &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
|
|
|
|
|
g.unpairedPosArr.erase(posKey);
|
|
|
|
|
|
}
|
|
|
|
|
|
addDupInfoToTask(seqTaskChanged, &g);
|
|
|
|
|
|
|
|
|
|
|
|
cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
|
|
|
|
|
|
|
|
|
|
|
// 将dupidx放进全局数据
|
|
|
|
|
|
g.dupIdxArr.push_back(vector<int64_t>());
|
|
|
|
|
|
auto &vIdx = g.dupIdxArr.back();
|
|
|
|
|
|
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
|
|
|
|
|
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
|
|
|
|
|
|
|
|
|
|
|
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
|
|
|
|
|
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
|
|
|
|
|
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
/* 当所有任务结束后,global data里还有未处理的数据 */
|
|
|
|
|
|
static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
auto &lp = *task;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
auto &g = *gDataArg;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
map<int64_t, int64_t> recalcPos;
|
|
|
|
|
|
// 遗留的未匹配的pair
|
|
|
|
|
|
processUnpairedPosForCalc(g.unpairedDic,
|
|
|
|
|
|
g.unpairedPosArr,
|
|
|
|
|
|
lp.unpairedDic,
|
|
|
|
|
|
lp.unpairedPosArr,
|
|
|
|
|
|
lp.pairs,
|
|
|
|
|
|
recalcPos,
|
|
|
|
|
|
true);
|
|
|
|
|
|
map<int64_t, TaskSeqDupInfo> seqTaskChanged;
|
|
|
|
|
|
for (auto &e : recalcPos)
|
|
|
|
|
|
{
|
|
|
|
|
|
auto posKey = e.first;
|
|
|
|
|
|
auto seqNum = e.second;
|
|
|
|
|
|
auto &t = seqTaskChanged[seqNum];
|
|
|
|
|
|
// 在对应的任务包含的dup idx里修改结果数据
|
|
|
|
|
|
processPairs(g.unpairedPosArr[posKey], &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
|
|
|
|
|
g.unpairedPosArr.erase(posKey);
|
|
|
|
|
|
}
|
|
|
|
|
|
// 更新遗留的结果
|
|
|
|
|
|
addDupInfoToTask(seqTaskChanged, &g);
|
|
|
|
|
|
|
|
|
|
|
|
cout << "last unpair info: " << g.unpairedPosArr.size() << '\t' << g.unpairedDic.size() << endl;
|
|
|
|
|
|
|
|
|
|
|
|
// 将dupidx放进全局数据
|
|
|
|
|
|
g.dupIdxArr.push_back(vector<int64_t>());
|
|
|
|
|
|
auto &vIdx = g.dupIdxArr.back();
|
|
|
|
|
|
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
|
|
|
|
|
vIdx.insert(vIdx.end(), lp.pairDupIdx.begin(), lp.pairDupIdx.end());
|
|
|
|
|
|
|
|
|
|
|
|
g.opticalDupIdxArr.push_back(vector<int64_t>());
|
|
|
|
|
|
auto &vOpticalIdx = g.opticalDupIdxArr.back();
|
|
|
|
|
|
vOpticalIdx.insert(vOpticalIdx.end(), lp.pairOpticalDupIdx.begin(), lp.pairOpticalDupIdx.end());
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 串行处理数据,标记冗余 */
|
|
|
|
|
|
static void serialMarkDups()
|
|
|
|
|
|
{
|
|
|
|
|
|
tm_arr[5].acc_start();
|
|
|
|
|
|
Timer::log_time("serial start");
|
|
|
|
|
|
// 读取缓存初始化
|
|
|
|
|
|
BamBufType inBamBuf(g_gArg.use_asyncio);
|
|
|
|
|
|
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
|
|
|
|
|
// BamBufType inBamBuf(false);
|
2023-12-04 18:02:07 +08:00
|
|
|
|
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 20 * 1024 * 1024);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
int64_t processedBamNum = 0;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
SerailMarkDupArg smdArg1, smdArg2;
|
|
|
|
|
|
SerailMarkDupArg *lastArgP = &smdArg1;
|
|
|
|
|
|
SerailMarkDupArg *curArgP = &smdArg2;
|
|
|
|
|
|
|
|
|
|
|
|
bool isFirstRound = true;
|
|
|
|
|
|
int roundNum = 0;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
while (inBamBuf.ReadStat() >= 0)
|
|
|
|
|
|
{
|
2023-12-04 18:02:07 +08:00
|
|
|
|
Timer t_round;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
// 读取bam文件中的read
|
|
|
|
|
|
tm_arr[4].acc_start();
|
|
|
|
|
|
size_t readNum = inBamBuf.ReadBam();
|
|
|
|
|
|
tm_arr[4].acc_end();
|
|
|
|
|
|
cout << "read num: " << readNum << endl;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
// lastArgP = curArgP;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[6].acc_start();
|
2023-12-04 18:02:07 +08:00
|
|
|
|
curArgP->taskSeq = roundNum;
|
|
|
|
|
|
curArgP->bamStartIdx = processedBamNum;
|
|
|
|
|
|
curArgP->bams = inBamBuf.GetBamArr();
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[6].acc_end();
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[0].acc_start();
|
2023-12-04 18:02:07 +08:00
|
|
|
|
Timer t1;
|
|
|
|
|
|
generateReadEnds(curArgP);
|
|
|
|
|
|
cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[0].acc_end();
|
|
|
|
|
|
|
|
|
|
|
|
tm_arr[1].acc_start();
|
2023-12-04 18:02:07 +08:00
|
|
|
|
t1.reinit();
|
|
|
|
|
|
markdups(curArgP);
|
|
|
|
|
|
cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[1].acc_end();
|
|
|
|
|
|
|
2023-12-04 18:02:07 +08:00
|
|
|
|
if (!isFirstRound)
|
2023-11-28 10:45:40 +08:00
|
|
|
|
{
|
|
|
|
|
|
tm_arr[2].acc_start();
|
2023-12-04 18:02:07 +08:00
|
|
|
|
t1.reinit();
|
|
|
|
|
|
handleIntersectData(lastArgP, curArgP, &gData);
|
|
|
|
|
|
cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
|
|
|
|
|
|
// addTaskIdxToSet(lastArgP, &gData);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[2].acc_end();
|
2023-12-04 18:02:07 +08:00
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
isFirstRound = false;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
|
|
|
|
|
processedBamNum += readNum;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
// 交换
|
|
|
|
|
|
auto tmp = lastArgP;
|
|
|
|
|
|
lastArgP = curArgP;
|
|
|
|
|
|
curArgP = tmp;
|
|
|
|
|
|
cout << "round time: " << t_round.seconds_elapsed() << endl;
|
|
|
|
|
|
roundNum++;
|
|
|
|
|
|
if (roundNum > 9){
|
|
|
|
|
|
// break;
|
|
|
|
|
|
}
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
tm_arr[3].acc_start();
|
|
|
|
|
|
// 处理剩下的全局数据
|
2023-12-04 18:02:07 +08:00
|
|
|
|
handleLastTask(lastArgP, &gData);
|
2023-11-28 10:45:40 +08:00
|
|
|
|
tm_arr[3].acc_end();
|
|
|
|
|
|
|
|
|
|
|
|
tm_arr[5].acc_end();
|
|
|
|
|
|
// 统计所有冗余index数量
|
2023-12-04 18:02:07 +08:00
|
|
|
|
int64_t dupNum = 0;
|
|
|
|
|
|
unordered_set<int64_t> dup;
|
|
|
|
|
|
for (auto &arr : gData.dupIdxArr)
|
|
|
|
|
|
for (auto idx : arr)
|
|
|
|
|
|
dup.insert(idx);
|
|
|
|
|
|
dupNum += dup.size();
|
|
|
|
|
|
cout << "dup num : " << dupNum << endl;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
|
|
|
|
|
|
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "handle tail : " << tm_arr[2].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "handle last : " << tm_arr[3].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "read bam : " << tm_arr[4].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
|
|
|
|
|
|
|
|
|
|
|
Timer::log_time("serial end ");
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
//for (auto i : gData.dupArr)
|
|
|
|
|
|
// cout << i << endl;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
}
|