2024-08-22 02:28:36 +08:00
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
#include <common/hts/bam_buf.h>
|
2023-12-04 18:02:07 +08:00
|
|
|
|
#include <robin-map/include/tsl/robin_map.h>
|
2024-09-04 11:08:03 +08:00
|
|
|
|
#include <robin-map/include/tsl/robin_set.h>
|
2024-08-22 02:28:36 +08:00
|
|
|
|
#include <sam/utils/read_ends.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include <set>
|
|
|
|
|
|
#include <string>
|
|
|
|
|
|
#include <vector>
|
2024-09-04 11:08:03 +08:00
|
|
|
|
#include <unordered_set>
|
2024-08-22 02:28:36 +08:00
|
|
|
|
|
|
|
|
|
|
using std::set;
|
2024-09-04 11:08:03 +08:00
|
|
|
|
using std::unordered_set;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
using std::string;
|
|
|
|
|
|
using std::vector;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
|
|
|
|
|
/* 存放未匹配readend相同位点的所有readend */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
struct UnpairedREInfo {
|
2023-12-04 18:02:07 +08:00
|
|
|
|
int64_t taskSeq;
|
|
|
|
|
|
ReadEnds unpairedRE;
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2023-12-08 03:57:30 +08:00
|
|
|
|
/*
 * For one read pair, a complete calculation point: the alignment position of read1
 * together with the alignment position of read2.
 */
struct CalcKey {
    int64_t read1Pos;
    int64_t read2Pos;

    /*
     * Orders keys by read1Pos, then by read2Pos.
     * The 64-bit positions are compared directly. Subtracting them and narrowing the
     * difference to int would drop the high bits, which makes positions that differ by a
     * multiple of 2^32 look equal and can flip the sign of large differences, breaking
     * the strict weak ordering that ordered containers and std::sort rely on.
     */
    bool operator<(const CalcKey &o) const {
        if (read1Pos != o.read1Pos)
            return read1Pos < o.read1Pos;
        return read2Pos < o.read2Pos;
    }

    /* Two keys are equal when both positions match. */
    bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }

    /*
     * Hash of a key, shared by CalcKey::operator() and CalcKeyHash so both always agree.
     * The two field hashes are mixed with an order-sensitive combine instead of a plain
     * XOR: XOR maps (a, b) and (b, a) to the same value, maps every (a, a) to 0, and
     * yields only small values when the two positions are close to each other.
     */
    static std::size_t hashOf(const CalcKey &key) {
        const std::size_t h1 = std::hash<int64_t>()(key.read1Pos);
        const std::size_t h2 = std::hash<int64_t>()(key.read2Pos);
        return h1 ^ (h2 + static_cast<std::size_t>(0x9e3779b97f4a7c15ULL) + (h1 << 6) + (h1 >> 2));
    }

    /*
     * Allows a CalcKey object to be used as a hash functor.
     * Hashes the argument `o`, not the object the operator is invoked on.
     */
    std::size_t operator()(const CalcKey &o) const { return hashOf(o); }
};

/* Stand-alone hash functor for CalcKey (used as the hasher of CalcSet). */
struct CalcKeyHash {
    std::size_t operator()(const CalcKey &o) const { return CalcKey::hashOf(o); }
};
|
|
|
|
|
|
|
2024-08-29 16:40:52 +08:00
|
|
|
|
/* Records the information associated with a duplicate read index. */
struct DupInfo {
    int64_t idx;
    int64_t repIdx = 0;  // index of the representative (non-duplicate) read of this batch of duplicates
    int16_t dupSet = 0;  // dup set size

    /*
     * Narrows an int to int16_t by saturating at the int16_t limits.
     * dupSet is stored in 16 bits; an unchecked conversion would wrap a size above
     * 32767 around to a negative number.
     */
    static int16_t saturateToInt16(int value) {
        if (value > INT16_MAX)
            return INT16_MAX;
        if (value < INT16_MIN)
            return INT16_MIN;
        return static_cast<int16_t>(value);
    }

    /* Default state: idx = -1, repIdx = 0, dupSet = 0. */
    DupInfo() : DupInfo(-1, 0, 0) {}

    /* Non-explicit, so a bare int64_t index converts implicitly to a DupInfo. */
    DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}

    /* dupSet_ is clamped into the int16_t range before being stored. */
    DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_)
        : idx(idx_), repIdx(repIdx_), dupSet(saturateToInt16(dupSet_)) {}

    /* Ordering considers idx only; repIdx and dupSet are ignored. */
    bool operator<(const DupInfo &o) const { return idx < o.idx; }
    bool operator>(const DupInfo &o) const { return idx > o.idx; }

    /* Implicit conversion back to the plain index. */
    operator int64_t() const { return idx; }
};
|
|
|
|
|
|
|
2024-09-04 11:08:03 +08:00
|
|
|
|
struct DupInfoHash {
|
|
|
|
|
|
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct DupInfoEqual {
|
|
|
|
|
|
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
|
|
|
|
|
|
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
|
|
|
|
|
|
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
|
|
// using MDSet = set<T>;
|
|
|
|
|
|
// using MDSet = unordered_set<T>;
|
|
|
|
|
|
using MDSet = tsl::robin_set<T>;
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
|
// using DPSet = set<T>;
|
|
|
|
|
|
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
|
|
|
|
|
|
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
|
//using CalcSet = set<T>;
|
|
|
|
|
|
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
|
|
|
|
|
|
|
2023-12-08 03:57:30 +08:00
|
|
|
|
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
struct TaskSeqDupInfo {
|
2024-09-04 11:08:03 +08:00
|
|
|
|
DPSet<DupInfo> dupIdx;
|
|
|
|
|
|
MDSet<int64_t> opticalDupIdx;
|
|
|
|
|
|
DPSet<DupInfo> repIdx;
|
|
|
|
|
|
MDSet<int64_t> notDupIdx;
|
|
|
|
|
|
MDSet<int64_t> notOpticalDupIdx;
|
|
|
|
|
|
MDSet<int64_t> notRepIdx;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
2023-12-08 03:57:30 +08:00
|
|
|
|
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
struct UnpairedPosInfo {
|
2023-12-08 03:57:30 +08:00
|
|
|
|
int unpairedNum = 0;
|
|
|
|
|
|
int64_t taskSeq;
|
|
|
|
|
|
vector<ReadEnds> pairArr;
|
2024-09-04 11:08:03 +08:00
|
|
|
|
MDSet<string> readNameSet;
|
2023-12-08 03:57:30 +08:00
|
|
|
|
};
|
2023-12-04 18:02:07 +08:00
|
|
|
|
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
2023-12-08 03:57:30 +08:00
|
|
|
|
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
2024-08-22 02:28:36 +08:00
|
|
|
|
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
|
|
|
|
|
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
2023-12-04 18:02:07 +08:00
|
|
|
|
|
2023-11-28 10:45:40 +08:00
|
|
|
|
/* 单线程处理冗余参数结构体 */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
struct SerailMarkDupArg {
|
|
|
|
|
|
int64_t taskSeq; // 任务序号
|
|
|
|
|
|
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
|
|
|
|
|
vector<BamWrap *> bams; // 存放待处理的bam read
|
|
|
|
|
|
vector<ReadEnds> pairs; // 成对的reads
|
|
|
|
|
|
vector<ReadEnds> frags; // 暂未找到配对的reads
|
2024-09-04 11:08:03 +08:00
|
|
|
|
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
|
|
|
|
|
|
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
|
|
|
|
|
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
|
|
|
|
|
|
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
|
2024-08-22 02:28:36 +08:00
|
|
|
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
|
|
|
|
|
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
2023-11-28 10:45:40 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
struct GlobalDataArg {
|
|
|
|
|
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
2023-12-04 18:02:07 +08:00
|
|
|
|
UnpairedPositionMap unpairedPosArr;
|
|
|
|
|
|
|
|
|
|
|
|
// 每个task对应一个vector
|
2024-08-29 16:40:52 +08:00
|
|
|
|
vector<vector<DupInfo>> dupIdxArr;
|
2023-12-04 18:02:07 +08:00
|
|
|
|
vector<vector<int64_t>> opticalDupIdxArr;
|
2024-09-04 11:08:03 +08:00
|
|
|
|
vector<vector<DupInfo>> repIdxArr;
|
2023-12-08 03:57:30 +08:00
|
|
|
|
|
|
|
|
|
|
// 用来存放后续计算的数据
|
2024-09-04 11:08:03 +08:00
|
|
|
|
vector<DPSet<DupInfo>> latterDupIdxArr;
|
|
|
|
|
|
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
|
|
|
|
|
|
vector<DPSet<DupInfo>> latterRepIdxArr;
|
|
|
|
|
|
vector<MDSet<int64_t>> latterNotDupIdxArr;
|
|
|
|
|
|
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
|
|
|
|
|
|
vector<MDSet<int64_t>> latterNotRepIdxArr;
|
2023-11-28 10:45:40 +08:00
|
|
|
|
};
|
|
|
|
|
|
|
2024-08-22 02:28:36 +08:00
|
|
|
|
// Run mark duplicate serially
|
|
|
|
|
|
// Serial mark-duplicates routine. Only the declaration is visible here: it takes no
// arguments and returns no value.
void serialMarkDups();
|