148 lines
5.1 KiB
C++
148 lines
5.1 KiB
C++
#pragma once
|
||
|
||
#include <common/hts/bam_buf.h>
#include <robin-map/include/tsl/robin_map.h>
#include <robin-map/include/tsl/robin_set.h>
#include <sam/utils/read_ends.h>

#include <cstdint>
#include <functional>
#include <set>
#include <string>
#include <unordered_set>
#include <vector>

using std::set;
using std::string;
using std::unordered_set;
using std::vector;
|
||
|
||
/* 存放未匹配readend相同位点的所有readend */
|
||
struct UnpairedREInfo {
|
||
int64_t taskSeq;
|
||
ReadEnds unpairedRE;
|
||
};
|
||
|
||
/*
 * Key identifying one complete calculation point for a read pair: the
 * alignment position of read 1 together with the alignment position of read 2.
 */
struct CalcKey {
    int64_t read1Pos;
    int64_t read2Pos;

    /*
     * Orders keys by read1Pos, breaking ties with read2Pos.
     *
     * The 64-bit fields are compared directly. Narrowing their difference to
     * int would discard the high bits, so keys that are a multiple of 2^32
     * apart would compare as equivalent and large gaps could flip sign.
     */
    bool operator<(const CalcKey &o) const {
        if (read1Pos != o.read1Pos)
            return read1Pos < o.read1Pos;
        return read2Pos < o.read2Pos;
    }

    /* Two keys are equal when both positions match. */
    bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }

    /*
     * Hash of *this* key (XOR of the std::hash of both positions).
     * The argument is ignored; the signature is kept as-is for existing callers.
     */
    std::size_t operator()(const CalcKey &o) const {
        (void)o;
        return std::hash<int64_t>()(read1Pos) ^ std::hash<int64_t>()(read2Pos);
    }
};
|
||
|
||
struct CalcKeyHash {
|
||
std::size_t operator()(const CalcKey &o) const {
|
||
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
|
||
}
|
||
};
|
||
|
||
/*
 * Records information tied to the index of a duplicate read.
 *
 * Ordering (operator<, operator>) and the conversion to int64_t look at idx
 * only; repIdx and dupSet are carried along as payload.
 */
struct DupInfo {
    int64_t idx;
    int64_t repIdx = 0;  // index of the representative (non-duplicate) read of this duplicate batch
    int16_t dupSet = 0;  // dup set size

    /* Default: idx = -1, repIdx = 0, dupSet = 0. */
    DupInfo() : DupInfo(-1, 0, 0) {}

    /* Intentionally non-explicit so a plain index converts to a DupInfo. */
    DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}

    /*
     * dupSet_ arrives as int but is stored in 16 bits. Values outside the
     * int16_t range are saturated instead of being silently wrapped (which
     * could turn a large set size into a negative number).
     */
    DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_)
        : idx(idx_),
          repIdx(repIdx_),
          dupSet(static_cast<int16_t>(dupSet_ > INT16_MAX ? INT16_MAX
                                                          : (dupSet_ < INT16_MIN ? INT16_MIN : dupSet_))) {}

    bool operator<(const DupInfo &o) const {
        return idx < o.idx;
    }

    bool operator>(const DupInfo &o) const {
        return idx > o.idx;
    }

    /* Implicit conversion yields idx. */
    operator int64_t() const {
        return idx;
    }
};
|
||
|
||
struct DupInfoHash {
|
||
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
|
||
};
|
||
|
||
struct DupInfoEqual {
|
||
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
|
||
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
|
||
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
|
||
};
|
||
|
||
template<typename T>
|
||
// using MDSet = set<T>;
|
||
// using MDSet = unordered_set<T>;
|
||
using MDSet = tsl::robin_set<T>;
|
||
|
||
template <typename T>
|
||
// using DPSet = set<T>;
|
||
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
|
||
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
|
||
|
||
template <typename T>
|
||
//using CalcSet = set<T>;
|
||
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
|
||
|
||
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
||
struct TaskSeqDupInfo {
|
||
DPSet<DupInfo> dupIdx;
|
||
MDSet<int64_t> opticalDupIdx;
|
||
DPSet<DupInfo> repIdx;
|
||
MDSet<int64_t> notDupIdx;
|
||
MDSet<int64_t> notOpticalDupIdx;
|
||
MDSet<int64_t> notRepIdx;
|
||
};
|
||
|
||
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
||
struct UnpairedPosInfo {
|
||
int unpairedNum = 0;
|
||
int64_t taskSeq;
|
||
vector<ReadEnds> pairArr;
|
||
MDSet<string> readNameSet;
|
||
};
|
||
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
||
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
|
||
|
||
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
||
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||
|
||
/* 单线程处理冗余参数结构体 */
|
||
struct SerailMarkDupArg {
|
||
int64_t taskSeq; // 任务序号
|
||
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
|
||
vector<BamWrap *> bams; // 存放待处理的bam read
|
||
vector<ReadEnds> pairs; // 成对的reads
|
||
vector<ReadEnds> frags; // 暂未找到配对的reads
|
||
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
|
||
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
||
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
|
||
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
|
||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd,为了避免重复存储
|
||
};
|
||
|
||
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
||
struct GlobalDataArg {
|
||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||
UnpairedPositionMap unpairedPosArr;
|
||
|
||
// 每个task对应一个vector
|
||
vector<vector<DupInfo>> dupIdxArr;
|
||
vector<vector<int64_t>> opticalDupIdxArr;
|
||
vector<vector<DupInfo>> repIdxArr;
|
||
|
||
// 用来存放后续计算的数据
|
||
vector<DPSet<DupInfo>> latterDupIdxArr;
|
||
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
|
||
vector<DPSet<DupInfo>> latterRepIdxArr;
|
||
vector<MDSet<int64_t>> latterNotDupIdxArr;
|
||
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
|
||
vector<MDSet<int64_t>> latterNotRepIdxArr;
|
||
};
|
||
|
||
/* Runs mark-duplicate serially. */
void serialMarkDups();