picard_cpp/src/sam/markdups/serial_md.h

148 lines
5.1 KiB
C
Raw Normal View History

#pragma once
#include <common/hts/bam_buf.h>
#include <robin-map/include/tsl/robin_map.h>
#include <robin-map/include/tsl/robin_set.h>
#include <sam/utils/read_ends.h>
#include <set>
#include <string>
#include <vector>
#include <unordered_set>
using std::set;
using std::unordered_set;
using std::string;
using std::vector;
/* 存放未匹配readend相同位点的所有readend */
struct UnpairedREInfo {
int64_t taskSeq;
ReadEnds unpairedRE;
};
/* 对于一个pair数据一个完整的计算点包含read1的比对位置和read2的比对位置 */
struct CalcKey {
int64_t read1Pos;
int64_t read2Pos;
bool operator<(const CalcKey &o) const {
int comp = (int)(read1Pos - o.read1Pos);
if (comp == 0)
comp = (int)(read2Pos - o.read2Pos);
return comp < 0;
}
bool operator==(const CalcKey &o) const { return read1Pos == o.read1Pos && read2Pos == o.read2Pos; }
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(read1Pos) ^ std::hash<int64_t>()(read2Pos);
}
};
struct CalcKeyHash {
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
}
};
2024-08-29 16:40:52 +08:00
/* 用来记录冗余索引相关的信息 */
struct DupInfo {
int64_t idx;
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
int16_t dupSet = 0; // dup set size
DupInfo() : DupInfo(-1, 0, 0) { }
2024-08-29 16:40:52 +08:00
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) { }
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
bool operator<(const DupInfo &o) const {
return idx < o.idx;
}
bool operator>(const DupInfo &o) const {
return idx > o.idx;
}
operator int64_t() const {
return idx;
}
};
struct DupInfoHash {
std::size_t operator()(const DupInfo &o) const { return std::hash<int64_t>()(o.idx); }
};
struct DupInfoEqual {
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
};
template<typename T>
// using MDSet = set<T>;
// using MDSet = unordered_set<T>;
using MDSet = tsl::robin_set<T>;
template <typename T>
// using DPSet = set<T>;
// using DPSet = unordered_set<T, DupInfoHash, DupInfoEqual>;
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
template <typename T>
//using CalcSet = set<T>;
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
/* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */
struct TaskSeqDupInfo {
DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx;
MDSet<int64_t> notDupIdx;
MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notRepIdx;
};
/* 保存有未匹配pair位点的信息包括read end数组和有几个未匹配的read end */
struct UnpairedPosInfo {
int unpairedNum = 0;
int64_t taskSeq;
vector<ReadEnds> pairArr;
MDSet<string> readNameSet;
};
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
/* 单线程处理冗余参数结构体 */
struct SerailMarkDupArg {
int64_t taskSeq; // 任务序号
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs; // 成对的reads
vector<ReadEnds> frags; // 暂未找到配对的reads
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
};
/* 全局保留的数据因为有些paired数据比对到了不同的染色体相距甚远 */
struct GlobalDataArg {
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr;
// 每个task对应一个vector
2024-08-29 16:40:52 +08:00
vector<vector<DupInfo>> dupIdxArr;
vector<vector<int64_t>> opticalDupIdxArr;
vector<vector<DupInfo>> repIdxArr;
// 用来存放后续计算的数据
vector<DPSet<DupInfo>> latterDupIdxArr;
vector<MDSet<int64_t>> latterOpticalDupIdxArr;
vector<DPSet<DupInfo>> latterRepIdxArr;
vector<MDSet<int64_t>> latterNotDupIdxArr;
vector<MDSet<int64_t>> latterNotOpticalDupIdxArr;
vector<MDSet<int64_t>> latterNotRepIdxArr;
};
// 串行运行mark duplicate
void serialMarkDups();