完成去除singleton相关的代码

This commit is contained in:
zzh 2024-11-21 23:30:13 +08:00
parent 153e05399d
commit 6de538670f
2 changed files with 30 additions and 82 deletions

4
run.sh
View File

@ -6,8 +6,8 @@
nthread=32
#nthread=64
#nthread=128
input=/home/zzh/data/bam/zy_normal.bam
#input=/home/zzh/data/bam/zy_tumor.bam
#input=/home/zzh/data/bam/zy_normal.bam
input=/home/zzh/data/bam/zy_tumor.bam
#input=/home/zzh/data/wgs_na12878.bam
#input=~/data/bam/100w.bam
#input=~/data/bam/t100w.sam

View File

@ -58,9 +58,8 @@ static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
/* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *singletonIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr,
MDSet<int64_t> *notSingletonIdx = nullptr) {
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr ) {
if (vpRe.size() < 2) {
return;
}
@ -171,9 +170,8 @@ static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnd
/* 处理pairs */
static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *singletonIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr,
MDSet<int64_t> *notSingletonIdx = nullptr) {
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) {
// return;
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr;
@ -181,15 +179,14 @@ static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else {
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx,
notRepIdx, notSingletonIdx); // 不一样
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx,
notRepIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, notRepIdx,
notSingletonIdx);
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx);
}
/* 处理frags */
@ -279,54 +276,12 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
}
}
/* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &singletonIdx, MDSet<int64_t> &notDupIdx,
MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notSingletonIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) {
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx) {
lp.pairDupIdx.insert(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : opticalDupIdx) {
lp.pairOpticalDupIdx.insert(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : repIdx) {
lp.pairRepIdx.insert(idx);
p.pairRepIdx.erase(idx);
}
for (auto idx : singletonIdx) {
lp.pairSingletonIdx.insert(idx);
p.pairSingletonIdx.erase(idx);
}
for (auto idx : notDupIdx) {
lp.pairDupIdx.erase(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : notOpticalDupIdx) {
lp.pairOpticalDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : notRepIdx) {
lp.pairRepIdx.erase(idx);
p.pairRepIdx.erase(idx);
}
for (auto idx : notSingletonIdx) {
lp.pairSingletonIdx.erase(idx);
p.pairSingletonIdx.erase(idx);
}
}
// 用来分别处理dup和optical dup
static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notSingletonIdx, DPSet<DupInfo> &latterDupIdx,
MDSet<int64_t> &latterOpticalDupIdx, DPSet<DupInfo> &latterRepIdx,
MDSet<int64_t> &latterNotDupIdx, MDSet<int64_t> &latterNotOpticalDupIdx,
MDSet<int64_t> &latterNotRepIdx) {
DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx,
DPSet<DupInfo> &latterRepIdx, MDSet<int64_t> &latterNotDupIdx,
MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) {
for (auto idx : dupIdx) latterDupIdx.insert(idx);
for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx);
for (auto idx : repIdx) latterRepIdx.insert(idx);
@ -511,8 +466,7 @@ static void doMarkDup(PipelineArg &pipeArg) {
tm_arr[5].acc_end();
SortMarkData &smd = *(SortMarkData *)mdData.dataPtr;
// 先处理 pair
processPairs(smd.pairs, &mdData.pairDupIdx, &mdData.pairOpticalDupIdx, &mdData.pairRepIdx,
nullptr);
processPairs(smd.pairs, &mdData.pairDupIdx, &mdData.pairOpticalDupIdx, &mdData.pairRepIdx);
// 再处理frag
processFrags(smd.frags, &mdData.fragDupIdx);
}
@ -533,9 +487,8 @@ static void refreshNotDupIdx(T1 &srcArr, T2 &delArr1, T2 &delArr2) {
}
static void refreshMarkDupData(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &singletonIdx, MDSet<int64_t> &notDupIdx,
MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notSingletonIdx, MarkDupData &lp, MarkDupData &p) {
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MarkDupData &lp, MarkDupData &p) {
refreshDupIdx(dupIdx, lp.pairDupIdx, p.pairDupIdx);
refreshDupIdx(opticalDupIdx, lp.pairOpticalDupIdx, p.pairOpticalDupIdx);
refreshDupIdx(repIdx, lp.pairRepIdx, p.pairRepIdx);
@ -558,11 +511,9 @@ static void doIntersect(PipelineArg &pipeArg) {
DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx;
MDSet<int64_t> singletonIdx;
MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notDupIdx;
MDSet<int64_t> notRepIdx;
MDSet<int64_t> notSingletonIdx;
// 先处理重叠的frags
getIntersectData(lpSM.frags, pSM.frags, &reArr);
@ -576,10 +527,8 @@ static void doIntersect(PipelineArg &pipeArg) {
notDupIdx.clear();
getIntersectData(lpSM.pairs, pSM.pairs, &reArr, true);
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &singletonIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx,
&notSingletonIdx);
refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, notRepIdx,
notSingletonIdx, lp, p);
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx);
refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, lp, p);
// 处理之前未匹配的部分
map<CalcKey, int64_t> recalcPos;
@ -727,8 +676,8 @@ static void doIntersect(PipelineArg &pipeArg) {
pairArrP = &g.unpairedPosArr[posKey].pairArr;
else
pairArrP = &lpSM.unpairedPosArr[posKey].pairArr;
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.singletonIdx, &t.notDupIdx,
&t.notOpticalDupIdx, &t.notRepIdx, &t.notSingletonIdx);
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx,
&t.notRepIdx);
if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey);
}
@ -746,15 +695,15 @@ static void doIntersect(PipelineArg &pipeArg) {
auto &t = e.second;
if (taskSeq < lp.taskSeq) {
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
t.notSingletonIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq],
g.latterRepIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq],
g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]);
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]);
} else if (taskSeq == lp.taskSeq) {
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx,
t.notRepIdx, t.notSingletonIdx, lp, p);
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx,
t.notRepIdx, lp, p);
} else {
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx,
t.notRepIdx, t.notSingletonIdx, p, lp); // 把结果放到p中
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx,
t.notRepIdx, p, lp); // 把结果放到p中
}
}
@ -1015,8 +964,7 @@ static void mergeAllTask(PipelineArg &pipeArg) {
if (arr.size() > 1) {
std::sort(arr.begin(), arr.end());
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.singletonIdx, &t.notDupIdx,
&t.notOpticalDupIdx, &t.notRepIdx, &t.notSingletonIdx);
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
}
}
// 更新结果
@ -1026,9 +974,9 @@ static void mergeAllTask(PipelineArg &pipeArg) {
auto taskSeq = e.first;
auto &t = e.second;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
t.notSingletonIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq],
g.latterRepIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq],
g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]);
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]);
}
g.unpairedPosArr.clear();