完成去除singleton相关的代码

This commit is contained in:
zzh 2024-11-21 23:30:13 +08:00
parent 153e05399d
commit 6de538670f
2 changed files with 30 additions and 82 deletions

4
run.sh
View File

@ -6,8 +6,8 @@
nthread=32 nthread=32
#nthread=64 #nthread=64
#nthread=128 #nthread=128
input=/home/zzh/data/bam/zy_normal.bam #input=/home/zzh/data/bam/zy_normal.bam
#input=/home/zzh/data/bam/zy_tumor.bam input=/home/zzh/data/bam/zy_tumor.bam
#input=/home/zzh/data/wgs_na12878.bam #input=/home/zzh/data/wgs_na12878.bam
#input=~/data/bam/100w.bam #input=~/data/bam/100w.bam
#input=~/data/bam/t100w.sam #input=~/data/bam/t100w.sam

View File

@ -58,9 +58,8 @@ static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
/* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ /* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx, static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *singletonIdx, MDSet<int64_t> *notDupIdx = nullptr, DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr, MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr ) {
MDSet<int64_t> *notSingletonIdx = nullptr) {
if (vpRe.size() < 2) { if (vpRe.size() < 2) {
return; return;
} }
@ -171,9 +170,8 @@ static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnd
/* 处理pairs */ /* 处理pairs */
static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx, static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *singletonIdx, MDSet<int64_t> *notDupIdx = nullptr, DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr, MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) {
MDSet<int64_t> *notSingletonIdx = nullptr) {
// return; // return;
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr; const ReadEnds *pReadEnd = nullptr;
@ -181,15 +179,14 @@ static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组 vpCache.push_back(&re); // 处理一个潜在的冗余组
else { else {
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx,
notRepIdx, notSingletonIdx); // 不一样 notRepIdx); // 不一样
vpCache.clear(); vpCache.clear();
vpCache.push_back(&re); vpCache.push_back(&re);
pReadEnd = &re; pReadEnd = &re;
} }
} }
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, notRepIdx, markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx);
notSingletonIdx);
} }
/* 处理frags */ /* 处理frags */
@ -279,54 +276,12 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
} }
} }
/* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &singletonIdx, MDSet<int64_t> &notDupIdx,
MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notSingletonIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) {
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx) {
lp.pairDupIdx.insert(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : opticalDupIdx) {
lp.pairOpticalDupIdx.insert(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : repIdx) {
lp.pairRepIdx.insert(idx);
p.pairRepIdx.erase(idx);
}
for (auto idx : singletonIdx) {
lp.pairSingletonIdx.insert(idx);
p.pairSingletonIdx.erase(idx);
}
for (auto idx : notDupIdx) {
lp.pairDupIdx.erase(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : notOpticalDupIdx) {
lp.pairOpticalDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : notRepIdx) {
lp.pairRepIdx.erase(idx);
p.pairRepIdx.erase(idx);
}
for (auto idx : notSingletonIdx) {
lp.pairSingletonIdx.erase(idx);
p.pairSingletonIdx.erase(idx);
}
}
// 用来分别处理dup和optical dup // 用来分别处理dup和optical dup
static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notSingletonIdx, DPSet<DupInfo> &latterDupIdx, DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx,
MDSet<int64_t> &latterOpticalDupIdx, DPSet<DupInfo> &latterRepIdx, DPSet<DupInfo> &latterRepIdx, MDSet<int64_t> &latterNotDupIdx,
MDSet<int64_t> &latterNotDupIdx, MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) {
MDSet<int64_t> &latterNotRepIdx) {
for (auto idx : dupIdx) latterDupIdx.insert(idx); for (auto idx : dupIdx) latterDupIdx.insert(idx);
for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx); for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx);
for (auto idx : repIdx) latterRepIdx.insert(idx); for (auto idx : repIdx) latterRepIdx.insert(idx);
@ -511,8 +466,7 @@ static void doMarkDup(PipelineArg &pipeArg) {
tm_arr[5].acc_end(); tm_arr[5].acc_end();
SortMarkData &smd = *(SortMarkData *)mdData.dataPtr; SortMarkData &smd = *(SortMarkData *)mdData.dataPtr;
// 先处理 pair // 先处理 pair
processPairs(smd.pairs, &mdData.pairDupIdx, &mdData.pairOpticalDupIdx, &mdData.pairRepIdx, processPairs(smd.pairs, &mdData.pairDupIdx, &mdData.pairOpticalDupIdx, &mdData.pairRepIdx);
nullptr);
// 再处理frag // 再处理frag
processFrags(smd.frags, &mdData.fragDupIdx); processFrags(smd.frags, &mdData.fragDupIdx);
} }
@ -533,9 +487,8 @@ static void refreshNotDupIdx(T1 &srcArr, T2 &delArr1, T2 &delArr2) {
} }
static void refreshMarkDupData(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static void refreshMarkDupData(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &singletonIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx, MarkDupData &lp, MarkDupData &p) {
MDSet<int64_t> &notSingletonIdx, MarkDupData &lp, MarkDupData &p) {
refreshDupIdx(dupIdx, lp.pairDupIdx, p.pairDupIdx); refreshDupIdx(dupIdx, lp.pairDupIdx, p.pairDupIdx);
refreshDupIdx(opticalDupIdx, lp.pairOpticalDupIdx, p.pairOpticalDupIdx); refreshDupIdx(opticalDupIdx, lp.pairOpticalDupIdx, p.pairOpticalDupIdx);
refreshDupIdx(repIdx, lp.pairRepIdx, p.pairRepIdx); refreshDupIdx(repIdx, lp.pairRepIdx, p.pairRepIdx);
@ -558,11 +511,9 @@ static void doIntersect(PipelineArg &pipeArg) {
DPSet<DupInfo> dupIdx; DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx; MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx; DPSet<DupInfo> repIdx;
MDSet<int64_t> singletonIdx;
MDSet<int64_t> notOpticalDupIdx; MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notDupIdx; MDSet<int64_t> notDupIdx;
MDSet<int64_t> notRepIdx; MDSet<int64_t> notRepIdx;
MDSet<int64_t> notSingletonIdx;
// 先处理重叠的frags // 先处理重叠的frags
getIntersectData(lpSM.frags, pSM.frags, &reArr); getIntersectData(lpSM.frags, pSM.frags, &reArr);
@ -576,10 +527,8 @@ static void doIntersect(PipelineArg &pipeArg) {
notDupIdx.clear(); notDupIdx.clear();
getIntersectData(lpSM.pairs, pSM.pairs, &reArr, true); getIntersectData(lpSM.pairs, pSM.pairs, &reArr, true);
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &singletonIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx, processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx);
&notSingletonIdx); refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, lp, p);
refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, notRepIdx,
notSingletonIdx, lp, p);
// 处理之前未匹配的部分 // 处理之前未匹配的部分
map<CalcKey, int64_t> recalcPos; map<CalcKey, int64_t> recalcPos;
@ -727,8 +676,8 @@ static void doIntersect(PipelineArg &pipeArg) {
pairArrP = &g.unpairedPosArr[posKey].pairArr; pairArrP = &g.unpairedPosArr[posKey].pairArr;
else else
pairArrP = &lpSM.unpairedPosArr[posKey].pairArr; pairArrP = &lpSM.unpairedPosArr[posKey].pairArr;
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.singletonIdx, &t.notDupIdx, processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx,
&t.notOpticalDupIdx, &t.notRepIdx, &t.notSingletonIdx); &t.notRepIdx);
if (taskSeq < lp.taskSeq) if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey); g.unpairedPosArr.erase(posKey);
} }
@ -746,15 +695,15 @@ static void doIntersect(PipelineArg &pipeArg) {
auto &t = e.second; auto &t = e.second;
if (taskSeq < lp.taskSeq) { if (taskSeq < lp.taskSeq) {
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
t.notSingletonIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterRepIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]); g.latterNotRepIdxArr[taskSeq]);
} else if (taskSeq == lp.taskSeq) { } else if (taskSeq == lp.taskSeq) {
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx, refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx,
t.notRepIdx, t.notSingletonIdx, lp, p); t.notRepIdx, lp, p);
} else { } else {
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx, refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx,
t.notRepIdx, t.notSingletonIdx, p, lp); // 把结果放到p中 t.notRepIdx, p, lp); // 把结果放到p中
} }
} }
@ -1015,8 +964,7 @@ static void mergeAllTask(PipelineArg &pipeArg) {
if (arr.size() > 1) { if (arr.size() > 1) {
std::sort(arr.begin(), arr.end()); std::sort(arr.begin(), arr.end());
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.singletonIdx, &t.notDupIdx, processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
&t.notOpticalDupIdx, &t.notRepIdx, &t.notSingletonIdx);
} }
} }
// 更新结果 // 更新结果
@ -1026,9 +974,9 @@ static void mergeAllTask(PipelineArg &pipeArg) {
auto taskSeq = e.first; auto taskSeq = e.first;
auto &t = e.second; auto &t = e.second;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
t.notSingletonIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterRepIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]); g.latterNotRepIdxArr[taskSeq]);
} }
g.unpairedPosArr.clear(); g.unpairedPosArr.clear();