完美解决了所有问题,基本都是由排序带来的,排序过程涉及的变量太多,导致范围跟预想的不一致,现在都解决了

This commit is contained in:
zzh 2025-03-04 00:40:57 +08:00
parent 64d23b3f2d
commit 3606562de9
6 changed files with 187 additions and 338 deletions

View File

@ -164,6 +164,7 @@ int MarkDuplicates() {
spdlog::info("{} optical reads found", opticalIdxQue.Size()); spdlog::info("{} optical reads found", opticalIdxQue.Size());
spdlog::info("{} represent reads found", repIdxQue.Size()); spdlog::info("{} represent reads found", repIdxQue.Size());
// spdlog::info("real dup size: {}", dupIdxQue.RealSize()); // spdlog::info("real dup size: {}", dupIdxQue.RealSize());
// spdlog::info("real optical dup size: {}", opticalIdxQue.RealSize());
return 0; return 0;

View File

@ -64,7 +64,7 @@ struct MarkDupsArg {
int NUM_THREADS = 1; int NUM_THREADS = 1;
size_t MAX_MEM = ((size_t)1) << 28; // << 31 // 最小2G size_t MAX_MEM = ((size_t)1) << 30; // // 最小1G
bool DUPLEX_IO = true; // 同时读写 bool DUPLEX_IO = true; // 同时读写

View File

@ -18,6 +18,7 @@
#include "util/yarn.h" #include "util/yarn.h"
using std::vector; using std::vector;
using namespace std;
namespace nsgv { namespace nsgv {
@ -30,40 +31,6 @@ extern DupResult gDupRes;
extern PipelineArg gPipe; extern PipelineArg gPipe;
}; // namespace nsgv }; // namespace nsgv
/* 排序 */
static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
size_t blockSize = 64 * 1024;
if (arr.size() < blockSize) {
std::sort(arr.begin(), arr.end());
return;
}
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
size_t crossNum = 1024;
size_t start, end, i, left, right;
std::sort(arr.begin(), arr.begin() + blockSize);
for (i = 1; i < blockNum; ++i) {
start = i * blockSize;
end = min(start + blockSize, arr.size());
std::sort(arr.begin() + start, arr.begin() + end);
left = crossNum;
while (!(arr[start - left] < arr[start])) {
left = left << 1;
if (left >= blockSize) {
std::sort(arr.begin(), arr.end()); // 退化到普通排序
return;
}
}
right = min(crossNum, end - start - 1);
while (!(arr[start - 1] < arr[start + right])) {
right = min(right << 1, end - start - 1);
if (right == end - start - 1)
break;
}
std::sort(arr.begin() + start - left, arr.begin() + start + right);
}
}
/* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ /* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx, static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr, DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
@ -71,7 +38,6 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
if (vpRe.size() < 2) { if (vpRe.size() < 2) {
return; return;
} }
int maxScore = 0; int maxScore = 0;
const ReadEnds *pBest = nullptr; const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/ /** All read ends should have orientation FF, FR, RF, or RR **/
@ -110,14 +76,6 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
} }
for (auto pe : vpRe) { // 对非best read标记冗余 for (auto pe : vpRe) { // 对非best read标记冗余
if (pe != pBest) { // 非best if (pe != pBest) { // 非best
// if (1555341360 == pe->read1IndexInFile) {
// for (auto p : vpRe) {
// cout << "zzh find: " << p->read1IndexInFile << '\t' << p->read2IndexInFile << endl;
// }
// cout << "split" << endl;
// }
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1 dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile) if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2, dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
@ -177,7 +135,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
/* 找到与readend pos相等的所有readend */ /* 找到与readend pos相等的所有readend */
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) { static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点 auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::CorLittleThan); // 只比对位点
dst->insert(dst->end(), range.first, range.second); dst->insert(dst->end(), range.first, range.second);
} }
@ -232,19 +190,6 @@ static void processFrags(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
} }
} }
/* 单线程markdup (第二步)*/
static void markdups(MarkDupDataArg *arg) {
auto &p = *arg;
p.fragDupIdx.clear();
p.pairDupIdx.clear();
p.pairOpticalDupIdx.clear();
p.pairRepIdx.clear();
/* generateDuplicateIndexes计算冗余read在所有read中的位置索引 */
// 先处理 pair
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx, &p.pairRepIdx);
// 再处理frag
processFrags(p.frags, &p.fragDupIdx);
}
/* 获取交叉部分的数据 */ /* 获取交叉部分的数据 */
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst, static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
@ -272,7 +217,10 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
} }
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan); dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
std::sort(dst->begin(), dst->end()); if (isPairCmp)
std::sort(dst->begin(), dst->end(), ReadEnds::PairLittleThan);
else
std::sort(dst->begin(), dst->end(), ReadEnds::FragLittleThan);
} }
/* 将frags重叠部分的dup idx放进数据中 */ /* 将frags重叠部分的dup idx放进数据中 */
@ -290,59 +238,7 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
} }
} }
/* 最后合并数据并排序 */
template <typename DupContainer, typename T>
static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupIdx, vector<T> &dupArr,
vector<T> &cacheDupIdx1, vector<T> &midArr1) {
// midArr.resize(0);
// cacheDupIdx.resize(0);
vector<T> cacheDupIdx;
vector<T> midArr;
cacheDupIdx.insert(cacheDupIdx.end(), dupIdx.begin(), dupIdx.end());
std::sort(cacheDupIdx.begin(), cacheDupIdx.end());
auto ai = dupArr.begin();
auto ae = dupArr.end();
auto bi = cacheDupIdx.begin();
auto be = cacheDupIdx.end();
T val = 0;
while (ai != ae && bi != be) {
if (*ai < *bi) {
val = *ai++;
} else if (*bi < *ai) {
val = *bi++;
} else {
val = *bi++; // 相等的时候取后面的作为结果
ai++;
}
if (notDupIdx.find(val) == notDupIdx.end()) {
midArr.push_back(val);
}
}
while (ai != ae) {
val = *ai++;
if (notDupIdx.find(val) == notDupIdx.end()) {
midArr.push_back(val);
}
}
while (bi != be) {
val = *bi++;
if (notDupIdx.find(val) == notDupIdx.end()) {
midArr.push_back(val);
}
}
// spdlog::info("midArr & dupArr size: {}-{}", midArr.size(), dupArr.size());
// dupArr = midArr;
dupArr.clear();
dupArr.assign(midArr.begin(), midArr.end());
spdlog::info("midArr & dupArr size: {}-{}", midArr.size(), dupArr.size());
}
// for step 2 generate read ends // for step 2 generate read ends
// multi-thread generate read ends // multi-thread generate read ends
static void mtGenerateReadEnds(void *data, long idx, int tid) { static void mtGenerateReadEnds(void *data, long idx, int tid) {
auto &p = *(PipelineArg *)data; auto &p = *(PipelineArg *)data;
@ -369,9 +265,6 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) {
if (bw->GetReadUnmappedFlag()) { if (bw->GetReadUnmappedFlag()) {
if (bw->b->core.tid == -1) if (bw->b->core.tid == -1)
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
//if (p.genREOrder >= 3719) {
// cout << "inner core.tid check\t" << bw->b->core.tid << '\t' << i << "\t" << start_id << '\t' << end_id << endl;
//}
break; break;
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对 } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
ReadEnds fragEnd; ReadEnds fragEnd;
@ -379,9 +272,6 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) {
frags.push_back(fragEnd); // 添加进frag集合 frags.push_back(fragEnd); // 添加进frag集合
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了 if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
string key = bw->query_name(); string key = bw->query_name();
// if ("ERR194147.780864524" == key) {
// cout << "zzh find: " << key << '\t' << fragEnd.read1IndexInFile << '\t' << fragEnd.read2IndexInFile << endl;
// }
if (unpairedDic.find(key) == unpairedDic.end()) { if (unpairedDic.find(key) == unpairedDic.end()) {
unpairedDic[key] = {taskSeq, fragEnd}; unpairedDic[key] = {taskSeq, fragEnd};
} else { // 找到了pairend } else { // 找到了pairend
@ -396,25 +286,15 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) {
PROF_END(tprof[TP_gen][tid], gen); PROF_END(tprof[TP_gen][tid], gen);
PROF_START(sort_frag); PROF_START(sort_frag);
// sortReadEndsArr(frags); sort(frags.begin(), frags.end(), ReadEnds::FragLittleThan);
sort(frags.begin(), frags.end());
PROF_END(tprof[TP_sort_frag][tid], sort_frag); PROF_END(tprof[TP_sort_frag][tid], sort_frag);
PROF_START(sort_pair); PROF_START(sort_pair);
sort(pairs.begin(), pairs.end()); sort(pairs.begin(), pairs.end(), ReadEnds::PairLittleThan);
PROF_END(tprof[TP_sort_pair][tid], sort_pair); PROF_END(tprof[TP_sort_pair][tid], sort_pair);
// if (p.genREOrder >= 3719) {
// cout << "zzh genRE size: " << p.genREOrder << '\t' << tid << '\t' << frags.size() << '\t' << pairs.size()
// << '\t' << start_id << '\t' << end_id << '\t' << bams.size() << endl;
// }
} }
static void doGenRE(PipelineArg &pipeArg) { static void doGenRE(PipelineArg &pipeArg) {
// return;
//if (pipeArg.genREOrder < 895) return;
//if (pipeArg.genREOrder < 440) return;
GenREData &genREData = pipeArg.genREData[pipeArg.genREOrder % pipeArg.GENBUFNUM]; GenREData &genREData = pipeArg.genREData[pipeArg.genREOrder % pipeArg.GENBUFNUM];
ReadData &readData = pipeArg.readData; ReadData &readData = pipeArg.readData;
@ -434,36 +314,17 @@ static void doGenRE(PipelineArg &pipeArg) {
for (auto &val : genREData.unpairedDicArr[i]) { for (auto &val : genREData.unpairedDicArr[i]) {
const string &key = val.first; const string &key = val.first;
const ReadEnds &fragEnd = val.second.unpairedRE; const ReadEnds &fragEnd = val.second.unpairedRE;
// if ("ERR194147.783448001" == key) {
// cout << "zzh find in doGen: " << key << '\t' << fragEnd.read1IndexInFile << '\t' << fragEnd.read2IndexInFile
// << endl;
// }
if (genREData.unpairedDic.find(key) == genREData.unpairedDic.end()) { if (genREData.unpairedDic.find(key) == genREData.unpairedDic.end()) {
genREData.unpairedDic[key] = {readData.taskSeq, fragEnd}; genREData.unpairedDic[key] = {readData.taskSeq, fragEnd};
// if (fragEnd.read1IndexInFile == 1524046492 || fragEnd.read2IndexInFile == 1524046492) {
// cout << "zzh not find paired: " << key << "\t" << fragEnd.read1IndexInFile << '\t'
// << fragEnd.read2IndexInFile << endl;
// }
} else { // 找到了pairend } else { // 找到了pairend
auto &pairedEnds = genREData.unpairedDic.at(key).unpairedRE; auto &pairedEnds = genREData.unpairedDic.at(key).unpairedRE;
modifyPairedEnds(fragEnd, &pairedEnds); modifyPairedEnds(fragEnd, &pairedEnds);
pairs.push_back(pairedEnds); pairs.push_back(pairedEnds);
// if (pairedEnds.read1IndexInFile == 1524046492 || pairedEnds.read2IndexInFile == 1524046492) {
// cout << "zzh find paired: " << key << '\t' << pairedEnds.read1IndexInFile << '\t'
// << pairedEnds.read2IndexInFile << "\t" << fragEnd.read1IndexInFile << '\t'
// << fragEnd.read2IndexInFile << '\t' << endl;
// }
genREData.unpairedDic.erase(key); // 删除找到的pairend genREData.unpairedDic.erase(key); // 删除找到的pairend
} }
} }
} }
sort(pairs.begin(), pairs.end()); sort(pairs.begin(), pairs.end(), ReadEnds::PairLittleThan);
// if (pipeArg.genREOrder >= 3719) {
// cout << "zzh genRE size: " << pipeArg.genREOrder << '\t' << pairs.size() << '\t' << genREData.unpairedDic.size()
// << '\t' << readData.bams.size() << '\t' << testNum << endl;
// }
} }
// end for step 2 generate read ends // end for step 2 generate read ends
@ -479,37 +340,26 @@ static void doSort(PipelineArg &pipeArg) {
smd.pairs.clear(); smd.pairs.clear();
smd.frags.clear(); smd.frags.clear();
const ReadEnds *pRE; const ReadEnds *pRE;
ReadEndsHeap pairsHeap, fragsHeap; ReadEndsHeap<REFragGreaterThan> fragsHeap;
ReadEndsHeap<REPairGreaterThan> pairsHeap;
PROF_START(sort_pair); PROF_START(sort_pair);
pairsHeap.Init(&genREData.pairsArr); pairsHeap.Init(&genREData.pairsArr);
// if (pipeArg.sortOrder >= 3719) {
// cout << "zzh sort pair size: " << pairsHeap.Size() << endl;
// }
while ((pRE = pairsHeap.Pop()) != nullptr) { while ((pRE = pairsHeap.Pop()) != nullptr) {
smd.pairs.push_back(*pRE); smd.pairs.push_back(*pRE);
} }
PROF_END(gprof[GP_sort_pair], sort_pair); PROF_END(gprof[GP_sort_pair], sort_pair);
PROF_START(sort_frag); PROF_START(sort_frag);
fragsHeap.Init(&genREData.fragsArr); fragsHeap.Init(&genREData.fragsArr);
// if (pipeArg.sortOrder >= 3719) {
// cout << "zzh sort frag size: " << fragsHeap.Size() << endl;
// }
while ((pRE = fragsHeap.Pop()) != nullptr) { while ((pRE = fragsHeap.Pop()) != nullptr) {
smd.frags.push_back(*pRE); smd.frags.push_back(*pRE);
} }
PROF_END(gprof[GP_sort_frag], sort_frag); PROF_END(gprof[GP_sort_frag], sort_frag);
// if (pipeArg.sortOrder >= 929) {
// cout << "zzh sort size: " << pipeArg.sortOrder << '\t' << smd.pairs.size() << '\t' << smd.frags.size() << '\t'
// << genREData.pairsArr.size() << '\t' << genREData.fragsArr.size() << endl;
// }
} }
// for step-4 sort // for step-4 sort
static void doMarkDup(PipelineArg &pipeArg) { static void doMarkDup(PipelineArg &pipeArg) {
// return;
MarkDupData &mdData = pipeArg.markDupData[pipeArg.markDupOrder % pipeArg.MARKBUFNUM]; MarkDupData &mdData = pipeArg.markDupData[pipeArg.markDupOrder % pipeArg.MARKBUFNUM];
SortData &sortData = pipeArg.sortData[pipeArg.markDupOrder % pipeArg.SORTBUFNUM]; SortData &sortData = pipeArg.sortData[pipeArg.markDupOrder % pipeArg.SORTBUFNUM];
mdData.taskSeq = pipeArg.markDupOrder; mdData.taskSeq = pipeArg.markDupOrder;
cout << "zzh markdup: " << mdData.taskSeq << '\t' << pipeArg.markDupOrder << '\t' << pipeArg.intersectOrder << endl;
mdData.clear(); mdData.clear();
auto tmpPtr = mdData.dataPtr; auto tmpPtr = mdData.dataPtr;
@ -525,10 +375,6 @@ static void doMarkDup(PipelineArg &pipeArg) {
PROF_START(markdup_frag); PROF_START(markdup_frag);
processFrags(smd.frags, &mdData.fragDupIdx); processFrags(smd.frags, &mdData.fragDupIdx);
PROF_END(gprof[GP_markdup_frag], markdup_frag); PROF_END(gprof[GP_markdup_frag], markdup_frag);
// if (mdData.taskSeq >= 929) {
// cout << "zzh markdup size: " << mdData.taskSeq << '\t' << smd.pairs.size() << '\t' << smd.frags.size() << '\t'
// << smd.unpairedDic.size() << '\t' << mdData.pairDupIdx.size() << '\t' << mdData.fragDupIdx.size() << endl;
// }
} }
template <typename T> template <typename T>
@ -599,7 +445,7 @@ static void processIntersectFragPairs(MarkDupData &lp, MarkDupData &cp) {
getIntersectData(lsm.pairs, csm.pairs, &reArr, true); getIntersectData(lsm.pairs, csm.pairs, &reArr, true);
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx); processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx);
refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, lp, cp); refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, cp, lp); // 放在cp里因为后面global里可能有相同的dup防止多次出现
} }
// 在相邻的数据块之间寻找未匹配的readends, 找到匹配的放到lp里 // 在相邻的数据块之间寻找未匹配的readends, 找到匹配的放到lp里
@ -607,15 +453,6 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) {
auto &interPairedData = lp.ckeyReadEndsMap; auto &interPairedData = lp.ckeyReadEndsMap;
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
SortMarkData &csm = *(SortMarkData *)cp.dataPtr; SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
//if (lp.taskSeq >= 1856)
// return; //
//if (lp.taskSeq >= 1800) {
// cout << "zzh find in findUnpairedInDatas: " << lp.taskSeq << '\t' << cp.taskSeq << '\t'
// << lsm.unpairedDic.size() << '\t' << csm.unpairedDic.size() << '\t' << lp.ckeyReadEndsMap.size() << '\t'
// << cp.ckeyReadEndsMap.size() << endl;
//}
for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end(); ) { // 遍历上一个任务中的每个未匹配的read for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end(); ) { // 遍历上一个任务中的每个未匹配的read
auto &lastUnpair = *itr; auto &lastUnpair = *itr;
auto &readName = lastUnpair.first; auto &readName = lastUnpair.first;
@ -626,22 +463,8 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) {
auto &curRE = curUnpairInfo.unpairedRE; auto &curRE = curUnpairInfo.unpairedRE;
modifyPairedEnds(curRE, &lastRE); // lastRE当做找到匹配后完整的ReadEnds modifyPairedEnds(curRE, &lastRE); // lastRE当做找到匹配后完整的ReadEnds
CalcKey ck(lastRE); CalcKey ck(lastRE);
// if (lp.taskSeq == 1856) {
// cout << "zzh ck: " << ck.Read1Pos() << ' ' << ck.Read2Pos()
// << "\t find: " << (interPairedData.find(ck) != interPairedData.end()) << '\t' << interPairedData.size() << endl;
// }
auto &pairArr = interPairedData[ck]; auto &pairArr = interPairedData[ck];
pairArr.push_back(lastRE); pairArr.push_back(lastRE);
// if (lastRE.read1IndexInFile == 1555341360 || lastRE.read2IndexInFile == 1555341360 ||
// lastRE.read1IndexInFile == 1555341145 || lastRE.read2IndexInFile == 1555341145) {
// cout << "zzh find in lp: " << pairArr.size() << '\t' << readName << '\t' << lastRE.read1IndexInFile
// << '\t' << lastRE.read2IndexInFile << '\t' << ck.Read1Pos() << '\t' << ck.Read2Pos() << endl;
// for (auto &p : pairArr) {
// cout << "reads in arr: " << p.read1IndexInFile << '\t' << p.read2IndexInFile << '\t' << p.score
// << endl;
// }
// }
// 从dict中清除配对后的readends // 从dict中清除配对后的readends
csm.unpairedDic.erase(readName); csm.unpairedDic.erase(readName);
itr = lsm.unpairedDic.erase(itr); itr = lsm.unpairedDic.erase(itr);
@ -655,12 +478,6 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) {
static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) { static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
auto &interPairedData = g.ckeyReadEndsMap; auto &interPairedData = g.ckeyReadEndsMap;
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
if (lp.taskSeq >= 929) {
cout << "zzh findUnpairedInGlobal size: " << lp.taskSeq << '\t' << lsm.unpairedDic.size() << '\t'
<< g.unpairedDic.size() << '\t' << g.ckeyReadEndsMap.size() << endl;
}
for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end();) { // 遍历上一个任务中的每个未匹配的read for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end();) { // 遍历上一个任务中的每个未匹配的read
auto &lastUnpair = *itr; auto &lastUnpair = *itr;
auto &readName = lastUnpair.first; auto &readName = lastUnpair.first;
@ -673,14 +490,6 @@ static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
CalcKey ck(gRE); CalcKey ck(gRE);
auto &pairArr = interPairedData[ck]; auto &pairArr = interPairedData[ck];
pairArr.push_back(gRE); pairArr.push_back(gRE);
// if (gRE.read1IndexInFile == 1555341360 || gRE.read2IndexInFile == 1555341360 ||
// gRE.read1IndexInFile == 1555341145 || gRE.read2IndexInFile == 1555341145) {
// cout << "zzh find in global: " << pairArr.size() << '\t' << readName << '\t' << gRE.read1IndexInFile << '\t' << gRE.read2IndexInFile
// << '\t' << ck.Read1Pos() << '\t' << ck.Read2Pos() << endl;
// for (auto &p : pairArr) {
// cout << "reads in arr: " << p.read1IndexInFile << '\t' << p.read2IndexInFile << '\t' << p.score << endl;
// }
// }
// 从dict中清除配对后的readends // 从dict中清除配对后的readends
g.unpairedDic.erase(readName); g.unpairedDic.erase(readName);
itr = lsm.unpairedDic.erase(itr); itr = lsm.unpairedDic.erase(itr);
@ -688,9 +497,6 @@ static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
++itr; ++itr;
} }
} }
for (auto &unPair : lsm.unpairedDic) {
g.unpairedDic.insert(unPair);
}
} }
static void putDupinfoToGlobal(IntersectData &g, MarkDupData &lp) { static void putDupinfoToGlobal(IntersectData &g, MarkDupData &lp) {
@ -714,57 +520,37 @@ static void putDupinfoToGlobal(IntersectData &g, MarkDupData &lp) {
// for step-5 handle intersect data // for step-5 handle intersect data
static void doIntersect(PipelineArg &pipeArg) { static void doIntersect(PipelineArg &pipeArg) {
// spdlog::info("intersect order: {}", pipeArg.intersectOrder); // spdlog::info("intersect order: {}", pipeArg.intersectOrder);
// return; const int kInitIntersectOrder = 1;
// last, current, next
const int kInitIntersectOrder = 2;
IntersectData &g = pipeArg.intersectData; IntersectData &g = pipeArg.intersectData;
MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 2) % pipeArg.MARKBUFNUM]; MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
MarkDupData &cp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM]; MarkDupData &cp = pipeArg.markDupData[(pipeArg.intersectOrder) % pipeArg.MARKBUFNUM];
MarkDupData &np = pipeArg.markDupData[(pipeArg.intersectOrder) % pipeArg.MARKBUFNUM];
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
SortMarkData &csm = *(SortMarkData *)cp.dataPtr; SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
SortMarkData &nsm = *(SortMarkData *)np.dataPtr;
// if (pipeArg.sortOrder >= 929) {
// cout << "zzh intersect size: " << pipeArg.intersectOrder << '\t' << lp.taskSeq << '\t'
// << lsm.pairs.size() << '\t' << lsm.frags.size() << '\t' << lsm.unpairedDic.size()<< '\t'
// << csm.pairs.size() << '\t' << csm.frags.size() << '\t' << csm.unpairedDic.size()<< '\t'
// << nsm.pairs.size() << '\t' << nsm.frags.size() << '\t' << nsm.unpairedDic.size()<< '\t'
// << endl;
// }
// 处理相邻数据块之间重叠的部分 // 处理相邻数据块之间重叠的部分
if (pipeArg.intersectOrder == kInitIntersectOrder) { // 第一次运行需要处理lp和cp
processIntersectFragPairs(lp, cp); processIntersectFragPairs(lp, cp);
}
processIntersectFragPairs(cp, np);
// if (pipeArg.sortOrder >= 929) {
// cout << "zzh intersect lp taskSeq: " << lp.taskSeq << '\t' << pipeArg.markDupData[0].taskSeq << '\t'
// << pipeArg.markDupData[1].taskSeq << '\t' << pipeArg.markDupData[2].taskSeq << '\t'
// << pipeArg.markDupData[3].taskSeq << '\t' << endl;
// }
// 检查确保lp和np之间没有数据交叉 // 检查确保lp和np之间没有数据交叉
int64_t lastRightPos = 0, curLeftPos = INT64_MAX, curRightPos = INT64_MAX, nextLeftPos = INT64_MAX; int64_t lastLeft = INT64_MIN, lastRight = INT64_MAX, curLeft = INT64_MAX, curRight = INT64_MAX;
if (lsm.frags.size() > 0) lastRightPos = lsm.frags.back().posKey; if (lsm.pairs.size() > 0) {
if (csm.frags.size() > 0) { lastLeft = lsm.frags[0].Left();
curLeftPos = csm.frags[0].posKey; lastRight = lsm.frags.back().Left();
curRightPos = csm.frags.back().posKey;
} }
if (nsm.frags.size() > 0) nextLeftPos = nsm.frags[0].posKey; if (csm.pairs.size() > 0) {
if (lastRightPos >= nextLeftPos) { curLeft = csm.frags[0].Left();
spdlog::error("previous data can not contain readends included by next data block! {} {}", lastRightPos, nextLeftPos); curRight = csm.frags.back().Left();
} }
// 在相邻数据块之间查找之前未匹配的readends
if (pipeArg.intersectOrder == kInitIntersectOrder) { // 第一次运行需要处理lp和cp if (g.rightPos >= curLeft) {
findUnpairedInDatas(lp, cp); spdlog::error("previous data can not contain readends included by next data block! {} {} {} {} {} {}",
lp.taskSeq, cp.taskSeq, g.rightPos, curLeft, lsm.pairs.size(), csm.pairs.size());
} }
findUnpairedInDatas(lp, np); // 找到的匹配放到lp里 g.rightPos = lastRight;
findUnpairedInDatas(cp, np); // 找到的匹配放到cp里
// 把lp中未匹配的放到global里保存 findUnpairedInDatas(lp, cp); // 找到的匹配放到lp里
findUnpairedInGlobal(g, lp); findUnpairedInGlobal(g, cp); // 把cp中未匹配的放到global里保存
// 处理lp中的新找到的匹配 // 处理lp中的新找到的匹配
TaskSeqDupInfo t; TaskSeqDupInfo t;
@ -772,46 +558,58 @@ static void doIntersect(PipelineArg &pipeArg) {
auto &ckVal = *itr; auto &ckVal = *itr;
auto &ck = ckVal.first; auto &ck = ckVal.first;
auto &pairArr = ckVal.second; auto &pairArr = ckVal.second;
getEqualRE(pairArr[0], lsm.pairs, &pairArr); getEqualRE(pairArr[0], lsm.pairs, &pairArr); // 如果不在计算范围内会放在global里
if (ck.Right() <= lastRight) { // 必须在当前数据块的范围内, 才进行处理
if (ck.Left() >= curLeft) { // 在交叉的范围内才去加上这些在cp中的数据
getEqualRE(pairArr[0], csm.pairs, &pairArr); getEqualRE(pairArr[0], csm.pairs, &pairArr);
// 在cp的ckeyReadEndsMap里找一下
auto cpKeyItr = cp.ckeyReadEndsMap.find(ck);
if (cpKeyItr != cp.ckeyReadEndsMap.end()) {
auto &cpPairArr = cpKeyItr->second;
pairArr.insert(pairArr.end(), cpPairArr.begin(), cpPairArr.end());
cp.ckeyReadEndsMap.erase(cpKeyItr);
} }
if (ck.Read2Pos() <= curRightPos) { // 必须在当前数据块的范围内否则放到global里 // 在global里找一找ck
sort(pairArr.begin(), pairArr.end()); auto gitr = g.ckeyReadEndsMap.find(ck);
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, if (gitr != g.ckeyReadEndsMap.end()) {
&t.notRepIdx); auto &gPairArr = gitr->second;
pairArr.insert(pairArr.end(), gPairArr.begin(), gPairArr.end());
g.ckeyReadEndsMap.erase(gitr);
}
sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan);
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
itr = lp.ckeyReadEndsMap.erase(itr); itr = lp.ckeyReadEndsMap.erase(itr);
} else { } else {
++itr; ++itr;
} }
} }
// 处理找到匹配的global数据 // 处理找到匹配的global数据
for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) { for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) {
auto &ckVal = *itr; auto &ckVal = *itr;
auto &ck = ckVal.first; auto &ck = ckVal.first;
if (ck.Read2Pos() < curLeftPos) { // 只有在这个范围内对应位点的所有reads才完全都包含了
auto &pairArr = ckVal.second; auto &pairArr = ckVal.second;
sort(pairArr.begin(), pairArr.end()); if (ck.Left() >= lastLeft) {
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx); getEqualRE(pairArr[0], lsm.pairs, &pairArr);
}
if (ck.Right() <= lastRight) { // 只有在这个范围内对应位点的所有reads才完全都包含了
sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan);
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
itr = g.ckeyReadEndsMap.erase(itr); itr = g.ckeyReadEndsMap.erase(itr);
} else { } else {
++itr; ++itr;
} }
} }
// 剩余的在lp中没处理的放到global里
for (auto &ckVal : lp.ckeyReadEndsMap) { for (auto &ckVal : lp.ckeyReadEndsMap) {
g.ckeyReadEndsMap.insert(ckVal); auto &pairArr = g.ckeyReadEndsMap[ckVal.first];
pairArr.insert(pairArr.end(), ckVal.second.begin(), ckVal.second.end());
} }
lp.ckeyReadEndsMap.clear(); lp.ckeyReadEndsMap.clear();
// 更新一下冗余结果 // 更新一下冗余结果
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp); refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp);
// 处理g中新找到的匹配 // 处理g中新找到的匹配
putDupinfoToGlobal(g, lp); putDupinfoToGlobal(g, lp);
for (auto &unPair : lsm.unpairedDic) {
g.unpairedDic.insert(unPair);
}
} }
static void *pipeRead(void *data) { static void *pipeRead(void *data) {
@ -845,10 +643,6 @@ static void *pipeRead(void *data) {
inBamBuf.ClearAll(); // 清理上一轮读入的数据 inBamBuf.ClearAll(); // 清理上一轮读入的数据
pipeArg.readOrder += 1; // for next pipeArg.readOrder += 1; // for next
yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据 yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据
if (pipeArg.readOrder >= 3719) {
cout << "zzh read size: " << pipeArg.readData.bams.size() << endl;
}
} }
spdlog::info("total reads processed {}, last order {}", readNumSum, pipeArg.readOrder); spdlog::info("total reads processed {}, last order {}", readNumSum, pipeArg.readOrder);
return 0; return 0;
@ -976,7 +770,7 @@ static void *pipeMarkDup(void *data) {
cout << "zzh pipeMarkDup: " << pipeArg.sortOrder << '\t' << pipeArg.markDupOrder << endl; cout << "zzh pipeMarkDup: " << pipeArg.sortOrder << '\t' << pipeArg.markDupOrder << endl;
yarn::POSSESS(pipeArg.markDupSig); yarn::POSSESS(pipeArg.markDupSig);
pipeArg.markDupFinish = 1; pipeArg.markDupFinish = 1;
yarn::TWIST(pipeArg.markDupSig, yarn::TO, 3 + pipeArg.MARKBUFNUM); yarn::TWIST(pipeArg.markDupSig, yarn::TO, 2 + pipeArg.MARKBUFNUM);
break; break;
} }
/* 冗余检测 readends */ /* 冗余检测 readends */
@ -995,11 +789,12 @@ static void *pipeMarkDup(void *data) {
} }
static void *pipeIntersect(void *data) { static void *pipeIntersect(void *data) {
PipelineArg &pipeArg = *(PipelineArg *)data; PipelineArg &pipeArg = *(PipelineArg *)data;
pipeArg.intersectOrder = 2; const int kInitIntersectOrder = 1;
pipeArg.intersectOrder = kInitIntersectOrder;
while (1) { while (1) {
PROF_START(intersect_wait); PROF_START(intersect_wait);
yarn::POSSESS(pipeArg.markDupSig); yarn::POSSESS(pipeArg.markDupSig);
yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, 2); // 等待有数据 yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, kInitIntersectOrder); // 等待有数据
yarn::RELEASE(pipeArg.markDupSig); yarn::RELEASE(pipeArg.markDupSig);
PROF_END(gprof[GP_intersect_wait], intersect_wait); PROF_END(gprof[GP_intersect_wait], intersect_wait);
@ -1031,49 +826,29 @@ static void *pipeIntersect(void *data) {
/* 当所有任务结束后global data里还有未处理的数据 */ /* 当所有任务结束后global data里还有未处理的数据 */
static void processLastData(PipelineArg &pipeArg) { static void processLastData(PipelineArg &pipeArg) {
IntersectData &g = pipeArg.intersectData; IntersectData &g = pipeArg.intersectData;
MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 2) % pipeArg.MARKBUFNUM]; MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
MarkDupData &cp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr; SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
SortMarkData &csm = *(SortMarkData *)cp.dataPtr; int64_t lastLeft = INT64_MIN;
if (lsm.pairs.size() > 0) {
spdlog::info("last taskseq: lp {}, cp {}, {}, {}", lp.taskSeq, cp.taskSeq, lp.pairDupIdx.size(), cp.pairDupIdx.size()); lastLeft = lsm.frags[0].Left();
spdlog::info("last data size: g{}-lp{}-cp{}", g.unpairedDic.size(), lsm.unpairedDic.size(), csm.unpairedDic.size());
spdlog::info("last pair frags: {}, {}, {}, {}", lsm.frags.size(), lsm.pairs.size(), csm.frags.size(), csm.pairs.size());
// 把lp中未匹配的放到global里保存
findUnpairedInGlobal(g, lp);
findUnpairedInGlobal(g, cp);
spdlog::info("last data size after: g{}-lp{}-cp{}", g.unpairedDic.size(), lsm.unpairedDic.size(), csm.unpairedDic.size());
// 处理lp中的新找到的匹配
TaskSeqDupInfo t;
for (auto &ckVal : lp.ckeyReadEndsMap) {
auto &ck = ckVal.first;
auto &pairArr = ckVal.second;
getEqualRE(pairArr[0], lsm.pairs, &pairArr);
getEqualRE(pairArr[0], csm.pairs, &pairArr);
sort(pairArr.begin(), pairArr.end());
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
} }
// 处理找到匹配的global数据 // 处理找到匹配的global数据
TaskSeqDupInfo t;
for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) { for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) {
auto &ckVal = *itr; auto &ckVal = *itr;
auto &ck = ckVal.first; auto &ck = ckVal.first;
auto &pairArr = ckVal.second; auto &pairArr = ckVal.second;
sort(pairArr.begin(), pairArr.end()); if (ck.Left() >= lastLeft) {
getEqualRE(pairArr[0], lsm.pairs, &pairArr);
}
sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan );
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx); processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx);
itr = g.ckeyReadEndsMap.erase(itr); itr = g.ckeyReadEndsMap.erase(itr);
} }
// 更新一下冗余结果 // 更新一下冗余结果
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp); refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp);
// 处理g中新找到的匹配 // 处理g中新找到的匹配
putDupinfoToGlobal(g, lp); putDupinfoToGlobal(g, lp);
putDupinfoToGlobal(g, cp);
spdlog::info("last data size: g{}-lp{}-cp{}", g.unpairedDic.size(), lsm.unpairedDic.size(), csm.unpairedDic.size());
} }
static void parallelPipeline() { static void parallelPipeline() {

View File

@ -113,6 +113,7 @@ struct DupResult {
struct IntersectData { struct IntersectData {
UnpairedNameMap unpairedDic; // 用来寻找pair end UnpairedNameMap unpairedDic; // 用来寻找pair end
CkeyReadEndsMap ckeyReadEndsMap; CkeyReadEndsMap ckeyReadEndsMap;
int64_t rightPos = 0;
// 每个task对应一个vector // 每个task对应一个vector
vector<vector<DupInfo>> &dupIdxArr; vector<vector<DupInfo>> &dupIdxArr;
@ -141,7 +142,7 @@ struct IntersectData {
struct PipelineArg { struct PipelineArg {
static const int GENBUFNUM = 2; static const int GENBUFNUM = 2;
static const int SORTBUFNUM = 2; static const int SORTBUFNUM = 2;
static const int MARKBUFNUM = 4; static const int MARKBUFNUM = 3;
uint64_t readOrder = 0; uint64_t readOrder = 0;
uint64_t genREOrder = 0; uint64_t genREOrder = 0;
uint64_t sortOrder = 0; uint64_t sortOrder = 0;
@ -212,14 +213,20 @@ struct REArrIdIdx {
const ReadEnds *pE = nullptr; const ReadEnds *pE = nullptr;
}; };
struct REGreaterThan { struct REFragGreaterThan {
bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return *b.pE < *a.pE; } bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return ReadEnds::FragLittleThan(*b.pE, *a.pE); }
}; };
struct REPairGreaterThan {
bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return ReadEnds::PairLittleThan(*b.pE, *a.pE);
}
};
template <typename CMP>
struct ReadEndsHeap { struct ReadEndsHeap {
// 将冗余索引和他对应的task vector对应起来 // 将冗余索引和他对应的task vector对应起来
vector<vector<ReadEnds>> *arr2d; vector<vector<ReadEnds>> *arr2d;
priority_queue<REArrIdIdx, vector<REArrIdIdx>, REGreaterThan> minHeap; priority_queue<REArrIdIdx, vector<REArrIdIdx>, CMP> minHeap;
uint64_t popNum = 0; uint64_t popNum = 0;
int Init(vector<vector<ReadEnds>> *_arr2d) { int Init(vector<vector<ReadEnds>> *_arr2d) {

View File

@ -32,38 +32,70 @@ struct CalcKey {
int32_t read1Coordinate = -1; int32_t read1Coordinate = -1;
int32_t read2ReferenceIndex = -1; int32_t read2ReferenceIndex = -1;
int32_t read2Coordinate = -1; int32_t read2Coordinate = -1;
int64_t left = -1;
int64_t right = -1;
CalcKey() {} CalcKey() {}
static CalcKey MaxCK() {
CalcKey ck;
ck.left = ck.right = INT64_MAX;
return ck;
}
static CalcKey MinCK() {
CalcKey ck;
ck.left = ck.right = INT64_MIN;
return ck;
}
CalcKey(const ReadEnds &re) { CalcKey(const ReadEnds &re) {
orientation = re.orientation; orientation = re.orientation;
read1ReferenceIndex = re.read1ReferenceIndex; read1ReferenceIndex = re.read1ReferenceIndex;
read1Coordinate = re.read1Coordinate; read1Coordinate = re.read1Coordinate;
read2ReferenceIndex = re.read2ReferenceIndex; read2ReferenceIndex = re.read2ReferenceIndex;
read2Coordinate = re.read2Coordinate; read2Coordinate = re.read2Coordinate;
left = Read1Pos();
right = Read2Pos();
} }
int64_t Read1Pos() const { return BamWrap::bam_global_pos(read1ReferenceIndex, read1Coordinate); } int64_t Read1Pos() const { return BamWrap::bam_global_pos(read1ReferenceIndex, read1Coordinate); }
int64_t Read2Pos() const { return BamWrap::bam_global_pos(read2ReferenceIndex, read2Coordinate); } int64_t Read2Pos() const { return BamWrap::bam_global_pos(read2ReferenceIndex, read2Coordinate); }
int64_t Left() const { return left; }
int64_t Right() const { return right; }
bool operator<(const CalcKey &o) const { bool operator<(const CalcKey &o) const {
int comp = read1ReferenceIndex - o.read1ReferenceIndex; int comp = read1ReferenceIndex - o.read1ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = read1Coordinate - o.read1Coordinate; comp = read1Coordinate - o.read1Coordinate;
// 需要orientation因为要跟排序的比较方式和顺序一致
if (comp == 0)
comp = orientation - o.orientation;
if (comp == 0) if (comp == 0)
comp = read2ReferenceIndex - o.read2ReferenceIndex; comp = read2ReferenceIndex - o.read2ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = read2Coordinate - o.read2Coordinate; comp = read2Coordinate - o.read2Coordinate;
// 需要orientation因为要跟排序的比较方式和顺序一致
if (comp == 0)
comp = orientation - o.orientation;
return comp < 0; return comp < 0;
} }
bool operator <= (const CalcKey &o) const {
return *this < o || *this == o;
}
bool operator==(const CalcKey &o) const { bool operator==(const CalcKey &o) const {
return read1ReferenceIndex == o.read1ReferenceIndex && read1Coordinate == o.read1Coordinate && return read1ReferenceIndex == o.read1ReferenceIndex && read1Coordinate == o.read1Coordinate &&
orientation == o.orientation && read2ReferenceIndex == o.read2ReferenceIndex && orientation == o.orientation && read2ReferenceIndex == o.read2ReferenceIndex &&
read2Coordinate == o.read2Coordinate; read2Coordinate == o.read2Coordinate;
} }
bool operator<(const ReadEnds &o) const {
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
if (comp == 0)
comp = read1Coordinate - o.read1Coordinate;
if (comp == 0)
comp = read2ReferenceIndex - o.read2ReferenceIndex;
if (comp == 0)
comp = read2Coordinate - o.read2Coordinate;
// 需要orientation因为要跟排序的比较方式和顺序一致
if (comp == 0)
comp = orientation - o.orientation;
return comp < 0;
}
std::size_t operator()() const { std::size_t operator()() const {
size_t h1 = read1ReferenceIndex; size_t h1 = read1ReferenceIndex;
h1 = (h1 << 40) | (read1Coordinate << 8) | orientation; h1 = (h1 << 40) | (read1Coordinate << 8) | orientation;
@ -247,8 +279,7 @@ struct DupIdxQueue {
DupInfo nextDup = dupIdx; DupInfo nextDup = dupIdx;
auto topIdx = minHeap.top(); auto topIdx = minHeap.top();
ofstream ofs("na12878.txt"); // ofstream ofs("n.dup"); ofstream ofs1("n-all.dup");
ofstream ofs1("na12878-all.txt");
while (dupIdx != -1) { while (dupIdx != -1) {
++len; ++len;
@ -264,12 +295,14 @@ struct DupIdxQueue {
<< endl; << endl;
} }
} }
ofs1 << topIdx.arrId << '\t' << topIdx.arrIdx << '\t' << topIdx.dupIdx << endl;
ofs << topIdx.dupIdx << endl; // ofs << topIdx.dupIdx << endl; ofs1 << topIdx.arrId << '\t' << topIdx.arrIdx << '\t' << topIdx.dupIdx << endl;
dupIdx = nextDup; dupIdx = nextDup;
preTop = topIdx; preTop = topIdx;
} }
ofs.close(); // ofs.close(); ofs1.close();
cout << "RealSize: " << len << endl;
return len; return len;
} }
}; };

View File

@ -11,8 +11,11 @@ Date : 2023/11/3
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <algorithm> #include <util/bam_wrap.h>
#include <algorithm>
#include <iostream>
using namespace std;
/** /**
* Small interface that provides access to the physical location information * Small interface that provides access to the physical location information
* about a cluster. All values should be defaulted to -1 if unavailable. * about a cluster. All values should be defaulted to -1 if unavailable.
@ -123,55 +126,85 @@ struct ReadEnds : PhysicalLocation {
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex; int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = a.read1Coordinate - b.read1Coordinate; comp = a.read1Coordinate - b.read1Coordinate;
if (comp == 0)
comp = a.orientation - b.orientation;
if (compareRead2) { if (compareRead2) {
if (comp == 0) if (comp == 0)
comp = a.read2ReferenceIndex - b.read2ReferenceIndex; comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = a.read2Coordinate - b.read2Coordinate; comp = a.read2Coordinate - b.read2Coordinate;
} }
if (comp == 0)
comp = a.orientation - b.orientation;
return comp < 0; return comp < 0;
} }
// 找某一个位置的所有readend时需要, 只对比位点不对比orientation static bool FragLittleThan(const ReadEnds &a, const ReadEnds &b) {
static bool PairsLittleThan(const ReadEnds &a, const ReadEnds &b) {
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex; int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = a.read1Coordinate - b.read1Coordinate; comp = a.read1Coordinate - b.read1Coordinate;
// 需要orientation因为要跟排序的比较方式和顺序一致 if (comp == 0) // 这个放在坐标比较之前因为在解析bam的时候可能有的给read2ReferenceIndex赋值了,有的则没赋值
if (comp == 0)
comp = a.orientation - b.orientation; comp = a.orientation - b.orientation;
if (comp == 0) if (comp == 0)
comp = a.read2ReferenceIndex - b.read2ReferenceIndex; comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = a.read2Coordinate - b.read2Coordinate; comp = a.read2Coordinate - b.read2Coordinate;
// if (comp == 0)
// comp = a.tile - b.tile;
// if (comp == 0)
// comp = a.x - b.x; // 由于picard的bug用short类型来表示xy导致其可能为负数
// if (comp == 0)
// comp - a.y - b.y;
if (comp == 0)
comp = (int)(a.read1IndexInFile - b.read1IndexInFile);
if (comp == 0)
comp = (int)(a.read2IndexInFile - b.read2IndexInFile);
return comp < 0; return comp < 0;
} }
// 比较函数 void Print() {
bool operator<(const ReadEnds &o) const { std::cout << read1ReferenceIndex << '\t' << read1Coordinate << '\t' << read2ReferenceIndex << '\t' << read2Coordinate
int comp = read1ReferenceIndex - o.read1ReferenceIndex; << '\t' << orientation << '\t' << read1IndexInFile << '\t' << read2IndexInFile << std::endl;
}
static bool PairLittleThan(const ReadEnds &a, const ReadEnds &b) {
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = read1Coordinate - o.read1Coordinate; comp = a.read1Coordinate - b.read1Coordinate;
if (comp == 0) if (comp == 0)
comp = orientation - o.orientation; comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
if (comp == 0) if (comp == 0)
comp = read2ReferenceIndex - o.read2ReferenceIndex; comp = a.read2Coordinate - b.read2Coordinate;
if (comp == 0) // 这个放在坐标比较了之后,把坐标范围的放在之前,这样对分段数据块比较好处理
comp = a.orientation - b.orientation;
// if (comp == 0)
// comp = a.tile - b.tile;
// if (comp == 0)
// comp = a.x - b.x; // 由于picard的bug用short类型来表示xy导致其可能为负数
// if (comp == 0)
// comp - a.y - b.y;
if (comp == 0) if (comp == 0)
comp = read2Coordinate - o.read2Coordinate; comp = (int)(a.read1IndexInFile - b.read1IndexInFile);
//if (comp == 0)
// comp = tile - o.tile;
//if (comp == 0)
// comp = x - o.x;
//if (comp == 0)
// comp - y - o.y;
if (comp == 0) if (comp == 0)
comp = (int)(read1IndexInFile - o.read1IndexInFile); comp = (int)(a.read2IndexInFile - b.read2IndexInFile);
if (comp == 0)
comp = (int)(read2IndexInFile - o.read2IndexInFile);
return comp < 0; return comp < 0;
} }
static bool CorLittleThan(const ReadEnds &a, const ReadEnds &b) {
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
if (comp == 0)
comp = a.read1Coordinate - b.read1Coordinate;
if (comp == 0)
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
if (comp == 0)
comp = a.read2Coordinate - b.read2Coordinate;
if (comp == 0) // 这个放在坐标比较了之后,把坐标范围的放在之前,这样对分段数据块比较好处理
comp = a.orientation - b.orientation;
return comp < 0;
}
// for pairs only
int64_t Left() { return BamWrap::bam_global_pos(read1ReferenceIndex, read1Coordinate); }
int64_t Right() { return BamWrap::bam_global_pos(read2ReferenceIndex, read2Coordinate); }
}; };
struct ReadEndsHash { struct ReadEndsHash {