完美解决了所有问题,基本都是由排序带来的,排序过程涉及的变量太多,导致范围跟预想的不一致,现在都解决了
This commit is contained in:
parent
64d23b3f2d
commit
3606562de9
|
|
@ -164,6 +164,7 @@ int MarkDuplicates() {
|
||||||
spdlog::info("{} optical reads found", opticalIdxQue.Size());
|
spdlog::info("{} optical reads found", opticalIdxQue.Size());
|
||||||
spdlog::info("{} represent reads found", repIdxQue.Size());
|
spdlog::info("{} represent reads found", repIdxQue.Size());
|
||||||
// spdlog::info("real dup size: {}", dupIdxQue.RealSize());
|
// spdlog::info("real dup size: {}", dupIdxQue.RealSize());
|
||||||
|
// spdlog::info("real optical dup size: {}", opticalIdxQue.RealSize());
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ struct MarkDupsArg {
|
||||||
|
|
||||||
int NUM_THREADS = 1;
|
int NUM_THREADS = 1;
|
||||||
|
|
||||||
size_t MAX_MEM = ((size_t)1) << 28; // << 31 // 最小2G
|
size_t MAX_MEM = ((size_t)1) << 30; // // 最小1G
|
||||||
|
|
||||||
bool DUPLEX_IO = true; // 同时读写
|
bool DUPLEX_IO = true; // 同时读写
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,7 @@
|
||||||
#include "util/yarn.h"
|
#include "util/yarn.h"
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
namespace nsgv {
|
namespace nsgv {
|
||||||
|
|
||||||
|
|
@ -30,40 +31,6 @@ extern DupResult gDupRes;
|
||||||
extern PipelineArg gPipe;
|
extern PipelineArg gPipe;
|
||||||
}; // namespace nsgv
|
}; // namespace nsgv
|
||||||
|
|
||||||
/* 排序 */
|
|
||||||
static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
|
|
||||||
size_t blockSize = 64 * 1024;
|
|
||||||
if (arr.size() < blockSize) {
|
|
||||||
std::sort(arr.begin(), arr.end());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
size_t blockNum = (arr.size() + blockSize - 1) / blockSize;
|
|
||||||
size_t crossNum = 1024;
|
|
||||||
size_t start, end, i, left, right;
|
|
||||||
std::sort(arr.begin(), arr.begin() + blockSize);
|
|
||||||
for (i = 1; i < blockNum; ++i) {
|
|
||||||
start = i * blockSize;
|
|
||||||
end = min(start + blockSize, arr.size());
|
|
||||||
std::sort(arr.begin() + start, arr.begin() + end);
|
|
||||||
left = crossNum;
|
|
||||||
while (!(arr[start - left] < arr[start])) {
|
|
||||||
left = left << 1;
|
|
||||||
if (left >= blockSize) {
|
|
||||||
std::sort(arr.begin(), arr.end()); // 退化到普通排序
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
right = min(crossNum, end - start - 1);
|
|
||||||
|
|
||||||
while (!(arr[start - 1] < arr[start + right])) {
|
|
||||||
right = min(right << 1, end - start - 1);
|
|
||||||
if (right == end - start - 1)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
|
/* 处理一组pairend的readends,标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
|
||||||
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
|
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
|
||||||
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
|
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
|
||||||
|
|
@ -71,7 +38,6 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
|
||||||
if (vpRe.size() < 2) {
|
if (vpRe.size() < 2) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
int maxScore = 0;
|
int maxScore = 0;
|
||||||
const ReadEnds *pBest = nullptr;
|
const ReadEnds *pBest = nullptr;
|
||||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||||
|
|
@ -110,14 +76,6 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
|
||||||
}
|
}
|
||||||
for (auto pe : vpRe) { // 对非best read标记冗余
|
for (auto pe : vpRe) { // 对非best read标记冗余
|
||||||
if (pe != pBest) { // 非best
|
if (pe != pBest) { // 非best
|
||||||
|
|
||||||
// if (1555341360 == pe->read1IndexInFile) {
|
|
||||||
// for (auto p : vpRe) {
|
|
||||||
// cout << "zzh find: " << p->read1IndexInFile << '\t' << p->read2IndexInFile << endl;
|
|
||||||
// }
|
|
||||||
// cout << "split" << endl;
|
|
||||||
// }
|
|
||||||
|
|
||||||
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1
|
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1
|
||||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||||
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
|
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
|
||||||
|
|
@ -177,7 +135,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||||
|
|
||||||
/* 找到与readend pos相等的所有readend */
|
/* 找到与readend pos相等的所有readend */
|
||||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
||||||
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点
|
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::CorLittleThan); // 只比对位点
|
||||||
dst->insert(dst->end(), range.first, range.second);
|
dst->insert(dst->end(), range.first, range.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,19 +190,6 @@ static void processFrags(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 单线程markdup (第二步)*/
|
|
||||||
static void markdups(MarkDupDataArg *arg) {
|
|
||||||
auto &p = *arg;
|
|
||||||
p.fragDupIdx.clear();
|
|
||||||
p.pairDupIdx.clear();
|
|
||||||
p.pairOpticalDupIdx.clear();
|
|
||||||
p.pairRepIdx.clear();
|
|
||||||
/* generateDuplicateIndexes,计算冗余read在所有read中的位置索引 */
|
|
||||||
// 先处理 pair
|
|
||||||
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx, &p.pairRepIdx);
|
|
||||||
// 再处理frag
|
|
||||||
processFrags(p.frags, &p.fragDupIdx);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 获取交叉部分的数据 */
|
/* 获取交叉部分的数据 */
|
||||||
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
|
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst,
|
||||||
|
|
@ -272,7 +217,10 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
|
||||||
}
|
}
|
||||||
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
||||||
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
dst->insert(dst->end(), rightArr.begin(), rightArr.begin() + rightSpan);
|
||||||
std::sort(dst->begin(), dst->end());
|
if (isPairCmp)
|
||||||
|
std::sort(dst->begin(), dst->end(), ReadEnds::PairLittleThan);
|
||||||
|
else
|
||||||
|
std::sort(dst->begin(), dst->end(), ReadEnds::FragLittleThan);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 将frags重叠部分的dup idx放进数据中 */
|
/* 将frags重叠部分的dup idx放进数据中 */
|
||||||
|
|
@ -290,59 +238,7 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> ¬
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 最后合并数据并排序 */
|
|
||||||
template <typename DupContainer, typename T>
|
|
||||||
static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> ¬DupIdx, vector<T> &dupArr,
|
|
||||||
vector<T> &cacheDupIdx1, vector<T> &midArr1) {
|
|
||||||
// midArr.resize(0);
|
|
||||||
// cacheDupIdx.resize(0);
|
|
||||||
|
|
||||||
vector<T> cacheDupIdx;
|
|
||||||
vector<T> midArr;
|
|
||||||
|
|
||||||
cacheDupIdx.insert(cacheDupIdx.end(), dupIdx.begin(), dupIdx.end());
|
|
||||||
std::sort(cacheDupIdx.begin(), cacheDupIdx.end());
|
|
||||||
|
|
||||||
auto ai = dupArr.begin();
|
|
||||||
auto ae = dupArr.end();
|
|
||||||
auto bi = cacheDupIdx.begin();
|
|
||||||
auto be = cacheDupIdx.end();
|
|
||||||
|
|
||||||
T val = 0;
|
|
||||||
while (ai != ae && bi != be) {
|
|
||||||
if (*ai < *bi) {
|
|
||||||
val = *ai++;
|
|
||||||
} else if (*bi < *ai) {
|
|
||||||
val = *bi++;
|
|
||||||
} else {
|
|
||||||
val = *bi++; // 相等的时候取后面的作为结果
|
|
||||||
ai++;
|
|
||||||
}
|
|
||||||
if (notDupIdx.find(val) == notDupIdx.end()) {
|
|
||||||
midArr.push_back(val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
while (ai != ae) {
|
|
||||||
val = *ai++;
|
|
||||||
if (notDupIdx.find(val) == notDupIdx.end()) {
|
|
||||||
midArr.push_back(val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
while (bi != be) {
|
|
||||||
val = *bi++;
|
|
||||||
if (notDupIdx.find(val) == notDupIdx.end()) {
|
|
||||||
midArr.push_back(val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// spdlog::info("midArr & dupArr size: {}-{}", midArr.size(), dupArr.size());
|
|
||||||
// dupArr = midArr;
|
|
||||||
dupArr.clear();
|
|
||||||
dupArr.assign(midArr.begin(), midArr.end());
|
|
||||||
spdlog::info("midArr & dupArr size: {}-{}", midArr.size(), dupArr.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
// for step 2 generate read ends
|
// for step 2 generate read ends
|
||||||
|
|
||||||
// multi-thread generate read ends
|
// multi-thread generate read ends
|
||||||
static void mtGenerateReadEnds(void *data, long idx, int tid) {
|
static void mtGenerateReadEnds(void *data, long idx, int tid) {
|
||||||
auto &p = *(PipelineArg *)data;
|
auto &p = *(PipelineArg *)data;
|
||||||
|
|
@ -369,9 +265,6 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) {
|
||||||
if (bw->GetReadUnmappedFlag()) {
|
if (bw->GetReadUnmappedFlag()) {
|
||||||
if (bw->b->core.tid == -1)
|
if (bw->b->core.tid == -1)
|
||||||
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
||||||
//if (p.genREOrder >= 3719) {
|
|
||||||
// cout << "inner core.tid check\t" << bw->b->core.tid << '\t' << i << "\t" << start_id << '\t' << end_id << endl;
|
|
||||||
//}
|
|
||||||
break;
|
break;
|
||||||
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
|
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
|
||||||
ReadEnds fragEnd;
|
ReadEnds fragEnd;
|
||||||
|
|
@ -379,9 +272,6 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) {
|
||||||
frags.push_back(fragEnd); // 添加进frag集合
|
frags.push_back(fragEnd); // 添加进frag集合
|
||||||
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
|
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
|
||||||
string key = bw->query_name();
|
string key = bw->query_name();
|
||||||
// if ("ERR194147.780864524" == key) {
|
|
||||||
// cout << "zzh find: " << key << '\t' << fragEnd.read1IndexInFile << '\t' << fragEnd.read2IndexInFile << endl;
|
|
||||||
// }
|
|
||||||
if (unpairedDic.find(key) == unpairedDic.end()) {
|
if (unpairedDic.find(key) == unpairedDic.end()) {
|
||||||
unpairedDic[key] = {taskSeq, fragEnd};
|
unpairedDic[key] = {taskSeq, fragEnd};
|
||||||
} else { // 找到了pairend
|
} else { // 找到了pairend
|
||||||
|
|
@ -396,25 +286,15 @@ static void mtGenerateReadEnds(void *data, long idx, int tid) {
|
||||||
PROF_END(tprof[TP_gen][tid], gen);
|
PROF_END(tprof[TP_gen][tid], gen);
|
||||||
|
|
||||||
PROF_START(sort_frag);
|
PROF_START(sort_frag);
|
||||||
// sortReadEndsArr(frags);
|
sort(frags.begin(), frags.end(), ReadEnds::FragLittleThan);
|
||||||
sort(frags.begin(), frags.end());
|
|
||||||
PROF_END(tprof[TP_sort_frag][tid], sort_frag);
|
PROF_END(tprof[TP_sort_frag][tid], sort_frag);
|
||||||
|
|
||||||
PROF_START(sort_pair);
|
PROF_START(sort_pair);
|
||||||
sort(pairs.begin(), pairs.end());
|
sort(pairs.begin(), pairs.end(), ReadEnds::PairLittleThan);
|
||||||
PROF_END(tprof[TP_sort_pair][tid], sort_pair);
|
PROF_END(tprof[TP_sort_pair][tid], sort_pair);
|
||||||
|
|
||||||
// if (p.genREOrder >= 3719) {
|
|
||||||
// cout << "zzh genRE size: " << p.genREOrder << '\t' << tid << '\t' << frags.size() << '\t' << pairs.size()
|
|
||||||
// << '\t' << start_id << '\t' << end_id << '\t' << bams.size() << endl;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void doGenRE(PipelineArg &pipeArg) {
|
static void doGenRE(PipelineArg &pipeArg) {
|
||||||
// return;
|
|
||||||
//if (pipeArg.genREOrder < 895) return;
|
|
||||||
//if (pipeArg.genREOrder < 440) return;
|
|
||||||
|
|
||||||
GenREData &genREData = pipeArg.genREData[pipeArg.genREOrder % pipeArg.GENBUFNUM];
|
GenREData &genREData = pipeArg.genREData[pipeArg.genREOrder % pipeArg.GENBUFNUM];
|
||||||
ReadData &readData = pipeArg.readData;
|
ReadData &readData = pipeArg.readData;
|
||||||
|
|
||||||
|
|
@ -434,36 +314,17 @@ static void doGenRE(PipelineArg &pipeArg) {
|
||||||
for (auto &val : genREData.unpairedDicArr[i]) {
|
for (auto &val : genREData.unpairedDicArr[i]) {
|
||||||
const string &key = val.first;
|
const string &key = val.first;
|
||||||
const ReadEnds &fragEnd = val.second.unpairedRE;
|
const ReadEnds &fragEnd = val.second.unpairedRE;
|
||||||
|
|
||||||
// if ("ERR194147.783448001" == key) {
|
|
||||||
// cout << "zzh find in doGen: " << key << '\t' << fragEnd.read1IndexInFile << '\t' << fragEnd.read2IndexInFile
|
|
||||||
// << endl;
|
|
||||||
// }
|
|
||||||
|
|
||||||
if (genREData.unpairedDic.find(key) == genREData.unpairedDic.end()) {
|
if (genREData.unpairedDic.find(key) == genREData.unpairedDic.end()) {
|
||||||
genREData.unpairedDic[key] = {readData.taskSeq, fragEnd};
|
genREData.unpairedDic[key] = {readData.taskSeq, fragEnd};
|
||||||
// if (fragEnd.read1IndexInFile == 1524046492 || fragEnd.read2IndexInFile == 1524046492) {
|
|
||||||
// cout << "zzh not find paired: " << key << "\t" << fragEnd.read1IndexInFile << '\t'
|
|
||||||
// << fragEnd.read2IndexInFile << endl;
|
|
||||||
// }
|
|
||||||
} else { // 找到了pairend
|
} else { // 找到了pairend
|
||||||
auto &pairedEnds = genREData.unpairedDic.at(key).unpairedRE;
|
auto &pairedEnds = genREData.unpairedDic.at(key).unpairedRE;
|
||||||
modifyPairedEnds(fragEnd, &pairedEnds);
|
modifyPairedEnds(fragEnd, &pairedEnds);
|
||||||
pairs.push_back(pairedEnds);
|
pairs.push_back(pairedEnds);
|
||||||
// if (pairedEnds.read1IndexInFile == 1524046492 || pairedEnds.read2IndexInFile == 1524046492) {
|
|
||||||
// cout << "zzh find paired: " << key << '\t' << pairedEnds.read1IndexInFile << '\t'
|
|
||||||
// << pairedEnds.read2IndexInFile << "\t" << fragEnd.read1IndexInFile << '\t'
|
|
||||||
// << fragEnd.read2IndexInFile << '\t' << endl;
|
|
||||||
// }
|
|
||||||
genREData.unpairedDic.erase(key); // 删除找到的pairend
|
genREData.unpairedDic.erase(key); // 删除找到的pairend
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sort(pairs.begin(), pairs.end());
|
sort(pairs.begin(), pairs.end(), ReadEnds::PairLittleThan);
|
||||||
// if (pipeArg.genREOrder >= 3719) {
|
|
||||||
// cout << "zzh genRE size: " << pipeArg.genREOrder << '\t' << pairs.size() << '\t' << genREData.unpairedDic.size()
|
|
||||||
// << '\t' << readData.bams.size() << '\t' << testNum << endl;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
// end for step 2 generate read ends
|
// end for step 2 generate read ends
|
||||||
|
|
||||||
|
|
@ -479,37 +340,26 @@ static void doSort(PipelineArg &pipeArg) {
|
||||||
smd.pairs.clear();
|
smd.pairs.clear();
|
||||||
smd.frags.clear();
|
smd.frags.clear();
|
||||||
const ReadEnds *pRE;
|
const ReadEnds *pRE;
|
||||||
ReadEndsHeap pairsHeap, fragsHeap;
|
ReadEndsHeap<REFragGreaterThan> fragsHeap;
|
||||||
|
ReadEndsHeap<REPairGreaterThan> pairsHeap;
|
||||||
PROF_START(sort_pair);
|
PROF_START(sort_pair);
|
||||||
pairsHeap.Init(&genREData.pairsArr);
|
pairsHeap.Init(&genREData.pairsArr);
|
||||||
// if (pipeArg.sortOrder >= 3719) {
|
|
||||||
// cout << "zzh sort pair size: " << pairsHeap.Size() << endl;
|
|
||||||
// }
|
|
||||||
while ((pRE = pairsHeap.Pop()) != nullptr) {
|
while ((pRE = pairsHeap.Pop()) != nullptr) {
|
||||||
smd.pairs.push_back(*pRE);
|
smd.pairs.push_back(*pRE);
|
||||||
}
|
}
|
||||||
PROF_END(gprof[GP_sort_pair], sort_pair);
|
PROF_END(gprof[GP_sort_pair], sort_pair);
|
||||||
PROF_START(sort_frag);
|
PROF_START(sort_frag);
|
||||||
fragsHeap.Init(&genREData.fragsArr);
|
fragsHeap.Init(&genREData.fragsArr);
|
||||||
// if (pipeArg.sortOrder >= 3719) {
|
|
||||||
// cout << "zzh sort frag size: " << fragsHeap.Size() << endl;
|
|
||||||
// }
|
|
||||||
while ((pRE = fragsHeap.Pop()) != nullptr) {
|
while ((pRE = fragsHeap.Pop()) != nullptr) {
|
||||||
smd.frags.push_back(*pRE);
|
smd.frags.push_back(*pRE);
|
||||||
}
|
}
|
||||||
PROF_END(gprof[GP_sort_frag], sort_frag);
|
PROF_END(gprof[GP_sort_frag], sort_frag);
|
||||||
// if (pipeArg.sortOrder >= 929) {
|
|
||||||
// cout << "zzh sort size: " << pipeArg.sortOrder << '\t' << smd.pairs.size() << '\t' << smd.frags.size() << '\t'
|
|
||||||
// << genREData.pairsArr.size() << '\t' << genREData.fragsArr.size() << endl;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
// for step-4 sort
|
// for step-4 sort
|
||||||
static void doMarkDup(PipelineArg &pipeArg) {
|
static void doMarkDup(PipelineArg &pipeArg) {
|
||||||
// return;
|
|
||||||
MarkDupData &mdData = pipeArg.markDupData[pipeArg.markDupOrder % pipeArg.MARKBUFNUM];
|
MarkDupData &mdData = pipeArg.markDupData[pipeArg.markDupOrder % pipeArg.MARKBUFNUM];
|
||||||
SortData &sortData = pipeArg.sortData[pipeArg.markDupOrder % pipeArg.SORTBUFNUM];
|
SortData &sortData = pipeArg.sortData[pipeArg.markDupOrder % pipeArg.SORTBUFNUM];
|
||||||
mdData.taskSeq = pipeArg.markDupOrder;
|
mdData.taskSeq = pipeArg.markDupOrder;
|
||||||
cout << "zzh markdup: " << mdData.taskSeq << '\t' << pipeArg.markDupOrder << '\t' << pipeArg.intersectOrder << endl;
|
|
||||||
mdData.clear();
|
mdData.clear();
|
||||||
|
|
||||||
auto tmpPtr = mdData.dataPtr;
|
auto tmpPtr = mdData.dataPtr;
|
||||||
|
|
@ -525,10 +375,6 @@ static void doMarkDup(PipelineArg &pipeArg) {
|
||||||
PROF_START(markdup_frag);
|
PROF_START(markdup_frag);
|
||||||
processFrags(smd.frags, &mdData.fragDupIdx);
|
processFrags(smd.frags, &mdData.fragDupIdx);
|
||||||
PROF_END(gprof[GP_markdup_frag], markdup_frag);
|
PROF_END(gprof[GP_markdup_frag], markdup_frag);
|
||||||
// if (mdData.taskSeq >= 929) {
|
|
||||||
// cout << "zzh markdup size: " << mdData.taskSeq << '\t' << smd.pairs.size() << '\t' << smd.frags.size() << '\t'
|
|
||||||
// << smd.unpairedDic.size() << '\t' << mdData.pairDupIdx.size() << '\t' << mdData.fragDupIdx.size() << endl;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
@ -599,7 +445,7 @@ static void processIntersectFragPairs(MarkDupData &lp, MarkDupData &cp) {
|
||||||
|
|
||||||
getIntersectData(lsm.pairs, csm.pairs, &reArr, true);
|
getIntersectData(lsm.pairs, csm.pairs, &reArr, true);
|
||||||
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, ¬DupIdx, ¬OpticalDupIdx, ¬RepIdx);
|
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, ¬DupIdx, ¬OpticalDupIdx, ¬RepIdx);
|
||||||
refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, lp, cp);
|
refreshMarkDupData(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, cp, lp); // 放在cp里,因为后面global里可能有相同的dup,防止多次出现
|
||||||
}
|
}
|
||||||
|
|
||||||
// 在相邻的数据块之间寻找未匹配的readends, 找到匹配的放到lp里
|
// 在相邻的数据块之间寻找未匹配的readends, 找到匹配的放到lp里
|
||||||
|
|
@ -607,15 +453,6 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) {
|
||||||
auto &interPairedData = lp.ckeyReadEndsMap;
|
auto &interPairedData = lp.ckeyReadEndsMap;
|
||||||
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
||||||
SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
|
SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
|
||||||
|
|
||||||
//if (lp.taskSeq >= 1856)
|
|
||||||
// return; //
|
|
||||||
//if (lp.taskSeq >= 1800) {
|
|
||||||
// cout << "zzh find in findUnpairedInDatas: " << lp.taskSeq << '\t' << cp.taskSeq << '\t'
|
|
||||||
// << lsm.unpairedDic.size() << '\t' << csm.unpairedDic.size() << '\t' << lp.ckeyReadEndsMap.size() << '\t'
|
|
||||||
// << cp.ckeyReadEndsMap.size() << endl;
|
|
||||||
//}
|
|
||||||
|
|
||||||
for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end(); ) { // 遍历上一个任务中的每个未匹配的read
|
for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end(); ) { // 遍历上一个任务中的每个未匹配的read
|
||||||
auto &lastUnpair = *itr;
|
auto &lastUnpair = *itr;
|
||||||
auto &readName = lastUnpair.first;
|
auto &readName = lastUnpair.first;
|
||||||
|
|
@ -626,22 +463,8 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) {
|
||||||
auto &curRE = curUnpairInfo.unpairedRE;
|
auto &curRE = curUnpairInfo.unpairedRE;
|
||||||
modifyPairedEnds(curRE, &lastRE); // lastRE当做找到匹配后,完整的ReadEnds
|
modifyPairedEnds(curRE, &lastRE); // lastRE当做找到匹配后,完整的ReadEnds
|
||||||
CalcKey ck(lastRE);
|
CalcKey ck(lastRE);
|
||||||
// if (lp.taskSeq == 1856) {
|
|
||||||
// cout << "zzh ck: " << ck.Read1Pos() << ' ' << ck.Read2Pos()
|
|
||||||
// << "\t find: " << (interPairedData.find(ck) != interPairedData.end()) << '\t' << interPairedData.size() << endl;
|
|
||||||
// }
|
|
||||||
auto &pairArr = interPairedData[ck];
|
auto &pairArr = interPairedData[ck];
|
||||||
pairArr.push_back(lastRE);
|
pairArr.push_back(lastRE);
|
||||||
|
|
||||||
// if (lastRE.read1IndexInFile == 1555341360 || lastRE.read2IndexInFile == 1555341360 ||
|
|
||||||
// lastRE.read1IndexInFile == 1555341145 || lastRE.read2IndexInFile == 1555341145) {
|
|
||||||
// cout << "zzh find in lp: " << pairArr.size() << '\t' << readName << '\t' << lastRE.read1IndexInFile
|
|
||||||
// << '\t' << lastRE.read2IndexInFile << '\t' << ck.Read1Pos() << '\t' << ck.Read2Pos() << endl;
|
|
||||||
// for (auto &p : pairArr) {
|
|
||||||
// cout << "reads in arr: " << p.read1IndexInFile << '\t' << p.read2IndexInFile << '\t' << p.score
|
|
||||||
// << endl;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// 从dict中清除配对后的readends
|
// 从dict中清除配对后的readends
|
||||||
csm.unpairedDic.erase(readName);
|
csm.unpairedDic.erase(readName);
|
||||||
itr = lsm.unpairedDic.erase(itr);
|
itr = lsm.unpairedDic.erase(itr);
|
||||||
|
|
@ -655,12 +478,6 @@ static void findUnpairedInDatas(MarkDupData &lp, MarkDupData &cp) {
|
||||||
static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
|
static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
|
||||||
auto &interPairedData = g.ckeyReadEndsMap;
|
auto &interPairedData = g.ckeyReadEndsMap;
|
||||||
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
||||||
|
|
||||||
if (lp.taskSeq >= 929) {
|
|
||||||
cout << "zzh findUnpairedInGlobal size: " << lp.taskSeq << '\t' << lsm.unpairedDic.size() << '\t'
|
|
||||||
<< g.unpairedDic.size() << '\t' << g.ckeyReadEndsMap.size() << endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end();) { // 遍历上一个任务中的每个未匹配的read
|
for (auto itr = lsm.unpairedDic.begin(); itr != lsm.unpairedDic.end();) { // 遍历上一个任务中的每个未匹配的read
|
||||||
auto &lastUnpair = *itr;
|
auto &lastUnpair = *itr;
|
||||||
auto &readName = lastUnpair.first;
|
auto &readName = lastUnpair.first;
|
||||||
|
|
@ -673,14 +490,6 @@ static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
|
||||||
CalcKey ck(gRE);
|
CalcKey ck(gRE);
|
||||||
auto &pairArr = interPairedData[ck];
|
auto &pairArr = interPairedData[ck];
|
||||||
pairArr.push_back(gRE);
|
pairArr.push_back(gRE);
|
||||||
// if (gRE.read1IndexInFile == 1555341360 || gRE.read2IndexInFile == 1555341360 ||
|
|
||||||
// gRE.read1IndexInFile == 1555341145 || gRE.read2IndexInFile == 1555341145) {
|
|
||||||
// cout << "zzh find in global: " << pairArr.size() << '\t' << readName << '\t' << gRE.read1IndexInFile << '\t' << gRE.read2IndexInFile
|
|
||||||
// << '\t' << ck.Read1Pos() << '\t' << ck.Read2Pos() << endl;
|
|
||||||
// for (auto &p : pairArr) {
|
|
||||||
// cout << "reads in arr: " << p.read1IndexInFile << '\t' << p.read2IndexInFile << '\t' << p.score << endl;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// 从dict中清除配对后的readends
|
// 从dict中清除配对后的readends
|
||||||
g.unpairedDic.erase(readName);
|
g.unpairedDic.erase(readName);
|
||||||
itr = lsm.unpairedDic.erase(itr);
|
itr = lsm.unpairedDic.erase(itr);
|
||||||
|
|
@ -688,9 +497,6 @@ static void findUnpairedInGlobal(IntersectData &g, MarkDupData &lp) {
|
||||||
++itr;
|
++itr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto &unPair : lsm.unpairedDic) {
|
|
||||||
g.unpairedDic.insert(unPair);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void putDupinfoToGlobal(IntersectData &g, MarkDupData &lp) {
|
static void putDupinfoToGlobal(IntersectData &g, MarkDupData &lp) {
|
||||||
|
|
@ -714,57 +520,37 @@ static void putDupinfoToGlobal(IntersectData &g, MarkDupData &lp) {
|
||||||
// for step-5 handle intersect data
|
// for step-5 handle intersect data
|
||||||
static void doIntersect(PipelineArg &pipeArg) {
|
static void doIntersect(PipelineArg &pipeArg) {
|
||||||
// spdlog::info("intersect order: {}", pipeArg.intersectOrder);
|
// spdlog::info("intersect order: {}", pipeArg.intersectOrder);
|
||||||
// return;
|
const int kInitIntersectOrder = 1;
|
||||||
// last, current, next
|
|
||||||
const int kInitIntersectOrder = 2;
|
|
||||||
IntersectData &g = pipeArg.intersectData;
|
IntersectData &g = pipeArg.intersectData;
|
||||||
MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 2) % pipeArg.MARKBUFNUM];
|
MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
|
||||||
MarkDupData &cp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
|
MarkDupData &cp = pipeArg.markDupData[(pipeArg.intersectOrder) % pipeArg.MARKBUFNUM];
|
||||||
MarkDupData &np = pipeArg.markDupData[(pipeArg.intersectOrder) % pipeArg.MARKBUFNUM];
|
|
||||||
|
|
||||||
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
||||||
SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
|
SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
|
||||||
SortMarkData &nsm = *(SortMarkData *)np.dataPtr;
|
|
||||||
|
|
||||||
// if (pipeArg.sortOrder >= 929) {
|
|
||||||
// cout << "zzh intersect size: " << pipeArg.intersectOrder << '\t' << lp.taskSeq << '\t'
|
|
||||||
// << lsm.pairs.size() << '\t' << lsm.frags.size() << '\t' << lsm.unpairedDic.size()<< '\t'
|
|
||||||
// << csm.pairs.size() << '\t' << csm.frags.size() << '\t' << csm.unpairedDic.size()<< '\t'
|
|
||||||
// << nsm.pairs.size() << '\t' << nsm.frags.size() << '\t' << nsm.unpairedDic.size()<< '\t'
|
|
||||||
// << endl;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// 处理相邻数据块之间重叠的部分
|
// 处理相邻数据块之间重叠的部分
|
||||||
if (pipeArg.intersectOrder == kInitIntersectOrder) { // 第一次运行,需要处理lp和cp
|
processIntersectFragPairs(lp, cp);
|
||||||
processIntersectFragPairs(lp, cp);
|
|
||||||
}
|
|
||||||
processIntersectFragPairs(cp, np);
|
|
||||||
// if (pipeArg.sortOrder >= 929) {
|
|
||||||
// cout << "zzh intersect lp taskSeq: " << lp.taskSeq << '\t' << pipeArg.markDupData[0].taskSeq << '\t'
|
|
||||||
// << pipeArg.markDupData[1].taskSeq << '\t' << pipeArg.markDupData[2].taskSeq << '\t'
|
|
||||||
// << pipeArg.markDupData[3].taskSeq << '\t' << endl;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// 检查确保lp和np之间没有数据交叉
|
// 检查确保lp和np之间没有数据交叉
|
||||||
int64_t lastRightPos = 0, curLeftPos = INT64_MAX, curRightPos = INT64_MAX, nextLeftPos = INT64_MAX;
|
int64_t lastLeft = INT64_MIN, lastRight = INT64_MAX, curLeft = INT64_MAX, curRight = INT64_MAX;
|
||||||
if (lsm.frags.size() > 0) lastRightPos = lsm.frags.back().posKey;
|
if (lsm.pairs.size() > 0) {
|
||||||
if (csm.frags.size() > 0) {
|
lastLeft = lsm.frags[0].Left();
|
||||||
curLeftPos = csm.frags[0].posKey;
|
lastRight = lsm.frags.back().Left();
|
||||||
curRightPos = csm.frags.back().posKey;
|
|
||||||
}
|
}
|
||||||
if (nsm.frags.size() > 0) nextLeftPos = nsm.frags[0].posKey;
|
if (csm.pairs.size() > 0) {
|
||||||
if (lastRightPos >= nextLeftPos) {
|
curLeft = csm.frags[0].Left();
|
||||||
spdlog::error("previous data can not contain readends included by next data block! {} {}", lastRightPos, nextLeftPos);
|
curRight = csm.frags.back().Left();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 在相邻数据块之间查找之前未匹配的readends
|
|
||||||
if (pipeArg.intersectOrder == kInitIntersectOrder) { // 第一次运行,需要处理lp和cp
|
if (g.rightPos >= curLeft) {
|
||||||
findUnpairedInDatas(lp, cp);
|
spdlog::error("previous data can not contain readends included by next data block! {} {} {} {} {} {}",
|
||||||
|
lp.taskSeq, cp.taskSeq, g.rightPos, curLeft, lsm.pairs.size(), csm.pairs.size());
|
||||||
}
|
}
|
||||||
findUnpairedInDatas(lp, np); // 找到的匹配放到lp里
|
g.rightPos = lastRight;
|
||||||
findUnpairedInDatas(cp, np); // 找到的匹配放到cp里
|
|
||||||
// 把lp中未匹配的放到global里保存
|
findUnpairedInDatas(lp, cp); // 找到的匹配放到lp里
|
||||||
findUnpairedInGlobal(g, lp);
|
findUnpairedInGlobal(g, cp); // 把cp中未匹配的放到global里保存
|
||||||
|
|
||||||
// 处理lp中的新找到的匹配
|
// 处理lp中的新找到的匹配
|
||||||
TaskSeqDupInfo t;
|
TaskSeqDupInfo t;
|
||||||
|
|
@ -772,46 +558,58 @@ static void doIntersect(PipelineArg &pipeArg) {
|
||||||
auto &ckVal = *itr;
|
auto &ckVal = *itr;
|
||||||
auto &ck = ckVal.first;
|
auto &ck = ckVal.first;
|
||||||
auto &pairArr = ckVal.second;
|
auto &pairArr = ckVal.second;
|
||||||
getEqualRE(pairArr[0], lsm.pairs, &pairArr);
|
getEqualRE(pairArr[0], lsm.pairs, &pairArr); // 如果不在计算范围内,会放在global里
|
||||||
getEqualRE(pairArr[0], csm.pairs, &pairArr);
|
if (ck.Right() <= lastRight) { // 必须在当前数据块的范围内, 才进行处理
|
||||||
// 在cp的ckeyReadEndsMap里找一下
|
if (ck.Left() >= curLeft) { // 在交叉的范围内才去加上这些在cp中的数据
|
||||||
auto cpKeyItr = cp.ckeyReadEndsMap.find(ck);
|
getEqualRE(pairArr[0], csm.pairs, &pairArr);
|
||||||
if (cpKeyItr != cp.ckeyReadEndsMap.end()) {
|
}
|
||||||
auto &cpPairArr = cpKeyItr->second;
|
// 在global里找一找ck
|
||||||
pairArr.insert(pairArr.end(), cpPairArr.begin(), cpPairArr.end());
|
auto gitr = g.ckeyReadEndsMap.find(ck);
|
||||||
cp.ckeyReadEndsMap.erase(cpKeyItr);
|
if (gitr != g.ckeyReadEndsMap.end()) {
|
||||||
}
|
auto &gPairArr = gitr->second;
|
||||||
if (ck.Read2Pos() <= curRightPos) { // 必须在当前数据块的范围内,否则放到global里
|
pairArr.insert(pairArr.end(), gPairArr.begin(), gPairArr.end());
|
||||||
sort(pairArr.begin(), pairArr.end());
|
g.ckeyReadEndsMap.erase(gitr);
|
||||||
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx,
|
}
|
||||||
&t.notRepIdx);
|
sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan);
|
||||||
|
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
|
||||||
itr = lp.ckeyReadEndsMap.erase(itr);
|
itr = lp.ckeyReadEndsMap.erase(itr);
|
||||||
} else {
|
} else {
|
||||||
++itr;
|
++itr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 处理找到匹配的global数据
|
// 处理找到匹配的global数据
|
||||||
for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) {
|
for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) {
|
||||||
auto &ckVal = *itr;
|
auto &ckVal = *itr;
|
||||||
auto &ck = ckVal.first;
|
auto &ck = ckVal.first;
|
||||||
if (ck.Read2Pos() < curLeftPos) { // 只有在这个范围内,对应位点的所有reads才完全都包含了
|
auto &pairArr = ckVal.second;
|
||||||
auto &pairArr = ckVal.second;
|
if (ck.Left() >= lastLeft) {
|
||||||
sort(pairArr.begin(), pairArr.end());
|
getEqualRE(pairArr[0], lsm.pairs, &pairArr);
|
||||||
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx);
|
}
|
||||||
|
if (ck.Right() <= lastRight) { // 只有在这个范围内,对应位点的所有reads才完全都包含了
|
||||||
|
sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan);
|
||||||
|
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
|
||||||
itr = g.ckeyReadEndsMap.erase(itr);
|
itr = g.ckeyReadEndsMap.erase(itr);
|
||||||
} else {
|
} else {
|
||||||
++itr;
|
++itr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 剩余的在lp中没处理的放到global里
|
||||||
for (auto &ckVal : lp.ckeyReadEndsMap) {
|
for (auto &ckVal : lp.ckeyReadEndsMap) {
|
||||||
g.ckeyReadEndsMap.insert(ckVal);
|
auto &pairArr = g.ckeyReadEndsMap[ckVal.first];
|
||||||
|
pairArr.insert(pairArr.end(), ckVal.second.begin(), ckVal.second.end());
|
||||||
|
|
||||||
}
|
}
|
||||||
lp.ckeyReadEndsMap.clear();
|
lp.ckeyReadEndsMap.clear();
|
||||||
// 更新一下冗余结果
|
// 更新一下冗余结果
|
||||||
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp);
|
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp);
|
||||||
// 处理g中新找到的匹配
|
// 处理g中新找到的匹配
|
||||||
|
|
||||||
putDupinfoToGlobal(g, lp);
|
putDupinfoToGlobal(g, lp);
|
||||||
|
|
||||||
|
for (auto &unPair : lsm.unpairedDic) {
|
||||||
|
g.unpairedDic.insert(unPair);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void *pipeRead(void *data) {
|
static void *pipeRead(void *data) {
|
||||||
|
|
@ -845,10 +643,6 @@ static void *pipeRead(void *data) {
|
||||||
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
inBamBuf.ClearAll(); // 清理上一轮读入的数据
|
||||||
pipeArg.readOrder += 1; // for next
|
pipeArg.readOrder += 1; // for next
|
||||||
yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据
|
yarn::TWIST(pipeArg.readSig, yarn::BY, 1); // 读入了一轮数据
|
||||||
|
|
||||||
if (pipeArg.readOrder >= 3719) {
|
|
||||||
cout << "zzh read size: " << pipeArg.readData.bams.size() << endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
spdlog::info("total reads processed {}, last order {}", readNumSum, pipeArg.readOrder);
|
spdlog::info("total reads processed {}, last order {}", readNumSum, pipeArg.readOrder);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -976,7 +770,7 @@ static void *pipeMarkDup(void *data) {
|
||||||
cout << "zzh pipeMarkDup: " << pipeArg.sortOrder << '\t' << pipeArg.markDupOrder << endl;
|
cout << "zzh pipeMarkDup: " << pipeArg.sortOrder << '\t' << pipeArg.markDupOrder << endl;
|
||||||
yarn::POSSESS(pipeArg.markDupSig);
|
yarn::POSSESS(pipeArg.markDupSig);
|
||||||
pipeArg.markDupFinish = 1;
|
pipeArg.markDupFinish = 1;
|
||||||
yarn::TWIST(pipeArg.markDupSig, yarn::TO, 3 + pipeArg.MARKBUFNUM);
|
yarn::TWIST(pipeArg.markDupSig, yarn::TO, 2 + pipeArg.MARKBUFNUM);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* 冗余检测 readends */
|
/* 冗余检测 readends */
|
||||||
|
|
@ -995,11 +789,12 @@ static void *pipeMarkDup(void *data) {
|
||||||
}
|
}
|
||||||
static void *pipeIntersect(void *data) {
|
static void *pipeIntersect(void *data) {
|
||||||
PipelineArg &pipeArg = *(PipelineArg *)data;
|
PipelineArg &pipeArg = *(PipelineArg *)data;
|
||||||
pipeArg.intersectOrder = 2;
|
const int kInitIntersectOrder = 1;
|
||||||
|
pipeArg.intersectOrder = kInitIntersectOrder;
|
||||||
while (1) {
|
while (1) {
|
||||||
PROF_START(intersect_wait);
|
PROF_START(intersect_wait);
|
||||||
yarn::POSSESS(pipeArg.markDupSig);
|
yarn::POSSESS(pipeArg.markDupSig);
|
||||||
yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, 2); // 等待有数据
|
yarn::WAIT_FOR(pipeArg.markDupSig, yarn::TO_BE_MORE_THAN, kInitIntersectOrder); // 等待有数据
|
||||||
yarn::RELEASE(pipeArg.markDupSig);
|
yarn::RELEASE(pipeArg.markDupSig);
|
||||||
PROF_END(gprof[GP_intersect_wait], intersect_wait);
|
PROF_END(gprof[GP_intersect_wait], intersect_wait);
|
||||||
|
|
||||||
|
|
@ -1031,49 +826,29 @@ static void *pipeIntersect(void *data) {
|
||||||
/* 当所有任务结束后,global data里还有未处理的数据 */
|
/* 当所有任务结束后,global data里还有未处理的数据 */
|
||||||
static void processLastData(PipelineArg &pipeArg) {
|
static void processLastData(PipelineArg &pipeArg) {
|
||||||
IntersectData &g = pipeArg.intersectData;
|
IntersectData &g = pipeArg.intersectData;
|
||||||
MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 2) % pipeArg.MARKBUFNUM];
|
MarkDupData &lp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
|
||||||
MarkDupData &cp = pipeArg.markDupData[(pipeArg.intersectOrder - 1) % pipeArg.MARKBUFNUM];
|
|
||||||
|
|
||||||
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
SortMarkData &lsm = *(SortMarkData *)lp.dataPtr;
|
||||||
SortMarkData &csm = *(SortMarkData *)cp.dataPtr;
|
int64_t lastLeft = INT64_MIN;
|
||||||
|
if (lsm.pairs.size() > 0) {
|
||||||
spdlog::info("last taskseq: lp {}, cp {}, {}, {}", lp.taskSeq, cp.taskSeq, lp.pairDupIdx.size(), cp.pairDupIdx.size());
|
lastLeft = lsm.frags[0].Left();
|
||||||
spdlog::info("last data size: g{}-lp{}-cp{}", g.unpairedDic.size(), lsm.unpairedDic.size(), csm.unpairedDic.size());
|
|
||||||
spdlog::info("last pair frags: {}, {}, {}, {}", lsm.frags.size(), lsm.pairs.size(), csm.frags.size(), csm.pairs.size());
|
|
||||||
|
|
||||||
// 把lp中未匹配的放到global里保存
|
|
||||||
findUnpairedInGlobal(g, lp);
|
|
||||||
findUnpairedInGlobal(g, cp);
|
|
||||||
|
|
||||||
spdlog::info("last data size after: g{}-lp{}-cp{}", g.unpairedDic.size(), lsm.unpairedDic.size(), csm.unpairedDic.size());
|
|
||||||
|
|
||||||
// 处理lp中的新找到的匹配
|
|
||||||
TaskSeqDupInfo t;
|
|
||||||
for (auto &ckVal : lp.ckeyReadEndsMap) {
|
|
||||||
auto &ck = ckVal.first;
|
|
||||||
auto &pairArr = ckVal.second;
|
|
||||||
getEqualRE(pairArr[0], lsm.pairs, &pairArr);
|
|
||||||
getEqualRE(pairArr[0], csm.pairs, &pairArr);
|
|
||||||
sort(pairArr.begin(), pairArr.end());
|
|
||||||
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
|
|
||||||
}
|
}
|
||||||
// 处理找到匹配的global数据
|
// 处理找到匹配的global数据
|
||||||
|
TaskSeqDupInfo t;
|
||||||
for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) {
|
for (auto itr = g.ckeyReadEndsMap.begin(); itr != g.ckeyReadEndsMap.end();) {
|
||||||
auto &ckVal = *itr;
|
auto &ckVal = *itr;
|
||||||
auto &ck = ckVal.first;
|
auto &ck = ckVal.first;
|
||||||
auto &pairArr = ckVal.second;
|
auto &pairArr = ckVal.second;
|
||||||
sort(pairArr.begin(), pairArr.end());
|
if (ck.Left() >= lastLeft) {
|
||||||
|
getEqualRE(pairArr[0], lsm.pairs, &pairArr);
|
||||||
|
}
|
||||||
|
sort(pairArr.begin(), pairArr.end(), ReadEnds::PairLittleThan );
|
||||||
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx);
|
processPairs(pairArr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx);
|
||||||
itr = g.ckeyReadEndsMap.erase(itr);
|
itr = g.ckeyReadEndsMap.erase(itr);
|
||||||
}
|
}
|
||||||
// 更新一下冗余结果
|
// 更新一下冗余结果
|
||||||
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp, cp);
|
refreshMarkDupData(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, lp);
|
||||||
// 处理g中新找到的匹配
|
// 处理g中新找到的匹配
|
||||||
|
|
||||||
putDupinfoToGlobal(g, lp);
|
putDupinfoToGlobal(g, lp);
|
||||||
putDupinfoToGlobal(g, cp);
|
|
||||||
|
|
||||||
spdlog::info("last data size: g{}-lp{}-cp{}", g.unpairedDic.size(), lsm.unpairedDic.size(), csm.unpairedDic.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void parallelPipeline() {
|
static void parallelPipeline() {
|
||||||
|
|
|
||||||
|
|
@ -113,6 +113,7 @@ struct DupResult {
|
||||||
struct IntersectData {
|
struct IntersectData {
|
||||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||||
CkeyReadEndsMap ckeyReadEndsMap;
|
CkeyReadEndsMap ckeyReadEndsMap;
|
||||||
|
int64_t rightPos = 0;
|
||||||
|
|
||||||
// 每个task对应一个vector
|
// 每个task对应一个vector
|
||||||
vector<vector<DupInfo>> &dupIdxArr;
|
vector<vector<DupInfo>> &dupIdxArr;
|
||||||
|
|
@ -141,7 +142,7 @@ struct IntersectData {
|
||||||
struct PipelineArg {
|
struct PipelineArg {
|
||||||
static const int GENBUFNUM = 2;
|
static const int GENBUFNUM = 2;
|
||||||
static const int SORTBUFNUM = 2;
|
static const int SORTBUFNUM = 2;
|
||||||
static const int MARKBUFNUM = 4;
|
static const int MARKBUFNUM = 3;
|
||||||
uint64_t readOrder = 0;
|
uint64_t readOrder = 0;
|
||||||
uint64_t genREOrder = 0;
|
uint64_t genREOrder = 0;
|
||||||
uint64_t sortOrder = 0;
|
uint64_t sortOrder = 0;
|
||||||
|
|
@ -212,14 +213,20 @@ struct REArrIdIdx {
|
||||||
const ReadEnds *pE = nullptr;
|
const ReadEnds *pE = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct REGreaterThan {
|
struct REFragGreaterThan {
|
||||||
bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return *b.pE < *a.pE; }
|
bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return ReadEnds::FragLittleThan(*b.pE, *a.pE); }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct REPairGreaterThan {
|
||||||
|
bool operator()(const REArrIdIdx &a, const REArrIdIdx &b) { return ReadEnds::PairLittleThan(*b.pE, *a.pE);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename CMP>
|
||||||
struct ReadEndsHeap {
|
struct ReadEndsHeap {
|
||||||
// 将冗余索引和他对应的task vector对应起来
|
// 将冗余索引和他对应的task vector对应起来
|
||||||
vector<vector<ReadEnds>> *arr2d;
|
vector<vector<ReadEnds>> *arr2d;
|
||||||
priority_queue<REArrIdIdx, vector<REArrIdIdx>, REGreaterThan> minHeap;
|
priority_queue<REArrIdIdx, vector<REArrIdIdx>, CMP> minHeap;
|
||||||
uint64_t popNum = 0;
|
uint64_t popNum = 0;
|
||||||
|
|
||||||
int Init(vector<vector<ReadEnds>> *_arr2d) {
|
int Init(vector<vector<ReadEnds>> *_arr2d) {
|
||||||
|
|
|
||||||
|
|
@ -32,38 +32,70 @@ struct CalcKey {
|
||||||
int32_t read1Coordinate = -1;
|
int32_t read1Coordinate = -1;
|
||||||
int32_t read2ReferenceIndex = -1;
|
int32_t read2ReferenceIndex = -1;
|
||||||
int32_t read2Coordinate = -1;
|
int32_t read2Coordinate = -1;
|
||||||
|
int64_t left = -1;
|
||||||
|
int64_t right = -1;
|
||||||
|
|
||||||
CalcKey() {}
|
CalcKey() {}
|
||||||
|
static CalcKey MaxCK() {
|
||||||
|
CalcKey ck;
|
||||||
|
ck.left = ck.right = INT64_MAX;
|
||||||
|
return ck;
|
||||||
|
}
|
||||||
|
static CalcKey MinCK() {
|
||||||
|
CalcKey ck;
|
||||||
|
ck.left = ck.right = INT64_MIN;
|
||||||
|
return ck;
|
||||||
|
}
|
||||||
|
|
||||||
CalcKey(const ReadEnds &re) {
|
CalcKey(const ReadEnds &re) {
|
||||||
orientation = re.orientation;
|
orientation = re.orientation;
|
||||||
read1ReferenceIndex = re.read1ReferenceIndex;
|
read1ReferenceIndex = re.read1ReferenceIndex;
|
||||||
read1Coordinate = re.read1Coordinate;
|
read1Coordinate = re.read1Coordinate;
|
||||||
read2ReferenceIndex = re.read2ReferenceIndex;
|
read2ReferenceIndex = re.read2ReferenceIndex;
|
||||||
read2Coordinate = re.read2Coordinate;
|
read2Coordinate = re.read2Coordinate;
|
||||||
|
left = Read1Pos();
|
||||||
|
right = Read2Pos();
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t Read1Pos() const { return BamWrap::bam_global_pos(read1ReferenceIndex, read1Coordinate); }
|
int64_t Read1Pos() const { return BamWrap::bam_global_pos(read1ReferenceIndex, read1Coordinate); }
|
||||||
|
|
||||||
int64_t Read2Pos() const { return BamWrap::bam_global_pos(read2ReferenceIndex, read2Coordinate); }
|
int64_t Read2Pos() const { return BamWrap::bam_global_pos(read2ReferenceIndex, read2Coordinate); }
|
||||||
|
int64_t Left() const { return left; }
|
||||||
|
int64_t Right() const { return right; }
|
||||||
|
|
||||||
bool operator<(const CalcKey &o) const {
|
bool operator<(const CalcKey &o) const {
|
||||||
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read1Coordinate - o.read1Coordinate;
|
comp = read1Coordinate - o.read1Coordinate;
|
||||||
// 需要orientation,因为要跟排序的比较方式和顺序一致
|
|
||||||
if (comp == 0)
|
|
||||||
comp = orientation - o.orientation;
|
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read2ReferenceIndex - o.read2ReferenceIndex;
|
comp = read2ReferenceIndex - o.read2ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read2Coordinate - o.read2Coordinate;
|
comp = read2Coordinate - o.read2Coordinate;
|
||||||
|
// 需要orientation,因为要跟排序的比较方式和顺序一致
|
||||||
|
if (comp == 0)
|
||||||
|
comp = orientation - o.orientation;
|
||||||
return comp < 0;
|
return comp < 0;
|
||||||
}
|
}
|
||||||
|
bool operator <= (const CalcKey &o) const {
|
||||||
|
return *this < o || *this == o;
|
||||||
|
}
|
||||||
bool operator==(const CalcKey &o) const {
|
bool operator==(const CalcKey &o) const {
|
||||||
return read1ReferenceIndex == o.read1ReferenceIndex && read1Coordinate == o.read1Coordinate &&
|
return read1ReferenceIndex == o.read1ReferenceIndex && read1Coordinate == o.read1Coordinate &&
|
||||||
orientation == o.orientation && read2ReferenceIndex == o.read2ReferenceIndex &&
|
orientation == o.orientation && read2ReferenceIndex == o.read2ReferenceIndex &&
|
||||||
read2Coordinate == o.read2Coordinate;
|
read2Coordinate == o.read2Coordinate;
|
||||||
}
|
}
|
||||||
|
bool operator<(const ReadEnds &o) const {
|
||||||
|
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = read1Coordinate - o.read1Coordinate;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = read2ReferenceIndex - o.read2ReferenceIndex;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = read2Coordinate - o.read2Coordinate;
|
||||||
|
// 需要orientation,因为要跟排序的比较方式和顺序一致
|
||||||
|
if (comp == 0)
|
||||||
|
comp = orientation - o.orientation;
|
||||||
|
return comp < 0;
|
||||||
|
}
|
||||||
std::size_t operator()() const {
|
std::size_t operator()() const {
|
||||||
size_t h1 = read1ReferenceIndex;
|
size_t h1 = read1ReferenceIndex;
|
||||||
h1 = (h1 << 40) | (read1Coordinate << 8) | orientation;
|
h1 = (h1 << 40) | (read1Coordinate << 8) | orientation;
|
||||||
|
|
@ -247,8 +279,7 @@ struct DupIdxQueue {
|
||||||
DupInfo nextDup = dupIdx;
|
DupInfo nextDup = dupIdx;
|
||||||
auto topIdx = minHeap.top();
|
auto topIdx = minHeap.top();
|
||||||
|
|
||||||
ofstream ofs("na12878.txt");
|
// ofstream ofs("n.dup"); ofstream ofs1("n-all.dup");
|
||||||
ofstream ofs1("na12878-all.txt");
|
|
||||||
|
|
||||||
while (dupIdx != -1) {
|
while (dupIdx != -1) {
|
||||||
++len;
|
++len;
|
||||||
|
|
@ -264,12 +295,14 @@ struct DupIdxQueue {
|
||||||
<< endl;
|
<< endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ofs1 << topIdx.arrId << '\t' << topIdx.arrIdx << '\t' << topIdx.dupIdx << endl;
|
|
||||||
ofs << topIdx.dupIdx << endl;
|
// ofs << topIdx.dupIdx << endl; ofs1 << topIdx.arrId << '\t' << topIdx.arrIdx << '\t' << topIdx.dupIdx << endl;
|
||||||
|
|
||||||
dupIdx = nextDup;
|
dupIdx = nextDup;
|
||||||
preTop = topIdx;
|
preTop = topIdx;
|
||||||
}
|
}
|
||||||
ofs.close();
|
// ofs.close(); ofs1.close();
|
||||||
|
cout << "RealSize: " << len << endl;
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -11,8 +11,11 @@ Date : 2023/11/3
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <algorithm>
|
#include <util/bam_wrap.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
using namespace std;
|
||||||
/**
|
/**
|
||||||
* Small interface that provides access to the physical location information
|
* Small interface that provides access to the physical location information
|
||||||
* about a cluster. All values should be defaulted to -1 if unavailable.
|
* about a cluster. All values should be defaulted to -1 if unavailable.
|
||||||
|
|
@ -123,55 +126,85 @@ struct ReadEnds : PhysicalLocation {
|
||||||
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read1Coordinate - b.read1Coordinate;
|
comp = a.read1Coordinate - b.read1Coordinate;
|
||||||
if (comp == 0)
|
|
||||||
comp = a.orientation - b.orientation;
|
|
||||||
if (compareRead2) {
|
if (compareRead2) {
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read2Coordinate - b.read2Coordinate;
|
comp = a.read2Coordinate - b.read2Coordinate;
|
||||||
}
|
}
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.orientation - b.orientation;
|
||||||
|
|
||||||
return comp < 0;
|
return comp < 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 找某一个位置的所有readend时需要, 只对比位点,不对比orientation
|
static bool FragLittleThan(const ReadEnds &a, const ReadEnds &b) {
|
||||||
static bool PairsLittleThan(const ReadEnds &a, const ReadEnds &b) {
|
|
||||||
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read1Coordinate - b.read1Coordinate;
|
comp = a.read1Coordinate - b.read1Coordinate;
|
||||||
// 需要orientation,因为要跟排序的比较方式和顺序一致
|
if (comp == 0) // 这个放在坐标比较之前,因为在解析bam的时候,可能有的给read2ReferenceIndex赋值了,有的则没赋值
|
||||||
if (comp == 0)
|
|
||||||
comp = a.orientation - b.orientation;
|
comp = a.orientation - b.orientation;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = a.read2Coordinate - b.read2Coordinate;
|
comp = a.read2Coordinate - b.read2Coordinate;
|
||||||
|
// if (comp == 0)
|
||||||
|
// comp = a.tile - b.tile;
|
||||||
|
// if (comp == 0)
|
||||||
|
// comp = a.x - b.x; // 由于picard的bug,用short类型来表示x,y,导致其可能为负数
|
||||||
|
// if (comp == 0)
|
||||||
|
// comp - a.y - b.y;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = (int)(a.read1IndexInFile - b.read1IndexInFile);
|
||||||
|
if (comp == 0)
|
||||||
|
comp = (int)(a.read2IndexInFile - b.read2IndexInFile);
|
||||||
return comp < 0;
|
return comp < 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 比较函数
|
void Print() {
|
||||||
bool operator<(const ReadEnds &o) const {
|
std::cout << read1ReferenceIndex << '\t' << read1Coordinate << '\t' << read2ReferenceIndex << '\t' << read2Coordinate
|
||||||
int comp = read1ReferenceIndex - o.read1ReferenceIndex;
|
<< '\t' << orientation << '\t' << read1IndexInFile << '\t' << read2IndexInFile << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool PairLittleThan(const ReadEnds &a, const ReadEnds &b) {
|
||||||
|
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read1Coordinate - o.read1Coordinate;
|
comp = a.read1Coordinate - b.read1Coordinate;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = orientation - o.orientation;
|
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read2ReferenceIndex - o.read2ReferenceIndex;
|
comp = a.read2Coordinate - b.read2Coordinate;
|
||||||
|
if (comp == 0) // 这个放在坐标比较了之后,把坐标范围的放在之前,这样对分段数据块比较好处理
|
||||||
|
comp = a.orientation - b.orientation;
|
||||||
|
// if (comp == 0)
|
||||||
|
// comp = a.tile - b.tile;
|
||||||
|
// if (comp == 0)
|
||||||
|
// comp = a.x - b.x; // 由于picard的bug,用short类型来表示x,y,导致其可能为负数
|
||||||
|
// if (comp == 0)
|
||||||
|
// comp - a.y - b.y;
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = read2Coordinate - o.read2Coordinate;
|
comp = (int)(a.read1IndexInFile - b.read1IndexInFile);
|
||||||
//if (comp == 0)
|
|
||||||
// comp = tile - o.tile;
|
|
||||||
//if (comp == 0)
|
|
||||||
// comp = x - o.x;
|
|
||||||
//if (comp == 0)
|
|
||||||
// comp - y - o.y;
|
|
||||||
if (comp == 0)
|
if (comp == 0)
|
||||||
comp = (int)(read1IndexInFile - o.read1IndexInFile);
|
comp = (int)(a.read2IndexInFile - b.read2IndexInFile);
|
||||||
if (comp == 0)
|
|
||||||
comp = (int)(read2IndexInFile - o.read2IndexInFile);
|
|
||||||
return comp < 0;
|
return comp < 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool CorLittleThan(const ReadEnds &a, const ReadEnds &b) {
|
||||||
|
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.read1Coordinate - b.read1Coordinate;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.read2Coordinate - b.read2Coordinate;
|
||||||
|
if (comp == 0) // 这个放在坐标比较了之后,把坐标范围的放在之前,这样对分段数据块比较好处理
|
||||||
|
comp = a.orientation - b.orientation;
|
||||||
|
return comp < 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// for pairs only
|
||||||
|
int64_t Left() { return BamWrap::bam_global_pos(read1ReferenceIndex, read1Coordinate); }
|
||||||
|
int64_t Right() { return BamWrap::bam_global_pos(read2ReferenceIndex, read2Coordinate); }
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ReadEndsHash {
|
struct ReadEndsHash {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue