串行程序没有错误了,用vector代替set成功,时间减少了一半
This commit is contained in:
parent
2d114058a1
commit
38bc489004
|
|
@ -0,0 +1,15 @@
|
||||||
|
{
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Linux",
|
||||||
|
"includePath": [
|
||||||
|
"${workspaceFolder}/**"
|
||||||
|
],
|
||||||
|
"defines": [],
|
||||||
|
"cStandard": "c17",
|
||||||
|
"cppStandard": "gnu++17",
|
||||||
|
"intelliSenseMode": "linux-gcc-x64"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"version": 4
|
||||||
|
}
|
||||||
1
run.sh
1
run.sh
|
|
@ -12,3 +12,4 @@ time /home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
||||||
# --INPUT /mnt/d/data/100w.bam \
|
# --INPUT /mnt/d/data/100w.bam \
|
||||||
# --INPUT /mnt/d/data/zy_normal.bam \
|
# --INPUT /mnt/d/data/zy_normal.bam \
|
||||||
# zy_tumor
|
# zy_tumor
|
||||||
|
# tumor_region
|
||||||
|
|
@ -39,7 +39,7 @@ using std::cout;
|
||||||
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
|
#define BAM_BLOCK_SIZE 2 * 1024 * 1024
|
||||||
#define NO_SUCH_INDEX INT64_MAX
|
#define NO_SUCH_INDEX INT64_MAX
|
||||||
|
|
||||||
static Timer tm_arr[10]; // 用来测试性能
|
static Timer tm_arr[20]; // 用来测试性能
|
||||||
/* 全局本地变量 */
|
/* 全局本地变量 */
|
||||||
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
static vector<ReadNameParser> g_vRnParser; // 每个线程一个read name parser
|
||||||
static samFile *g_inBamFp; // 输入的bam文件
|
static samFile *g_inBamFp; // 输入的bam文件
|
||||||
|
|
@ -91,7 +91,8 @@ int MarkDuplicates(int argc, char *argv[])
|
||||||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||||
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||||
htsPoolRead.pool = hts_tpool_init(16);
|
htsPoolRead.pool = hts_tpool_init(16);
|
||||||
htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||||
|
htsPoolWrite.pool = hts_tpool_init(16);
|
||||||
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
if (!htsPoolRead.pool || !htsPoolWrite.pool)
|
||||||
{
|
{
|
||||||
Error("[%d] failed to set up thread pool", __LINE__);
|
Error("[%d] failed to set up thread pool", __LINE__);
|
||||||
|
|
@ -150,30 +151,34 @@ int MarkDuplicates(int argc, char *argv[])
|
||||||
// BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
|
// BamBufType inBuf(false); // inBuf(g_gArg.use_asyncio);
|
||||||
BamBufType inBuf(g_gArg.use_asyncio);
|
BamBufType inBuf(g_gArg.use_asyncio);
|
||||||
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
inBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||||
// while(inBuf.ReadStat() >= 0)
|
Timer tw;
|
||||||
// {
|
while (inBuf.ReadStat() >= 0)
|
||||||
// size_t readNum = inBuf.ReadBam();
|
{
|
||||||
// cout << "read: " << readNum << endl;
|
Timer tw1;
|
||||||
// for (size_t i = 0; i < inBuf.Size(); ++i)
|
size_t readNum = inBuf.ReadBam();
|
||||||
// {
|
cout << "read: " << readNum << endl;
|
||||||
// /* 判断是否冗余 */
|
for (size_t i = 0; i < inBuf.Size(); ++i)
|
||||||
// if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
|
{
|
||||||
// {
|
/* 判断是否冗余 */
|
||||||
// Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0)
|
||||||
// sam_close(g_outBamFp);
|
{
|
||||||
// sam_close(g_inBamFp);
|
Error("failed writing header to \"%s\"", g_gArg.out_fn.c_str());
|
||||||
// return -1;
|
sam_close(g_outBamFp);
|
||||||
// }
|
sam_close(g_inBamFp);
|
||||||
// }
|
return -1;
|
||||||
// inBuf.ClearAll();
|
}
|
||||||
// }
|
}
|
||||||
// if (sam_idx_save(g_outBamFp) < 0)
|
inBuf.ClearAll();
|
||||||
// {
|
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
|
||||||
// Error("writing index failed");
|
}
|
||||||
// sam_close(g_outBamFp);
|
if (sam_idx_save(g_outBamFp) < 0)
|
||||||
// sam_close(g_inBamFp);
|
{
|
||||||
// return -1;
|
Error("writing index failed");
|
||||||
// }
|
sam_close(g_outBamFp);
|
||||||
|
sam_close(g_inBamFp);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
cout << "write time: " << tw.seconds_elapsed() << " s" << endl;
|
||||||
|
|
||||||
/* 关闭文件,收尾清理 */
|
/* 关闭文件,收尾清理 */
|
||||||
sam_close(g_outBamFp);
|
sam_close(g_outBamFp);
|
||||||
|
|
|
||||||
|
|
@ -8,12 +8,21 @@ struct UnpairedREInfo
|
||||||
ReadEnds unpairedRE;
|
ReadEnds unpairedRE;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GlobalUnpairedInfo
|
/* 对于一个pair数据,一个完整的计算点,包含read1的比对位置和read2的比对位置 */
|
||||||
|
struct CalcKey
|
||||||
{
|
{
|
||||||
int64_t taskSeq;
|
int64_t read1Pos;
|
||||||
vector<ReadEnds> reArr;
|
int64_t read2Pos;
|
||||||
|
bool operator<(const CalcKey &o) const
|
||||||
|
{
|
||||||
|
int comp = (int)(read1Pos - o.read1Pos);
|
||||||
|
if (comp == 0)
|
||||||
|
comp = (int)(read2Pos - o.read2Pos);
|
||||||
|
return comp < 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* 当遗留数据在当前任务找到了pair read后,进行冗余计算时候存放结果的数据结构 */
|
||||||
struct TaskSeqDupInfo
|
struct TaskSeqDupInfo
|
||||||
{
|
{
|
||||||
set<int64_t> dupIdx;
|
set<int64_t> dupIdx;
|
||||||
|
|
@ -21,11 +30,19 @@ struct TaskSeqDupInfo
|
||||||
set<int64_t> notDupIdx;
|
set<int64_t> notDupIdx;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* 保存有未匹配pair位点的信息,包括read end数组和有几个未匹配的read end */
|
||||||
|
struct UnpairedPosInfo
|
||||||
|
{
|
||||||
|
int unpairedNum = 0;
|
||||||
|
int64_t taskSeq;
|
||||||
|
vector<ReadEnds> pairArr;
|
||||||
|
set<string> readNameSet;
|
||||||
|
};
|
||||||
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
// typedef unordered_map<string, UnpairedREInfo> UnpairedNameMap;
|
||||||
// typedef unordered_map<int64_t, vector<ReadEnds>> UnpairedPositionMap;
|
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
|
||||||
|
|
||||||
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap;
|
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引,保存未匹配的pair read
|
||||||
typedef tsl::robin_map<int64_t, vector<ReadEnds>> UnpairedPositionMap;
|
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引,保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
|
||||||
|
|
||||||
/* 单线程处理冗余参数结构体 */
|
/* 单线程处理冗余参数结构体 */
|
||||||
struct SerailMarkDupArg
|
struct SerailMarkDupArg
|
||||||
|
|
@ -45,29 +62,29 @@ struct SerailMarkDupArg
|
||||||
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
/* 全局保留的数据,因为有些paired数据比对到了不同的染色体,相距甚远 */
|
||||||
struct GlobalDataArg
|
struct GlobalDataArg
|
||||||
{
|
{
|
||||||
set<int64_t> pairDupIdx; // pair的冗余read的索引
|
|
||||||
set<int64_t> pairOpticalDupIdx; // optical冗余read的索引
|
|
||||||
set<int64_t> notDupIdx; // 不是冗余
|
|
||||||
//unordered_map<string, UnpairedREInfo> unpairedDic; // 用来寻找pair end
|
|
||||||
//unordered_map<int64_t, vector<ReadEnds>> unpairedPosArr;
|
|
||||||
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
UnpairedNameMap unpairedDic; // 用来寻找pair end
|
||||||
UnpairedPositionMap unpairedPosArr;
|
UnpairedPositionMap unpairedPosArr;
|
||||||
|
|
||||||
// 每个task对应一个vector
|
// 每个task对应一个vector
|
||||||
vector<vector<int64_t>> dupIdxArr;
|
vector<vector<int64_t>> dupIdxArr;
|
||||||
vector<vector<int64_t>> opticalDupIdxArr;
|
vector<vector<int64_t>> opticalDupIdxArr;
|
||||||
|
|
||||||
|
// 用来存放后续计算的数据
|
||||||
|
vector<set<int64_t>> latterDupIdxArr;
|
||||||
|
vector<set<int64_t>> latterOpticalDupIdxArr;
|
||||||
|
vector<set<int64_t>> latterNotDupIdxArr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static GlobalDataArg gData;
|
static GlobalDataArg gData;
|
||||||
|
|
||||||
|
|
||||||
/* 查找 */
|
/* 查找 */
|
||||||
template<class Itr, class T>
|
// template<class Itr, class T>
|
||||||
static inline Itr binaryFind(Itr first, Itr last, const T &val)
|
// static inline Itr binaryFind(Itr first, Itr last, const T &val)
|
||||||
{
|
// {
|
||||||
first = std::lower_bound(first, last, val);
|
// first = std::lower_bound(first, last, val);
|
||||||
return (first != last && *first == val) ? first : last;
|
// return (first != last && *first == val) ? first : last;
|
||||||
}
|
// }
|
||||||
|
|
||||||
/* 排序 */
|
/* 排序 */
|
||||||
static inline void sortReadEndsArr(vector<ReadEnds> &arr)
|
static inline void sortReadEndsArr(vector<ReadEnds> &arr)
|
||||||
|
|
@ -101,9 +118,6 @@ static inline void sortReadEndsArr(vector<ReadEnds> &arr)
|
||||||
if (right == end - start - 1)
|
if (right == end - start - 1)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// cout << "sort: " << left << ' ' << right << ' '
|
|
||||||
// << arr[start - left].posKey << ' ' << arr[start - 1].posKey << ' '
|
|
||||||
// << arr[start].posKey << ' ' << arr[start + right].posKey << endl;
|
|
||||||
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
std::sort(arr.begin() + start - left, arr.begin() + start + right);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -203,7 +217,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe,
|
||||||
/* 找到与readend pos相等的所有readend */
|
/* 找到与readend pos相等的所有readend */
|
||||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
|
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst)
|
||||||
{
|
{
|
||||||
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsCmp);
|
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::pairsLittleThan); // 只比对位点
|
||||||
dst->insert(dst->end(), range.first, range.second);
|
dst->insert(dst->end(), range.first, range.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -251,40 +265,29 @@ static void generateReadEnds(SerailMarkDupArg *arg)
|
||||||
{
|
{
|
||||||
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
||||||
modifyPairedEnds(fragEnd, &pairedEnds);
|
modifyPairedEnds(fragEnd, &pairedEnds);
|
||||||
// if (pairedEnds.read1IndexInFile == 94 || pairedEnds.read1IndexInFile == 95)
|
|
||||||
// {
|
|
||||||
// cout << "pair score: " << pairedEnds.read1IndexInFile << ' ' << pairedEnds.score << endl;
|
|
||||||
// }
|
|
||||||
// if (pairedEnds.read1IndexInFile == 94)
|
|
||||||
// {
|
|
||||||
// lastRe = pairedEnds;
|
|
||||||
// }
|
|
||||||
// if (pairedEnds.read1IndexInFile == 95)
|
|
||||||
// {
|
|
||||||
// cout << "compare: " << (lastRe < pairedEnds) << ' ' << (pairedEnds < lastRe) << endl;
|
|
||||||
//
|
|
||||||
// }
|
|
||||||
p.pairs.push_back(pairedEnds);
|
p.pairs.push_back(pairedEnds);
|
||||||
p.unpairedDic.erase(key); // 删除找到的pairend
|
p.unpairedDic.erase(key); // 删除找到的pairend
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// cout << "sort frags" << endl;
|
tm_arr[9].acc_start();
|
||||||
sortReadEndsArr(p.frags);
|
sortReadEndsArr(p.frags);
|
||||||
// sort(p.frags.begin(), p.frags.end());
|
// sort(p.frags.begin(), p.frags.end());
|
||||||
|
tm_arr[9].acc_end();
|
||||||
// cout << "sort pairs" << endl;
|
// cout << "sort pairs" << endl;
|
||||||
// sortReadEndsArr(p.pairs);
|
tm_arr[10].acc_start();
|
||||||
sort(p.pairs.begin(), p.pairs.end());
|
sort(p.pairs.begin(), p.pairs.end());
|
||||||
// cout << "unpaired num: " << p.unpairedDic.size() << endl;
|
tm_arr[10].acc_end();
|
||||||
|
// 记录位点上的未匹配的read个数
|
||||||
// 把未匹配的pair对应的每个位点的pairs记录下来
|
|
||||||
for (auto &e : p.unpairedDic) {
|
for (auto &e : p.unpairedDic) {
|
||||||
auto &unpair = e.second;
|
auto posKey = e.second.unpairedRE.posKey;
|
||||||
auto posKey = unpair.unpairedRE.posKey;
|
auto &unpairArrInfo = p.unpairedPosArr[posKey];
|
||||||
if (p.unpairedPosArr.find(posKey) == p.unpairedPosArr.end())
|
unpairArrInfo.unpairedNum++;
|
||||||
getEqualRE(unpair.unpairedRE, p.pairs, &p.unpairedPosArr[posKey]);
|
unpairArrInfo.taskSeq = p.taskSeq;
|
||||||
|
unpairArrInfo.readNameSet.insert(e.first);
|
||||||
}
|
}
|
||||||
|
cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 处理pairs */
|
/* 处理pairs */
|
||||||
|
|
@ -362,14 +365,17 @@ static void markdups(SerailMarkDupArg *arg)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 获取交叉部分的数据 */
|
/* 获取交叉部分的数据 */
|
||||||
static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds> &rightArr, vector<ReadEnds> *dst)
|
static inline void getIntersectData(vector<ReadEnds> &leftArr,
|
||||||
|
vector<ReadEnds> &rightArr,
|
||||||
|
vector<ReadEnds> *dst,
|
||||||
|
bool isPairCmp = false)
|
||||||
{
|
{
|
||||||
const size_t leftEndIdx = leftArr.size() - 1;
|
const size_t leftEndIdx = leftArr.size() - 1;
|
||||||
const size_t rightStartIdx = 0;
|
const size_t rightStartIdx = 0;
|
||||||
size_t leftSpan = 0;
|
size_t leftSpan = 0;
|
||||||
size_t rightSpan = 0;
|
size_t rightSpan = 0;
|
||||||
|
|
||||||
while (!(leftArr[leftEndIdx - leftSpan] < rightArr[rightStartIdx]))
|
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp))
|
||||||
{
|
{
|
||||||
leftSpan += 1;
|
leftSpan += 1;
|
||||||
if (leftSpan > leftEndIdx)
|
if (leftSpan > leftEndIdx)
|
||||||
|
|
@ -379,7 +385,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while (!(leftArr[leftEndIdx] < rightArr[rightSpan]))
|
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp))
|
||||||
{
|
{
|
||||||
rightSpan += 1;
|
rightSpan += 1;
|
||||||
if (rightSpan == rightArr.size() - 1)
|
if (rightSpan == rightArr.size() - 1)
|
||||||
|
|
@ -390,7 +396,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
|
||||||
std::sort(dst->begin(), dst->end());
|
std::sort(dst->begin(), dst->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 将重叠部分的dup idx放进数据中 */
|
/* 将frags重叠部分的dup idx放进数据中 */
|
||||||
static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
|
static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
|
||||||
set<int64_t> ¬DupIdx,
|
set<int64_t> ¬DupIdx,
|
||||||
SerailMarkDupArg * lastArg,
|
SerailMarkDupArg * lastArg,
|
||||||
|
|
@ -410,6 +416,7 @@ static inline void refreshFragDupIdx(set<int64_t> &dupIdx,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* 将pairs重叠部分的dup idx放进数据中 */
|
||||||
static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
|
static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
|
||||||
set<int64_t> &opticalDupIdx,
|
set<int64_t> &opticalDupIdx,
|
||||||
set<int64_t> ¬DupIdx,
|
set<int64_t> ¬DupIdx,
|
||||||
|
|
@ -437,104 +444,72 @@ static inline void refreshPairDupIdx(set<int64_t> &dupIdx,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 处理未匹配的部分 */
|
// 用来分别处理dup和optical dup
|
||||||
static inline void processUnpairedPosForCalc(UnpairedNameMap &lpUnpairedDic,
|
static void refeshTaskDupInfo(set<int64_t> &dupIdx,
|
||||||
UnpairedPositionMap &lpUnpairedPosArr,
|
set<int64_t> &opticalDupIdx,
|
||||||
UnpairedNameMap &pUnpairedDic,
|
set<int64_t> ¬DupIdx,
|
||||||
UnpairedPositionMap &pUnpairedPosArr,
|
set<int64_t> &latterDupIdx,
|
||||||
vector<ReadEnds> &pairs,
|
set<int64_t> &latterOpticalDupIdx,
|
||||||
map<int64_t, int64_t> &recalcPos,
|
set<int64_t> &latterNotDupIdx)
|
||||||
bool addToLast = false)
|
|
||||||
{
|
{
|
||||||
recalcPos.clear();
|
for (auto idx : dupIdx)
|
||||||
for (auto itr = pUnpairedDic.begin(); itr != pUnpairedDic.end();)
|
latterDupIdx.insert(idx);
|
||||||
{
|
for (auto idx : opticalDupIdx)
|
||||||
auto &readName = itr->first;
|
latterOpticalDupIdx.insert(idx);
|
||||||
|
for (auto idx : notDupIdx)
|
||||||
if (lpUnpairedDic.find(readName) != lpUnpairedDic.end())
|
latterNotDupIdx.insert(idx);
|
||||||
{
|
|
||||||
auto &posInfo = lpUnpairedDic[readName];
|
|
||||||
auto posKey = posInfo.unpairedRE.posKey;
|
|
||||||
auto &posReArr = lpUnpairedPosArr[posKey];
|
|
||||||
|
|
||||||
modifyPairedEnds(itr->second.unpairedRE, &posInfo.unpairedRE);
|
|
||||||
posKey = posInfo.unpairedRE.posKey;
|
|
||||||
if (recalcPos.find(posKey) == recalcPos.end()) // 如果之前没有这个位点
|
|
||||||
getEqualRE(posInfo.unpairedRE, pairs, &posReArr);
|
|
||||||
recalcPos[posKey] = posInfo.taskSeq;
|
|
||||||
posReArr.push_back(posInfo.unpairedRE);
|
|
||||||
std::sort(posReArr.begin(), posReArr.end());
|
|
||||||
|
|
||||||
lpUnpairedDic.erase(readName);
|
|
||||||
itr = pUnpairedDic.erase(itr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (addToLast) // 将数据添加进遗留数据中
|
|
||||||
{
|
|
||||||
auto posKey = itr->second.unpairedRE.posKey;
|
|
||||||
lpUnpairedDic[readName] = itr->second;
|
|
||||||
lpUnpairedPosArr[posKey] = pUnpairedPosArr[posKey];
|
|
||||||
}
|
|
||||||
++itr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 用来分别处理dup和optical dup
|
/* 最后合并数据并排序 */
|
||||||
static void refeshTaskDupInfo(vector<int64_t> &addDup,
|
static void refeshFinalTaskDupInfo(set<int64_t> &dupIdx,
|
||||||
map<int64_t, int64_t> &ndPosVal,
|
|
||||||
set<int64_t> &dupIdx,
|
|
||||||
set<int64_t> ¬DupIdx,
|
set<int64_t> ¬DupIdx,
|
||||||
vector<int64_t> &dupArr)
|
vector<int64_t> &dupArr)
|
||||||
{
|
{
|
||||||
addDup.clear();
|
vector<int64_t> midArr;
|
||||||
ndPosVal.clear();
|
|
||||||
// 去除之前有的,重复的
|
auto ai = dupArr.begin();
|
||||||
for (auto i = dupIdx.begin(); i != dupIdx.end();)
|
auto bi = dupIdx.begin();
|
||||||
|
auto ae = dupArr.end();
|
||||||
|
auto be = dupIdx.end();
|
||||||
|
|
||||||
|
int64_t val = 0;
|
||||||
|
while (ai != ae && bi != be)
|
||||||
{
|
{
|
||||||
auto itr = binaryFind(dupArr.begin(), dupArr.end(), *i);
|
if (*ai < *bi)
|
||||||
if (itr != dupArr.end())
|
|
||||||
{
|
{
|
||||||
i = dupIdx.erase(i);
|
val = *ai++;
|
||||||
|
}
|
||||||
|
else if (*bi < *ai)
|
||||||
|
{
|
||||||
|
val = *bi++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++i;
|
val = *ai++;
|
||||||
|
bi++;
|
||||||
}
|
}
|
||||||
}
|
if (notDupIdx.find(val) == notDupIdx.end())
|
||||||
// 添加现有的
|
|
||||||
auto di = dupIdx.begin();
|
|
||||||
for (auto nidx : notDupIdx)
|
|
||||||
{
|
{
|
||||||
auto itr = binaryFind(dupArr.begin(), dupArr.end(), nidx);
|
midArr.push_back(val);
|
||||||
if (itr != dupArr.end())
|
}
|
||||||
|
}
|
||||||
|
while (ai != ae)
|
||||||
{
|
{
|
||||||
ndPosVal[itr - dupArr.begin()] = *di++;
|
val = *ai++;
|
||||||
}
|
if (notDupIdx.find(val) == notDupIdx.end())
|
||||||
}
|
|
||||||
|
|
||||||
while (di != dupIdx.end())
|
|
||||||
addDup.push_back(*di++);
|
|
||||||
|
|
||||||
for (auto pos : ndPosVal)
|
|
||||||
dupArr[pos.first] = pos.second;
|
|
||||||
dupArr.insert(dupArr.end(), addDup.begin(), addDup.end());
|
|
||||||
std::sort(dupArr.begin(), dupArr.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 将遗留的冗余信息添加进对应的任务数据中 */
|
|
||||||
static void addDupInfoToTask(map<int64_t, TaskSeqDupInfo> &seqTaskChanged, GlobalDataArg *gDataArg)
|
|
||||||
{
|
|
||||||
auto &g = *gDataArg;
|
|
||||||
// 更新遗留的结果
|
|
||||||
vector<int64_t> addDup;
|
|
||||||
map<int64_t, int64_t> ndPosVal;
|
|
||||||
for (auto &e : seqTaskChanged)
|
|
||||||
{
|
{
|
||||||
refeshTaskDupInfo(addDup, ndPosVal, e.second.dupIdx, e.second.notDupIdx, g.dupIdxArr[e.first]);
|
midArr.push_back(val);
|
||||||
refeshTaskDupInfo(addDup, ndPosVal, e.second.opticalDupIdx, e.second.notDupIdx, g.opticalDupIdxArr[e.first]);
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
while (bi != be)
|
||||||
|
{
|
||||||
|
val = *bi++;
|
||||||
|
if (notDupIdx.find(val) == notDupIdx.end())
|
||||||
|
{
|
||||||
|
midArr.push_back(val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dupArr = midArr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 处理相邻的两个任务,有相交叉的数据 */
|
/* 处理相邻的两个任务,有相交叉的数据 */
|
||||||
|
|
@ -557,51 +532,211 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
dupIdx.clear();
|
dupIdx.clear();
|
||||||
notDupIdx.clear();
|
notDupIdx.clear();
|
||||||
set<int64_t> opticalDupIdx;
|
set<int64_t> opticalDupIdx;
|
||||||
getIntersectData(lp.pairs, p.pairs, &reArr);
|
getIntersectData(lp.pairs, p.pairs, &reArr, true);
|
||||||
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
processPairs(reArr, &dupIdx, &opticalDupIdx, ¬DupIdx);
|
||||||
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
||||||
|
|
||||||
// 处理之前未匹配的部分
|
// 处理之前未匹配的部分
|
||||||
map<int64_t, int64_t> recalcPos;
|
map<CalcKey, int64_t> recalcPos;
|
||||||
processUnpairedPosForCalc(lp.unpairedDic,
|
set<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
|
||||||
lp.unpairedPosArr,
|
set<int64_t> addToGlobal;
|
||||||
p.unpairedDic,
|
int64_t prevLastPos = 0, nextFirstPos = 0;
|
||||||
p.unpairedPosArr,
|
if (lp.frags.size() > 0)
|
||||||
p.pairs,
|
prevLastPos = lp.frags.back().posKey;
|
||||||
recalcPos);
|
if (p.frags.size() > 0)
|
||||||
for (auto &e : recalcPos)
|
nextFirstPos = p.frags[0].posKey;
|
||||||
|
// cout << "range: " << nextFirstPos << '\t' << prevLastPos << endl;
|
||||||
|
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
|
||||||
{
|
{
|
||||||
auto posKey = e.first;
|
auto &readName = prevUnpair.first;
|
||||||
dupIdx.clear();
|
auto &prevPosInfo = prevUnpair.second;
|
||||||
notDupIdx.clear();
|
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||||
opticalDupIdx.clear();
|
|
||||||
processPairs(lp.unpairedPosArr[posKey], &dupIdx, &opticalDupIdx, ¬DupIdx);
|
|
||||||
refreshPairDupIdx(dupIdx, opticalDupIdx, notDupIdx, &lp, &p);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 遗留的未匹配的pair
|
if (p.unpairedDic.find(readName) != p.unpairedDic.end()) // 在当前这个任务里找到了这个未匹配的read
|
||||||
processUnpairedPosForCalc(g.unpairedDic,
|
{
|
||||||
g.unpairedPosArr,
|
auto &nextPosInfo = p.unpairedDic[readName];
|
||||||
lp.unpairedDic,
|
auto &nextFragEnd = nextPosInfo.unpairedRE;
|
||||||
lp.unpairedPosArr,
|
int64_t prevPosKey = prevFragEnd.posKey;
|
||||||
lp.pairs,
|
modifyPairedEnds(nextFragEnd, &prevFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||||
recalcPos,
|
int64_t nextPosKey = max(prevPosKey, nextFragEnd.posKey);
|
||||||
true);
|
CalcKey ck = {prevPosKey, nextPosKey};
|
||||||
map<int64_t, TaskSeqDupInfo> seqTaskChanged;
|
UnpairedPosInfo *prevUnpairInfoP = nullptr;
|
||||||
|
UnpairedPosInfo *nextUnpairInfoP = nullptr;
|
||||||
|
if (lp.unpairedPosArr.find(prevPosKey) != lp.unpairedPosArr.end())
|
||||||
|
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||||
|
if (p.unpairedPosArr.find(prevPosKey) != p.unpairedPosArr.end())
|
||||||
|
nextUnpairInfoP = &p.unpairedPosArr[prevPosKey];
|
||||||
|
|
||||||
|
// pos分为两种情况,根据poskey(pair中两个read分别的pos)的位置确定
|
||||||
|
// 1. prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
||||||
|
// 2. prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算
|
||||||
|
// 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||||
|
// 3. prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算
|
||||||
|
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
||||||
|
// 4. prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算
|
||||||
|
|
||||||
|
bool addDataToPos = true;
|
||||||
|
if (alreadyAdd.find(ck) != alreadyAdd.end())
|
||||||
|
{
|
||||||
|
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
||||||
|
}
|
||||||
|
else
|
||||||
|
alreadyAdd.insert(ck);
|
||||||
|
|
||||||
|
if (prevPosKey < nextFirstPos) // prevpos在交叉部分之前
|
||||||
|
{
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (nextPosKey <= prevLastPos && addDataToPos) // 第二种情况
|
||||||
|
{
|
||||||
|
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||||
|
}
|
||||||
|
// 第一种情况,第二种情况下都会出现,复杂情况一
|
||||||
|
auto gPosInfo = g.unpairedPosArr.find(prevPosKey);
|
||||||
|
if (gPosInfo != g.unpairedPosArr.end()) // 可能g和p有匹配的,刚好和该位点一致
|
||||||
|
{
|
||||||
|
auto &gUnpairInfo = gPosInfo->second;
|
||||||
|
auto pPosInfo = p.unpairedPosArr.find(nextPosKey);
|
||||||
|
if (pPosInfo != p.unpairedPosArr.end())
|
||||||
|
{
|
||||||
|
auto &pUnpairInfo = pPosInfo->second;
|
||||||
|
for (auto &rn : gUnpairInfo.readNameSet) // 遍历每一个readname,看是否有匹配的
|
||||||
|
{
|
||||||
|
if (pUnpairInfo.readNameSet.find(rn) != pUnpairInfo.readNameSet.end())
|
||||||
|
{
|
||||||
|
auto pe = g.unpairedDic[rn].unpairedRE;
|
||||||
|
auto fe = p.unpairedDic[rn].unpairedRE;
|
||||||
|
modifyPairedEnds(fe, &pe);
|
||||||
|
prevPairArr.push_back(pe);
|
||||||
|
g.unpairedDic.erase(rn);
|
||||||
|
p.unpairedDic.erase(rn);
|
||||||
|
// cout << "找到了!" << rn << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
}
|
||||||
|
else // prevpos在交叉部分
|
||||||
|
{
|
||||||
|
if (nextPosKey > prevLastPos) // nextpos在交叉部分之后
|
||||||
|
{ // 第三种情况
|
||||||
|
if (nextUnpairInfoP != nullptr) // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
||||||
|
{
|
||||||
|
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
||||||
|
nextPairArr.push_back(prevFragEnd);
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (addDataToPos)
|
||||||
|
{
|
||||||
|
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||||
|
}
|
||||||
|
recalcPos[ck] = nextPosInfo.taskSeq; // 将数据放到next task里, (这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
}
|
||||||
|
else // next task在该位点没有unpair,那就把数据放到prev task里
|
||||||
|
{
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (addDataToPos) // 第二种情况
|
||||||
|
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||||
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{ // 第四种情况
|
||||||
|
if (prevUnpairInfoP == nullptr) {
|
||||||
|
prevUnpairInfoP = &lp.unpairedPosArr[prevPosKey];
|
||||||
|
prevUnpairInfoP->taskSeq = lp.taskSeq;
|
||||||
|
}
|
||||||
|
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||||
|
prevPairArr.push_back(prevFragEnd);
|
||||||
|
if (addDataToPos)
|
||||||
|
{
|
||||||
|
getEqualRE(prevFragEnd, lp.pairs, &prevPairArr);
|
||||||
|
getEqualRE(prevFragEnd, p.pairs, &prevPairArr);
|
||||||
|
}
|
||||||
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p.unpairedDic.erase(readName); // 在next task里删除该read
|
||||||
|
}
|
||||||
|
else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
|
||||||
|
{
|
||||||
|
auto &remainPosInfo = g.unpairedDic[readName];
|
||||||
|
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||||
|
int64_t remainPosKey = remainFragEnd.posKey;
|
||||||
|
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||||
|
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||||
|
auto &remainPairArr = remainUnpairInfo.pairArr;
|
||||||
|
remainPairArr.push_back(remainFragEnd);
|
||||||
|
CalcKey ck = {remainPosKey, prevFragEnd.posKey};
|
||||||
|
recalcPos[ck] = remainPosInfo.taskSeq;
|
||||||
|
std::sort(remainPairArr.begin(), remainPairArr.end());
|
||||||
|
|
||||||
|
g.unpairedDic.erase(readName);
|
||||||
|
}
|
||||||
|
else // 都没找到,那就保存到遗留数据里
|
||||||
|
{
|
||||||
|
int64_t prevPosKey = prevFragEnd.posKey;
|
||||||
|
g.unpairedDic.insert(prevUnpair);
|
||||||
|
addToGlobal.insert(prevPosKey);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto posKey : addToGlobal) // 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||||
|
g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||||
|
|
||||||
|
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||||
|
set<int64_t> posProcessed;
|
||||||
for (auto &e : recalcPos)
|
for (auto &e : recalcPos)
|
||||||
{
|
{
|
||||||
auto posKey = e.first;
|
auto posKey = e.first.read1Pos;
|
||||||
auto seqNum = e.second;
|
if (posProcessed.find(posKey) != posProcessed.end())
|
||||||
auto &t = seqTaskChanged[seqNum];
|
continue;
|
||||||
|
posProcessed.insert(posKey);
|
||||||
|
auto taskSeq = e.second;
|
||||||
|
auto &t = taskChanged[taskSeq];
|
||||||
// 在对应的任务包含的dup idx里修改结果数据
|
// 在对应的任务包含的dup idx里修改结果数据
|
||||||
processPairs(g.unpairedPosArr[posKey], &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
vector<ReadEnds> *pairArrP = nullptr;
|
||||||
|
if (taskSeq < lp.taskSeq)
|
||||||
|
pairArrP = &g.unpairedPosArr[posKey].pairArr;
|
||||||
|
else
|
||||||
|
pairArrP = &lp.unpairedPosArr[posKey].pairArr;
|
||||||
|
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||||
|
if (taskSeq < lp.taskSeq)
|
||||||
g.unpairedPosArr.erase(posKey);
|
g.unpairedPosArr.erase(posKey);
|
||||||
}
|
}
|
||||||
addDupInfoToTask(seqTaskChanged, &g);
|
// 更新结果
|
||||||
|
|
||||||
cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
for (auto &e : taskChanged)
|
||||||
|
{
|
||||||
|
auto taskSeq = e.first;
|
||||||
|
auto &t = e.second;
|
||||||
|
if (taskSeq < lp.taskSeq)
|
||||||
|
{
|
||||||
|
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
|
||||||
|
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||||
|
}
|
||||||
|
else if (taskSeq == lp.taskSeq)
|
||||||
|
{
|
||||||
|
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &lp, &p);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.notDupIdx, &p, &lp); // 把结果放到p中
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cout << "remain unpaired: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||||
|
// cout << "calc g time: " << t.seconds_elapsed() << " s" << endl;
|
||||||
// 将dupidx放进全局数据
|
// 将dupidx放进全局数据
|
||||||
|
g.latterDupIdxArr.push_back(set<int64_t>());
|
||||||
|
g.latterOpticalDupIdxArr.push_back(set<int64_t>());
|
||||||
|
g.latterNotDupIdxArr.push_back(set<int64_t>());
|
||||||
|
|
||||||
g.dupIdxArr.push_back(vector<int64_t>());
|
g.dupIdxArr.push_back(vector<int64_t>());
|
||||||
auto &vIdx = g.dupIdxArr.back();
|
auto &vIdx = g.dupIdxArr.back();
|
||||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||||
|
|
@ -617,32 +752,60 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg)
|
||||||
{
|
{
|
||||||
auto &lp = *task;
|
auto &lp = *task;
|
||||||
auto &g = *gDataArg;
|
auto &g = *gDataArg;
|
||||||
|
|
||||||
map<int64_t, int64_t> recalcPos;
|
|
||||||
// 遗留的未匹配的pair
|
// 遗留的未匹配的pair
|
||||||
processUnpairedPosForCalc(g.unpairedDic,
|
for (auto &prevUnpair : lp.unpairedDic) // 遍历上一个任务中的每个未匹配的read
|
||||||
g.unpairedPosArr,
|
{
|
||||||
lp.unpairedDic,
|
auto &readName = prevUnpair.first;
|
||||||
lp.unpairedPosArr,
|
auto &prevPosInfo = prevUnpair.second;
|
||||||
lp.pairs,
|
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||||
recalcPos,
|
|
||||||
true);
|
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) // 在遗留数据中找到了匹配的read
|
||||||
map<int64_t, TaskSeqDupInfo> seqTaskChanged;
|
{
|
||||||
for (auto &e : recalcPos)
|
auto &remainPosInfo = g.unpairedDic[readName];
|
||||||
|
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||||
|
int64_t remainPosKey = remainFragEnd.posKey;
|
||||||
|
modifyPairedEnds(prevFragEnd, &remainFragEnd); // 在某些clip情况下,poskey可能是后面的read
|
||||||
|
auto &remainUnpairInfo = g.unpairedPosArr[remainPosKey];
|
||||||
|
|
||||||
|
remainUnpairInfo.pairArr.push_back(remainFragEnd);
|
||||||
|
g.unpairedDic.erase(readName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||||
|
for (auto &e : g.unpairedPosArr)
|
||||||
{
|
{
|
||||||
auto posKey = e.first;
|
auto posKey = e.first;
|
||||||
auto seqNum = e.second;
|
auto taskSeq = e.second.taskSeq;
|
||||||
auto &t = seqTaskChanged[seqNum];
|
auto &t = taskChanged[taskSeq];
|
||||||
// 在对应的任务包含的dup idx里修改结果数据
|
auto &arr = g.unpairedPosArr[posKey].pairArr;
|
||||||
processPairs(g.unpairedPosArr[posKey], &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
|
||||||
g.unpairedPosArr.erase(posKey);
|
|
||||||
}
|
|
||||||
// 更新遗留的结果
|
|
||||||
addDupInfoToTask(seqTaskChanged, &g);
|
|
||||||
|
|
||||||
cout << "last unpair info: " << g.unpairedPosArr.size() << '\t' << g.unpairedDic.size() << endl;
|
if (arr.size() > 1)
|
||||||
|
{
|
||||||
|
std::sort(arr.begin(), arr.end());
|
||||||
|
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.notDupIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 更新结果
|
||||||
|
vector<int64_t> addDup;
|
||||||
|
map<int64_t, int64_t> ndPosVal;
|
||||||
|
for (auto &e : taskChanged)
|
||||||
|
{
|
||||||
|
auto taskSeq = e.first;
|
||||||
|
auto &t = e.second;
|
||||||
|
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.notDupIdx,
|
||||||
|
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq]);
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
|
||||||
|
g.unpairedPosArr.clear();
|
||||||
|
g.unpairedDic.clear();
|
||||||
|
|
||||||
// 将dupidx放进全局数据
|
// 将dupidx放进全局数据
|
||||||
|
for (int i = 0; i < g.dupIdxArr.size() - 1; ++i)
|
||||||
|
refeshFinalTaskDupInfo(g.latterDupIdxArr[i], g.latterNotDupIdxArr[i], g.dupIdxArr[i]);
|
||||||
|
for (int i = 0; i < g.opticalDupIdxArr.size() - 1; ++i)
|
||||||
|
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotDupIdxArr[i], g.opticalDupIdxArr[i]);
|
||||||
|
|
||||||
g.dupIdxArr.push_back(vector<int64_t>());
|
g.dupIdxArr.push_back(vector<int64_t>());
|
||||||
auto &vIdx = g.dupIdxArr.back();
|
auto &vIdx = g.dupIdxArr.back();
|
||||||
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
lp.pairDupIdx.insert(lp.fragDupIdx.begin(), lp.fragDupIdx.end());
|
||||||
|
|
@ -662,7 +825,7 @@ static void serialMarkDups()
|
||||||
BamBufType inBamBuf(g_gArg.use_asyncio);
|
BamBufType inBamBuf(g_gArg.use_asyncio);
|
||||||
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
inBamBuf.Init(g_inBamFp, g_inBamHeader, g_gArg.max_mem);
|
||||||
// BamBufType inBamBuf(false);
|
// BamBufType inBamBuf(false);
|
||||||
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 20 * 1024 * 1024);
|
// inBamBuf.Init(g_inBamFp, g_inBamHeader, 100 * 1024 * 1024);
|
||||||
int64_t processedBamNum = 0;
|
int64_t processedBamNum = 0;
|
||||||
|
|
||||||
SerailMarkDupArg smdArg1, smdArg2;
|
SerailMarkDupArg smdArg1, smdArg2;
|
||||||
|
|
@ -689,13 +852,13 @@ static void serialMarkDups()
|
||||||
tm_arr[0].acc_start();
|
tm_arr[0].acc_start();
|
||||||
Timer t1;
|
Timer t1;
|
||||||
generateReadEnds(curArgP);
|
generateReadEnds(curArgP);
|
||||||
cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
|
//cout << "calc read end time: " << t1.seconds_elapsed() << " s" << endl;
|
||||||
tm_arr[0].acc_end();
|
tm_arr[0].acc_end();
|
||||||
|
|
||||||
tm_arr[1].acc_start();
|
tm_arr[1].acc_start();
|
||||||
t1.reinit();
|
t1.reinit();
|
||||||
markdups(curArgP);
|
markdups(curArgP);
|
||||||
cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
//cout << "markdups time: " << t1.seconds_elapsed() << " s" << endl;
|
||||||
tm_arr[1].acc_end();
|
tm_arr[1].acc_end();
|
||||||
|
|
||||||
if (!isFirstRound)
|
if (!isFirstRound)
|
||||||
|
|
@ -703,7 +866,7 @@ static void serialMarkDups()
|
||||||
tm_arr[2].acc_start();
|
tm_arr[2].acc_start();
|
||||||
t1.reinit();
|
t1.reinit();
|
||||||
handleIntersectData(lastArgP, curArgP, &gData);
|
handleIntersectData(lastArgP, curArgP, &gData);
|
||||||
cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
|
//cout << "intersect time: " << t1.seconds_elapsed() << " s" << endl;
|
||||||
// addTaskIdxToSet(lastArgP, &gData);
|
// addTaskIdxToSet(lastArgP, &gData);
|
||||||
tm_arr[2].acc_end();
|
tm_arr[2].acc_end();
|
||||||
}
|
}
|
||||||
|
|
@ -724,20 +887,42 @@ static void serialMarkDups()
|
||||||
// break;
|
// break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// cout << "here" << endl;
|
||||||
tm_arr[3].acc_start();
|
tm_arr[3].acc_start();
|
||||||
// 处理剩下的全局数据
|
// 处理剩下的全局数据
|
||||||
handleLastTask(lastArgP, &gData);
|
handleLastTask(lastArgP, &gData);
|
||||||
|
// cout << "here 2" << endl;
|
||||||
tm_arr[3].acc_end();
|
tm_arr[3].acc_end();
|
||||||
|
|
||||||
tm_arr[5].acc_end();
|
tm_arr[5].acc_end();
|
||||||
// 统计所有冗余index数量
|
// 统计所有冗余index数量
|
||||||
int64_t dupNum = 0;
|
int64_t dupNum = 0;
|
||||||
unordered_set<int64_t> dup;
|
set<int64_t> dup;
|
||||||
|
|
||||||
|
// int taskSeq = 0;
|
||||||
|
// for (auto &arr : gData.dupIdxArr)
|
||||||
|
// {
|
||||||
|
// for (auto idx : arr) {
|
||||||
|
// if (dup.find(idx) != dup.end())
|
||||||
|
// {
|
||||||
|
// cout << "dup index: " << taskSeq << '\t' << idx << endl;
|
||||||
|
// }
|
||||||
|
// dup.insert(idx);
|
||||||
|
// }
|
||||||
|
// taskSeq++;
|
||||||
|
// }
|
||||||
|
// #include <fstream>
|
||||||
|
// ofstream out("tumor_dup.txt");
|
||||||
|
// for (auto idx : dup)
|
||||||
|
// {
|
||||||
|
// out << idx << endl;
|
||||||
|
// }
|
||||||
|
// out.close();
|
||||||
|
|
||||||
for (auto &arr : gData.dupIdxArr)
|
for (auto &arr : gData.dupIdxArr)
|
||||||
for (auto idx : arr)
|
dupNum += arr.size();
|
||||||
dup.insert(idx);
|
|
||||||
dupNum += dup.size();
|
cout << "dup num : " << dupNum << '\t' << dup.size() << endl;
|
||||||
cout << "dup num : " << dupNum << endl;
|
|
||||||
|
|
||||||
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
|
||||||
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
|
||||||
|
|
@ -747,6 +932,8 @@ static void serialMarkDups()
|
||||||
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
cout << "new arg : " << tm_arr[6].acc_seconds_elapsed() << endl;
|
||||||
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
cout << "del arg : " << tm_arr[7].acc_seconds_elapsed() << endl;
|
||||||
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
cout << "build ends : " << tm_arr[8].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||||
|
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||||
|
|
||||||
Timer::log_time("serial end ");
|
Timer::log_time("serial end ");
|
||||||
|
|
|
||||||
|
|
@ -101,15 +101,6 @@ struct ReadEnds : PhysicalLocation
|
||||||
return areComparable;
|
return areComparable;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 找某一个位置的所有readend时需要
|
|
||||||
static bool pairsCmp(const ReadEnds &lhs, const ReadEnds &rhs)
|
|
||||||
{
|
|
||||||
int comp = lhs.read1ReferenceIndex - rhs.read1ReferenceIndex;
|
|
||||||
if (comp == 0)
|
|
||||||
comp = lhs.read1Coordinate - rhs.read1Coordinate;
|
|
||||||
return comp < 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 比对方向是否正向 */
|
/* 比对方向是否正向 */
|
||||||
bool IsPositiveStrand() const
|
bool IsPositiveStrand() const
|
||||||
{
|
{
|
||||||
|
|
@ -127,6 +118,30 @@ struct ReadEnds : PhysicalLocation
|
||||||
return orientation == R;
|
return orientation == R;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 对于相交的数据进行比对,a是否小于b,根据AreComparableForDuplicates函数得来
|
||||||
|
static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false)
|
||||||
|
{
|
||||||
|
int comp = a.read1ReferenceIndex - b.read1ReferenceIndex;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.read1Coordinate - b.read1Coordinate;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.orientation - b.orientation;
|
||||||
|
if (compareRead2)
|
||||||
|
{
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.read2ReferenceIndex - b.read2ReferenceIndex;
|
||||||
|
if (comp == 0)
|
||||||
|
comp = a.read2Coordinate - b.read2Coordinate;
|
||||||
|
}
|
||||||
|
return comp < 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 找某一个位置的所有readend时需要
|
||||||
|
static bool pairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs)
|
||||||
|
{
|
||||||
|
return ReadLittleThan(lhs, rhs, true);
|
||||||
|
}
|
||||||
|
|
||||||
// 比较函数
|
// 比较函数
|
||||||
bool operator < (const ReadEnds &o) const
|
bool operator < (const ReadEnds &o) const
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue