修改了一些格式

This commit is contained in:
zzh 2024-11-05 15:53:04 +08:00
parent f84d7bb0dc
commit 899d40cbda
6 changed files with 197 additions and 118 deletions

4
run.sh
View File

@ -1,7 +1,7 @@
#input=~/data/bam/zy_normal.bam #input=~/data/bam/zy_normal.bam
input=~/data/bam/zy_tumor.bam #input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam #input=~/data/bam/100w.bam
#input=~/data/bam/1kw.sam input=~/data/bam/1kw.sam
#input=~/data/bam/n1kw.sam #input=~/data/bam/n1kw.sam
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \

View File

@ -1,17 +1,15 @@
#include <htslib/sam.h>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <htslib/sam.h>
#include "module.h" #include "module.h"
/* 版本信息 */ /* 版本信息 */
const char *version() const char *version() { return PICARD_CPP_VERSION; }
{
return PICARD_CPP_VERSION;
}
/* 使用说明 */ /* 使用说明 */
static void usage(FILE *fp) static void usage(FILE *fp) {
{
fprintf(fp, fprintf(fp,
"\n" "\n"
"Program: picard_cpp (A cpp implementation for picard.)\n" "Program: picard_cpp (A cpp implementation for picard.)\n"
@ -25,19 +23,14 @@ static void usage(FILE *fp)
"\n"); "\n");
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[]) {
{ if (argc < 2) {
if (argc < 2)
{
usage(stderr); usage(stderr);
return 1; return 1;
} }
if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) {
{ if (argc == 2) {
if (argc == 2)
{
usage(stdout); usage(stdout);
return 0; return 0;
} }
@ -49,8 +42,7 @@ int main(int argc, char *argv[])
if (strcmp(argv[1], "MarkDuplicates") == 0) if (strcmp(argv[1], "MarkDuplicates") == 0)
ret = MarkDuplicates(argc - 1, argv + 1); ret = MarkDuplicates(argc - 1, argv + 1);
else else {
{
fprintf(stderr, "\n[Error]: unrecognized command '%s'\n\n", argv[1]); fprintf(stderr, "\n[Error]: unrecognized command '%s'\n\n", argv[1]);
usage(stdout); usage(stdout);
return 1; return 1;

View File

@ -26,12 +26,12 @@ Date : 2023/10/23
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "dup_metrics.h"
#include "markdups_arg.h" #include "markdups_arg.h"
#include "md_funcs.h" #include "md_funcs.h"
#include "parallel_md.h" #include "parallel_md.h"
#include "serial_md.h" #include "serial_md.h"
#include "shared_args.h" #include "shared_args.h"
#include "dup_metrics.h"
using namespace std; using namespace std;
using std::cout; using std::cout;
@ -93,8 +93,8 @@ int MarkDuplicates(int argc, char *argv[]) {
/* 利用线程池对输入输出文件进行读写 */ /* 利用线程池对输入输出文件进行读写 */
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池 htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
//htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
//htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolRead.pool = hts_tpool_init(32); htsPoolRead.pool = hts_tpool_init(32);
htsPoolWrite.pool = hts_tpool_init(32); htsPoolWrite.pool = hts_tpool_init(32);
if (!htsPoolRead.pool || !htsPoolWrite.pool) { if (!htsPoolRead.pool || !htsPoolWrite.pool) {
@ -170,10 +170,10 @@ int MarkDuplicates(int argc, char *argv[]) {
for (size_t i = 0; i < inBuf.Size(); ++i) { for (size_t i = 0; i < inBuf.Size(); ++i) {
/* 判断是否冗余 */ /* 判断是否冗余 */
if (bamIdx == dupIdx) { if (bamIdx == dupIdx) {
//if (dupIdx.dupSet != 0) // if (dupIdx.dupSet != 0)
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl; // cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
// 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同 // 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同
while((dupIdx = dupIdxQue.Pop()) == bamIdx); while ((dupIdx = dupIdxQue.Pop()) == bamIdx);
} }
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent
// cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl; // cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl;
@ -213,4 +213,4 @@ int MarkDuplicates(int argc, char *argv[]) {
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl; // cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
Timer::log_time("程序结束"); Timer::log_time("程序结束");
return 0; return 0;
} }

View File

@ -81,8 +81,8 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
// int maxOperateTime = 0; // int maxOperateTime = 0;
/** All read ends should have orientation FF, FR, RF, or RR **/ /** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) { // 找分数最高的readend for (auto pe : vpRe) { // 找分数最高的readend
// maxOperateTime = max(maxOperateTime, pe->oprateTime); // maxOperateTime = max(maxOperateTime, pe->oprateTime);
// (const_cast<ReadEnds *>(pe))->oprateTime ++; // (const_cast<ReadEnds *>(pe))->oprateTime ++;
if (pe->score > maxScore || pBest == nullptr) { if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score; maxScore = pe->score;
pBest = pe; pBest = pe;
@ -102,13 +102,13 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
cout << "mark pair end: " << endl; cout << "mark pair end: " << endl;
} }
*/ */
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl; // cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" <<
// if (maxOperateTime == 0) ++zzhtestnum; // pBest->read2IndexInFile << endl; if (maxOperateTime == 0) ++zzhtestnum;
if (notDupIdx != nullptr) { if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile); notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile); notDupIdx->insert(pBest->read2IndexInFile);
} }
//if (false) { // if (false) {
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
// trackOpticalDuplicates // trackOpticalDuplicates
vector<const ReadEnds *> prevOpticalRe; vector<const ReadEnds *> prevOpticalRe;
@ -134,7 +134,7 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
if (pe != pBest) { // 非best if (pe != pBest) { // 非best
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1 dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile) if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); //read2, dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
// 注意这里代表是read1的索引 // 注意这里代表是read1的索引
// 检查是否optical dup // 检查是否optical dup
if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) { if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) {
@ -370,13 +370,22 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
/* 将pairs重叠部分的dup idx放进数据中 */ /* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx,
SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) { MDSet<int64_t> &notRepIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
auto &lp = *lastArg; auto &lp = *lastArg;
auto &p = *curArg; auto &p = *curArg;
for (auto idx : dupIdx) { lp.pairDupIdx.insert(idx); p.pairDupIdx.erase(idx); } for (auto idx : dupIdx) {
for (auto idx : opticalDupIdx) { lp.pairOpticalDupIdx.insert(idx); p.pairOpticalDupIdx.erase(idx); } lp.pairDupIdx.insert(idx);
for (auto idx : repIdx) { lp.pairRepIdx.insert(idx); p.pairRepIdx.erase(idx); } p.pairDupIdx.erase(idx);
}
for (auto idx : opticalDupIdx) {
lp.pairOpticalDupIdx.insert(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : repIdx) {
lp.pairRepIdx.insert(idx);
p.pairRepIdx.erase(idx);
}
// for (auto idx : notDupIdx) { // for (auto idx : notDupIdx) {
// if (lp.pairDupIdx.find(idx) != lp.pairDupIdx.end()) cout << "find-1: " << idx << endl; // if (lp.pairDupIdx.find(idx) != lp.pairDupIdx.end()) cout << "find-1: " << idx << endl;
// if (lp.pairDupIdx.find({idx}) != lp.pairDupIdx.end()) cout << "find-2: " << idx << endl; // if (lp.pairDupIdx.find({idx}) != lp.pairDupIdx.end()) cout << "find-2: " << idx << endl;
@ -384,15 +393,22 @@ static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opt
// if (p.pairDupIdx.find({idx}) != p.pairDupIdx.end()) cout << "find-4: " << idx << endl; // if (p.pairDupIdx.find({idx}) != p.pairDupIdx.end()) cout << "find-4: " << idx << endl;
// lp.pairDupIdx.erase(idx); p.pairDupIdx.erase(idx); // lp.pairDupIdx.erase(idx); p.pairDupIdx.erase(idx);
// } // }
for (auto idx : notOpticalDupIdx) { lp.pairOpticalDupIdx.erase(idx); p.pairOpticalDupIdx.erase(idx); } for (auto idx : notOpticalDupIdx) {
for (auto idx : notRepIdx) { lp.pairRepIdx.erase(idx); p.pairRepIdx.erase(idx); } lp.pairOpticalDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : notRepIdx) {
lp.pairRepIdx.erase(idx);
p.pairRepIdx.erase(idx);
}
} }
// 用来分别处理dup和optical dup // 用来分别处理dup和optical dup
static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx, DPSet<DupInfo> &latterRepIdx, DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx,
MDSet<int64_t> &latterNotDupIdx, MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) { DPSet<DupInfo> &latterRepIdx, MDSet<int64_t> &latterNotDupIdx,
MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) {
for (auto idx : dupIdx) { for (auto idx : dupIdx) {
latterDupIdx.insert(idx); latterDupIdx.insert(idx);
// latterNotDupIdx.erase(idx); // 后来的更新为准 // latterNotDupIdx.erase(idx); // 后来的更新为准
@ -405,7 +421,7 @@ static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDup
} }
/* 最后合并数据并排序 */ /* 最后合并数据并排序 */
template<typename DupContainer, typename T> template <typename DupContainer, typename T>
static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupIdx, vector<T> &dupArr, static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupIdx, vector<T> &dupArr,
vector<T> &cacheDupIdx, vector<T> &midArr) { vector<T> &cacheDupIdx, vector<T> &midArr) {
midArr.resize(0); midArr.resize(0);
@ -417,8 +433,8 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupI
auto ae = dupArr.end(); auto ae = dupArr.end();
auto bi = cacheDupIdx.begin(); auto bi = cacheDupIdx.begin();
auto be = cacheDupIdx.end(); auto be = cacheDupIdx.end();
//auto bi = dupIdx.begin(); // auto bi = dupIdx.begin();
//auto be = dupIdx.end(); // auto be = dupIdx.end();
T val = 0; T val = 0;
while (ai != ae && bi != be) { while (ai != ae && bi != be) {
@ -475,8 +491,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx); processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx);
refreshPairDupIdx(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, &lp, &p); refreshPairDupIdx(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, &lp, &p);
//cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl; // cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl;
//cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl; // cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl;
// 处理之前未匹配的部分 // 处理之前未匹配的部分
map<CalcKey, int64_t> recalcPos; map<CalcKey, int64_t> recalcPos;
CalcSet<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了 CalcSet<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
@ -516,8 +532,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// if (prevFragEnd.read1IndexInFile == 255830545 || prevFragEnd.read1IndexInFile == 255830546 || // if (prevFragEnd.read1IndexInFile == 255830545 || prevFragEnd.read1IndexInFile == 255830546 ||
// prevFragEnd.read1IndexInFile == 255832599 || prevFragEnd.read1IndexInFile == 255832601) { // prevFragEnd.read1IndexInFile == 255832599 || prevFragEnd.read1IndexInFile == 255832601) {
// cout << "find in p: " << lp.taskSeq << "\t" << prevFragEnd.read1IndexInFile << "\t" << readName << endl; // cout << "find in p: " << lp.taskSeq << "\t" << prevFragEnd.read1IndexInFile << "\t" << readName <<
// if (nextUnpairInfoP != nullptr) // endl; if (nextUnpairInfoP != nullptr)
// cout << "next p: " << nextUnpairInfoP->unpairedNum << endl; // cout << "next p: " << nextUnpairInfoP->unpairedNum << endl;
// if (prevUnpairInfoP != nullptr) // if (prevUnpairInfoP != nullptr)
// cout << "prev p: " << prevUnpairInfoP->unpairedNum << endl; // cout << "prev p: " << prevUnpairInfoP->unpairedNum << endl;
@ -657,7 +673,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// if (p.taskSeq == 163) { // if (p.taskSeq == 163) {
// cout << "final" << endl; // cout << "final" << endl;
// } // }
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx); processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx,
&t.notRepIdx);
if (taskSeq < lp.taskSeq) if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey); g.unpairedPosArr.erase(posKey);
} }
@ -673,7 +690,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// for (auto &re: lp.unpairedPosArr[posKey].pairArr) { // for (auto &re: lp.unpairedPosArr[posKey].pairArr) {
// cout << "lp reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl; // cout << "lp reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl;
// } // }
// cout << "found in g: " << lp.taskSeq << "\t" << lp.unpairedPosArr[posKey].unpairedNum << "\t" << lp.unpairedPosArr[posKey].pairArr.size() << endl; // cout << "found in g: " << lp.taskSeq << "\t" << lp.unpairedPosArr[posKey].unpairedNum << "\t" <<
// lp.unpairedPosArr[posKey].pairArr.size() << endl;
// } // }
g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey]; g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
} }
@ -686,11 +704,14 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
if (taskSeq < lp.taskSeq) { if (taskSeq < lp.taskSeq) {
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]); g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]);
} else if (taskSeq == lp.taskSeq) { } else if (taskSeq == lp.taskSeq) {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp, &p); refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp,
&p);
} else { } else {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p, &lp); // 把结果放到p中 refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p,
&lp); // 把结果放到p中
} }
} }
@ -769,21 +790,21 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
// cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl; // cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]); g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]);
} }
// cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; // cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
g.unpairedPosArr.clear(); g.unpairedPosArr.clear();
g.unpairedDic.clear(); g.unpairedDic.clear();
/* /*
int taskSeq = 0; int taskSeq = 0;
for (auto &arr : g.dupIdxArr) { for (auto &arr : g.dupIdxArr) {
cout << taskSeq << "\t" << arr.size(); cout << taskSeq << "\t" << arr.size();
if (taskSeq < (int)g.dupIdxArr.size() - 1) if (taskSeq < (int)g.dupIdxArr.size() - 1)
cout << "\t" << g.latterDupIdxArr[taskSeq].size() << "\t" << g.latterNotDupIdxArr[taskSeq].size() << endl; cout << "\t" << g.latterDupIdxArr[taskSeq].size() << "\t" << g.latterNotDupIdxArr[taskSeq].size() <<
else endl; else cout << endl;
cout << endl;
// if (taskSeq == 98) { // if (taskSeq == 98) {
// vector<DupInfo> v; // vector<DupInfo> v;
// v.insert(v.end(), g.latterDupIdxArr[taskSeq].begin(), g.latterDupIdxArr[taskSeq].end()); // v.insert(v.end(), g.latterDupIdxArr[taskSeq].begin(), g.latterDupIdxArr[taskSeq].end());
@ -797,7 +818,7 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
// } // }
taskSeq++; taskSeq++;
} }
*/ */
// 将dupidx放进全局数据 // 将dupidx放进全局数据
vector<DupInfo> cacheDupIdx; vector<DupInfo> cacheDupIdx;
vector<DupInfo> midArr; vector<DupInfo> midArr;
@ -837,7 +858,8 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
*/ */
} }
for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i) for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i], intCacheDupIdx, intMidArr); refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i],
intCacheDupIdx, intMidArr);
for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i) for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr); refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr);
@ -866,6 +888,15 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
std::sort(vRepIdx.begin(), vRepIdx.end()); std::sort(vRepIdx.begin(), vRepIdx.end());
} }
/* Entry point for computing the duplication statistics.
 * Placeholder for now: it only prints the sizes of the `unpairedDic` containers held by the
 * two task arguments and the global data; nothing is written to *pgMetrics yet.
 *   lp        - first task argument (the caller passes the last task)
 *   p         - second task argument (the caller passes the current task)
 *   g         - global data
 *   pgMetrics - metrics object; dereferenced unconditionally, so it must not be null */
void calculateMetrics(SerailMarkDupArg &lp, SerailMarkDupArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) {
    // Alias for the metrics object; not used yet.
    DuplicationMetrics &gMetrics = *pgMetrics;

    // Collect the three container sizes up front; size() has no side effects,
    // so the printed output is unchanged.
    const auto lastUnpairedNum = lp.unpairedDic.size();
    const auto curUnpairedNum = p.unpairedDic.size();
    const auto globalUnpairedNum = g.unpairedDic.size();

    cout << "calculateMetrics start: " << endl;
    cout << lastUnpairedNum << "\t" << curUnpairedNum << "\t" << globalUnpairedNum << endl;
    cout << "calculateMetrics end" << endl;
}
/* 串行处理数据,标记冗余 */ /* 串行处理数据,标记冗余 */
void serialMarkDups() { void serialMarkDups() {
tm_arr[5].acc_start(); tm_arr[5].acc_start();
@ -933,14 +964,18 @@ void serialMarkDups() {
// cout << "round time: " << t_round.seconds_elapsed() << endl; // cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++; roundNum++;
if (roundNum % 100 == 0) { if (roundNum % 100 == 0) {
//cout << "read sum: " << readNumSum << endl; // cout << "read sum: " << readNumSum << endl;
//cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; // cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
} }
} }
// cout << "here" << endl; // cout << "here" << endl;
tm_arr[3].acc_start(); tm_arr[3].acc_start();
// 处理剩下的全局数据 // 处理剩下的全局数据
handleLastTask(lastArgP, &gData); handleLastTask(lastArgP, &gData);
// 计算各种统计指标
calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics);
// cout << "here 2" << endl; // cout << "here 2" << endl;
tm_arr[3].acc_end(); tm_arr[3].acc_end();
@ -964,7 +999,6 @@ void serialMarkDups() {
taskSeq++; taskSeq++;
} }
// #include <fstream> // #include <fstream>
// ofstream out("tumor_dup.txt"); // ofstream out("tumor_dup.txt");
// for (auto idx : dup) // for (auto idx : dup)
@ -989,12 +1023,22 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
<< gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
cout << "optical dup: " << zzhopticalSet.size() << endl; cout << "optical dup: " << zzhopticalSet.size() << endl;
cout << "optical arr dup: " << zzhopticalArr.size() << endl; cout << "optical arr dup: " << zzhopticalArr.size() << endl;
cout << "optical size: " << opticalDupNum << endl; cout << "optical size: " << opticalDupNum << endl;
cout << "metrics: \n"
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
<< "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n"
<< "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n"
<< "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n"
<< "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n"
<< "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n"
<< "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n"
<< "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n"
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
Timer::log_time("serial end "); Timer::log_time("serial end ");
// for (auto i : gData.dupArr) // for (auto i : gData.dupArr)

View File

@ -7,12 +7,12 @@
#include <set> #include <set>
#include <string> #include <string>
#include <vector>
#include <unordered_set> #include <unordered_set>
#include <vector>
using std::set; using std::set;
using std::unordered_set;
using std::string; using std::string;
using std::unordered_set;
using std::vector; using std::vector;
/* 存放未匹配readend相同位点的所有readend */ /* 存放未匹配readend相同位点的所有readend */
@ -49,18 +49,12 @@ struct DupInfo {
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引 int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
int16_t dupSet = 0; // dup set size int16_t dupSet = 0; // dup set size
DupInfo() : DupInfo(-1, 0, 0) { } DupInfo() : DupInfo(-1, 0, 0) {}
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) { } DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {} DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
bool operator<(const DupInfo &o) const { bool operator<(const DupInfo &o) const { return idx < o.idx; }
return idx < o.idx; bool operator>(const DupInfo &o) const { return idx > o.idx; }
} operator int64_t() const { return idx; }
bool operator>(const DupInfo &o) const {
return idx > o.idx;
}
operator int64_t() const {
return idx;
}
}; };
struct DupInfoHash { struct DupInfoHash {
@ -73,7 +67,7 @@ struct DupInfoEqual {
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; } bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
}; };
template<typename T> template <typename T>
// using MDSet = set<T>; // using MDSet = set<T>;
// using MDSet = unordered_set<T>; // using MDSet = unordered_set<T>;
using MDSet = tsl::robin_set<T>; using MDSet = tsl::robin_set<T>;
@ -84,7 +78,7 @@ template <typename T>
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>; using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
template <typename T> template <typename T>
//using CalcSet = set<T>; // using CalcSet = set<T>;
using CalcSet = tsl::robin_set<T, CalcKeyHash>; using CalcSet = tsl::robin_set<T, CalcKeyHash>;
/* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */ /* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */
@ -108,7 +102,8 @@ struct UnpairedPosInfo {
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量 typedef tsl::robin_map<int64_t, UnpairedPosInfo>
UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
/* 单线程处理冗余参数结构体 */ /* 单线程处理冗余参数结构体 */
struct SerailMarkDupArg { struct SerailMarkDupArg {

48
todo.md 100644
View File

@ -0,0 +1,48 @@
# 各种统计信息的计算(metrics文件中记录的信息)
- htsjdk.samtools.metrics.StringHeader
- 命令行参数
- 执行命令的时间
- METRICS CLASS picard.sam.DuplicationMetrics
```
LIBRARY 文库名称(进行冗余标记的文库,即建库制备的样本id)
UNPAIRED_READS_EXAMINED 已比对但没有已比对配对read的reads数量(read本身非双端,或其配对read未比对上)
READ_PAIRS_EXAMINED 已比对的read pair(成对reads)数量
SECONDARY_OR_SUPPLEMENTARY_RDS 次要比对(secondary)或补充比对(supplementary)的reads个数
UNMAPPED_READS 未比对上(unmapped)的reads个数
UNPAIRED_READ_DUPLICATES 被标记为冗余的单端片段(fragment)个数
READ_PAIR_DUPLICATES 被标记为冗余的read pair个数
READ_PAIR_OPTICAL_DUPLICATES 由光学原因造成冗余的read pair个数
PERCENT_DUPLICATION 被标记为冗余的已比对序列所占比例
ESTIMATED_LIBRARY_SIZE 根据双端冗余情况估计的文库中唯一分子(unique molecules)数量
normal 1205 498430 729 1205 763 117212 6877 0.235643 924111
```
- HISTOGRAM java.lang.Double
```
BIN 坐标值
CoverageMult
all_sets
optical_sets
non_optical_sets
1.0 1.010558 287416 0 291706
2.0 1.599836 72561 6664 70093
3.0 1.943455 15542 105 14299
4.0 2.143827 3274 1 2831
5.0 2.260667 708 0 607
6.0 2.3288 136 0 114
7.0 2.368529 28 0 17
8.0 2.391696 7 0 5
9.0 2.405205 2 0 2
10.0 2.413082 0 0 0
11.0 2.417676 0 0 0
12.0 2.420354 0 0 0
13.0 2.421916 0 0 0
14.0 2.422827 0 0 0
15.0 2.423358 0 0 0
...
...
98.0 2.424101 0 0 0
99.0 2.424101 0 0 0
100.0 2.424101 0 0 0
```