修改了一些格式

This commit is contained in:
zzh 2024-11-05 15:53:04 +08:00
parent f84d7bb0dc
commit 899d40cbda
6 changed files with 197 additions and 118 deletions

4
run.sh
View File

@ -1,7 +1,7 @@
#input=~/data/bam/zy_normal.bam
input=~/data/bam/zy_tumor.bam
#input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam
#input=~/data/bam/1kw.sam
input=~/data/bam/1kw.sam
#input=~/data/bam/n1kw.sam
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \

View File

@ -1,17 +1,15 @@
#include <htslib/sam.h>
#include <cstdio>
#include <cstring>
#include <htslib/sam.h>
#include "module.h"
/* 版本信息 */
const char *version()
{
return PICARD_CPP_VERSION;
}
const char *version() { return PICARD_CPP_VERSION; }
/* 使用说明 */
static void usage(FILE *fp)
{
static void usage(FILE *fp) {
fprintf(fp,
"\n"
"Program: picard_cpp (A cpp implementation for picard.)\n"
@ -25,19 +23,14 @@ static void usage(FILE *fp)
"\n");
}
int main(int argc, char *argv[])
{
if (argc < 2)
{
int main(int argc, char *argv[]) {
if (argc < 2) {
usage(stderr);
return 1;
}
if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0)
{
if (argc == 2)
{
if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) {
if (argc == 2) {
usage(stdout);
return 0;
}
@ -49,8 +42,7 @@ int main(int argc, char *argv[])
if (strcmp(argv[1], "MarkDuplicates") == 0)
ret = MarkDuplicates(argc - 1, argv + 1);
else
{
else {
fprintf(stderr, "\n[Error]: unrecognized command '%s'\n\n", argv[1]);
usage(stdout);
return 1;

View File

@ -26,12 +26,12 @@ Date : 2023/10/23
#include <unordered_set>
#include <vector>
#include "dup_metrics.h"
#include "markdups_arg.h"
#include "md_funcs.h"
#include "parallel_md.h"
#include "serial_md.h"
#include "shared_args.h"
#include "dup_metrics.h"
using namespace std;
using std::cout;
@ -93,8 +93,8 @@ int MarkDuplicates(int argc, char *argv[]) {
/* 利用线程池对输入输出文件进行读写 */
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
//htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
//htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolRead.pool = hts_tpool_init(32);
htsPoolWrite.pool = hts_tpool_init(32);
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
@ -166,14 +166,14 @@ int MarkDuplicates(int argc, char *argv[]) {
while (inBuf.ReadStat() >= 0) {
Timer tw1;
size_t readNum = inBuf.ReadBam();
// cout << "read: " << readNum << endl;
// cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i) {
/* 判断是否冗余 */
if (bamIdx == dupIdx) {
//if (dupIdx.dupSet != 0)
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
// 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同
while((dupIdx = dupIdxQue.Pop()) == bamIdx);
// if (dupIdx.dupSet != 0)
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
// 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同
while ((dupIdx = dupIdxQue.Pop()) == bamIdx);
}
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent
// cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl;
@ -213,4 +213,4 @@ int MarkDuplicates(int argc, char *argv[]) {
// cout << "计算read end: " << tm_arr[0].acc_seconds_elapsed() << endl;
Timer::log_time("程序结束");
return 0;
}
}

View File

@ -64,7 +64,7 @@ static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
/* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) {
if (vpRe.size() < 2) {
if (vpRe.size() == 1) {
@ -81,8 +81,8 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
// int maxOperateTime = 0;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) { // 找分数最高的readend
// maxOperateTime = max(maxOperateTime, pe->oprateTime);
// (const_cast<ReadEnds *>(pe))->oprateTime ++;
// maxOperateTime = max(maxOperateTime, pe->oprateTime);
// (const_cast<ReadEnds *>(pe))->oprateTime ++;
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
@ -102,13 +102,13 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
cout << "mark pair end: " << endl;
}
*/
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
// if (maxOperateTime == 0) ++zzhtestnum;
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" <<
// pBest->read2IndexInFile << endl; if (maxOperateTime == 0) ++zzhtestnum;
if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile);
}
//if (false) {
// if (false) {
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
// trackOpticalDuplicates
vector<const ReadEnds *> prevOpticalRe;
@ -131,10 +131,10 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
}
}
for (auto pe : vpRe) { // 对非best read标记冗余
if (pe != pBest) { // 非best
if (pe != pBest) { // 非best
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); //read2,
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
// 注意这里代表是read1的索引
// 检查是否optical dup
if (pe->isOpticalDuplicate && opticalDupIdx != nullptr) {
@ -253,7 +253,7 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
}
if (p.taskSeq == 98) {
for (auto &re : p.unpairedPosArr[14293757783047].pairArr) {
cout << "task 99 unpair reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl;
cout << "task 99 unpair reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl;
}
}
// cout << "依赖比例:" << (float)p.unpairedDic.size() / p.frags.size() <<
@ -370,13 +370,22 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
/* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx,
MDSet<int64_t> &notRepIdx, SerailMarkDupArg *lastArg, SerailMarkDupArg *curArg) {
auto &lp = *lastArg;
auto &p = *curArg;
for (auto idx : dupIdx) { lp.pairDupIdx.insert(idx); p.pairDupIdx.erase(idx); }
for (auto idx : opticalDupIdx) { lp.pairOpticalDupIdx.insert(idx); p.pairOpticalDupIdx.erase(idx); }
for (auto idx : repIdx) { lp.pairRepIdx.insert(idx); p.pairRepIdx.erase(idx); }
for (auto idx : dupIdx) {
lp.pairDupIdx.insert(idx);
p.pairDupIdx.erase(idx);
}
for (auto idx : opticalDupIdx) {
lp.pairOpticalDupIdx.insert(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : repIdx) {
lp.pairRepIdx.insert(idx);
p.pairRepIdx.erase(idx);
}
// for (auto idx : notDupIdx) {
// if (lp.pairDupIdx.find(idx) != lp.pairDupIdx.end()) cout << "find-1: " << idx << endl;
// if (lp.pairDupIdx.find({idx}) != lp.pairDupIdx.end()) cout << "find-2: " << idx << endl;
@ -384,15 +393,22 @@ static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opt
// if (p.pairDupIdx.find({idx}) != p.pairDupIdx.end()) cout << "find-4: " << idx << endl;
// lp.pairDupIdx.erase(idx); p.pairDupIdx.erase(idx);
// }
for (auto idx : notOpticalDupIdx) { lp.pairOpticalDupIdx.erase(idx); p.pairOpticalDupIdx.erase(idx); }
for (auto idx : notRepIdx) { lp.pairRepIdx.erase(idx); p.pairRepIdx.erase(idx); }
for (auto idx : notOpticalDupIdx) {
lp.pairOpticalDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx);
}
for (auto idx : notRepIdx) {
lp.pairRepIdx.erase(idx);
p.pairRepIdx.erase(idx);
}
}
// 用来分别处理dup和optical dup
static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx, DPSet<DupInfo> &latterRepIdx,
MDSet<int64_t> &latterNotDupIdx, MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) {
DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx,
DPSet<DupInfo> &latterRepIdx, MDSet<int64_t> &latterNotDupIdx,
MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) {
for (auto idx : dupIdx) {
latterDupIdx.insert(idx);
// latterNotDupIdx.erase(idx); // 后来的更新为准
@ -405,7 +421,7 @@ static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDup
}
/* 最后合并数据并排序 */
template<typename DupContainer, typename T>
template <typename DupContainer, typename T>
static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupIdx, vector<T> &dupArr,
vector<T> &cacheDupIdx, vector<T> &midArr) {
midArr.resize(0);
@ -417,8 +433,8 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupI
auto ae = dupArr.end();
auto bi = cacheDupIdx.begin();
auto be = cacheDupIdx.end();
//auto bi = dupIdx.begin();
//auto be = dupIdx.end();
// auto bi = dupIdx.begin();
// auto be = dupIdx.end();
T val = 0;
while (ai != ae && bi != be) {
@ -427,7 +443,7 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupI
} else if (*bi < *ai) {
val = *bi++;
} else {
val = *bi++; // 相等的时候取后面的作为结果
val = *bi++; // 相等的时候取后面的作为结果
ai++;
}
if (notDupIdx.find(val) == notDupIdx.end()) {
@ -475,9 +491,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx);
refreshPairDupIdx(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, &lp, &p);
//cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl;
//cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl;
// 处理之前未匹配的部分
// cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl;
// cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl;
// 处理之前未匹配的部分
map<CalcKey, int64_t> recalcPos;
CalcSet<CalcKey> alreadyAdd; // 与该位点相同的pair都添加到数组里了
MDSet<int64_t> addToGlobal;
@ -516,8 +532,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// if (prevFragEnd.read1IndexInFile == 255830545 || prevFragEnd.read1IndexInFile == 255830546 ||
// prevFragEnd.read1IndexInFile == 255832599 || prevFragEnd.read1IndexInFile == 255832601) {
// cout << "find in p: " << lp.taskSeq << "\t" << prevFragEnd.read1IndexInFile << "\t" << readName << endl;
// if (nextUnpairInfoP != nullptr)
// cout << "find in p: " << lp.taskSeq << "\t" << prevFragEnd.read1IndexInFile << "\t" << readName <<
// endl; if (nextUnpairInfoP != nullptr)
// cout << "next p: " << nextUnpairInfoP->unpairedNum << endl;
// if (prevUnpairInfoP != nullptr)
// cout << "prev p: " << prevUnpairInfoP->unpairedNum << endl;
@ -657,7 +673,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// if (p.taskSeq == 163) {
// cout << "final" << endl;
// }
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx);
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx,
&t.notRepIdx);
if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey);
}
@ -673,7 +690,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// for (auto &re: lp.unpairedPosArr[posKey].pairArr) {
// cout << "lp reads: " << re.read1IndexInFile << "\t" << re.read2IndexInFile << endl;
// }
// cout << "found in g: " << lp.taskSeq << "\t" << lp.unpairedPosArr[posKey].unpairedNum << "\t" << lp.unpairedPosArr[posKey].pairArr.size() << endl;
// cout << "found in g: " << lp.taskSeq << "\t" << lp.unpairedPosArr[posKey].unpairedNum << "\t" <<
// lp.unpairedPosArr[posKey].pairArr.size() << endl;
// }
g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
}
@ -686,11 +704,14 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
if (taskSeq < lp.taskSeq) {
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]);
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]);
} else if (taskSeq == lp.taskSeq) {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp, &p);
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp,
&p);
} else {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p, &lp); // 把结果放到p中
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p,
&lp); // 把结果放到p中
}
}
@ -769,35 +790,35 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
// cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq]);
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]);
}
// cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
g.unpairedPosArr.clear();
g.unpairedDic.clear();
/*
int taskSeq = 0;
for (auto &arr : g.dupIdxArr) {
cout << taskSeq << "\t" << arr.size();
if (taskSeq < (int)g.dupIdxArr.size() - 1)
cout << "\t" << g.latterDupIdxArr[taskSeq].size() << "\t" << g.latterNotDupIdxArr[taskSeq].size() << endl;
else
cout << endl;
// if (taskSeq == 98) {
// vector<DupInfo> v;
// v.insert(v.end(), g.latterDupIdxArr[taskSeq].begin(), g.latterDupIdxArr[taskSeq].end());
// std::sort(v.begin(), v.end());
// for (auto &idx : v)
// cout << idx.idx << " " << idx.repIdx << " " << idx.dupSet << endl;
// v.clear();
// v.insert(v.end(), g.latterNotDupIdxArr[taskSeq].begin(), g.latterNotDupIdxArr[taskSeq].end());
// std::sort(v.begin(), v.end());
// for (auto &idx : v) cout << idx << endl;
// }
taskSeq++;
}
*/
/*
int taskSeq = 0;
for (auto &arr : g.dupIdxArr) {
cout << taskSeq << "\t" << arr.size();
if (taskSeq < (int)g.dupIdxArr.size() - 1)
cout << "\t" << g.latterDupIdxArr[taskSeq].size() << "\t" << g.latterNotDupIdxArr[taskSeq].size() <<
endl; else cout << endl;
// if (taskSeq == 98) {
// vector<DupInfo> v;
// v.insert(v.end(), g.latterDupIdxArr[taskSeq].begin(), g.latterDupIdxArr[taskSeq].end());
// std::sort(v.begin(), v.end());
// for (auto &idx : v)
// cout << idx.idx << " " << idx.repIdx << " " << idx.dupSet << endl;
// v.clear();
// v.insert(v.end(), g.latterNotDupIdxArr[taskSeq].begin(), g.latterNotDupIdxArr[taskSeq].end());
// std::sort(v.begin(), v.end());
// for (auto &idx : v) cout << idx << endl;
// }
taskSeq++;
}
*/
// 将dupidx放进全局数据
vector<DupInfo> cacheDupIdx;
vector<DupInfo> midArr;
@ -837,7 +858,8 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
*/
}
for (int i = 0; i < (int)g.opticalDupIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i], intCacheDupIdx, intMidArr);
refeshFinalTaskDupInfo(g.latterOpticalDupIdxArr[i], g.latterNotOpticalDupIdxArr[i], g.opticalDupIdxArr[i],
intCacheDupIdx, intMidArr);
for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr);
@ -866,6 +888,15 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
std::sort(vRepIdx.begin(), vRepIdx.end());
}
void calculateMetrics(SerailMarkDupArg &lp, SerailMarkDupArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) {
DuplicationMetrics &gMetrics = *pgMetrics;
cout << "calculateMetrics start: " << endl;
cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl;
cout << "calculateMetrics end" << endl;
}
/* 串行处理数据,标记冗余 */
void serialMarkDups() {
tm_arr[5].acc_start();
@ -933,14 +964,18 @@ void serialMarkDups() {
// cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++;
if (roundNum % 100 == 0) {
//cout << "read sum: " << readNumSum << endl;
//cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
// cout << "read sum: " << readNumSum << endl;
// cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
}
}
// cout << "here" << endl;
tm_arr[3].acc_start();
// 处理剩下的全局数据
handleLastTask(lastArgP, &gData);
// 计算各种统计指标
calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics);
// cout << "here 2" << endl;
tm_arr[3].acc_end();
@ -950,7 +985,7 @@ void serialMarkDups() {
int64_t opticalDupNum = 0;
map<int64_t, int> dup;
int taskSeq = 0;
for (auto &arr : gData.dupIdxArr) {
for (auto idx : arr) {
@ -963,7 +998,6 @@ void serialMarkDups() {
// cout << taskSeq << "\t" << arr.size() << endl;
taskSeq++;
}
// #include <fstream>
// ofstream out("tumor_dup.txt");
@ -989,12 +1023,22 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
<< gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
cout << "optical dup: " << zzhopticalSet.size() << endl;
cout << "optical arr dup: " << zzhopticalArr.size() << endl;
cout << "optical size: " << opticalDupNum << endl;
cout << "metrics: \n"
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
<< "UNPAIRED_READS_EXAMINED: " << gMetrics.UNPAIRED_READS_EXAMINED << "\n"
<< "READ_PAIRS_EXAMINED: " << gMetrics.READ_PAIRS_EXAMINED << "\n"
<< "SECONDARY_OR_SUPPLEMENTARY_RDS: " << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\n"
<< "UNMAPPED_READS: " << gMetrics.UNMAPPED_READS << "\n"
<< "UNPAIRED_READ_DUPLICATES: " << gMetrics.UNPAIRED_READ_DUPLICATES << "\n"
<< "READ_PAIR_DUPLICATES: " << gMetrics.READ_PAIR_DUPLICATES << "\n"
<< "READ_PAIR_OPTICAL_DUPLICATES: " << gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\n"
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
Timer::log_time("serial end ");
// for (auto i : gData.dupArr)

View File

@ -7,12 +7,12 @@
#include <set>
#include <string>
#include <vector>
#include <unordered_set>
#include <vector>
using std::set;
using std::unordered_set;
using std::string;
using std::unordered_set;
using std::vector;
/* 存放未匹配readend相同位点的所有readend */
@ -38,29 +38,23 @@ struct CalcKey {
};
struct CalcKeyHash {
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
std::size_t operator()(const CalcKey &o) const {
return std::hash<int64_t>()(o.read1Pos) ^ std::hash<int64_t>()(o.read2Pos);
}
};
/* 用来记录冗余索引相关的信息 */
struct DupInfo {
int64_t idx;
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
int64_t repIdx = 0; // 这一批冗余中的非冗余read 代表的索引
int16_t dupSet = 0; // dup set size
DupInfo() : DupInfo(-1, 0, 0) { }
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) { }
DupInfo() : DupInfo(-1, 0, 0) {}
DupInfo(int64_t idx_) : DupInfo(idx_, 0, 0) {}
DupInfo(int64_t idx_, int64_t repIdx_, int dupSet_) : idx(idx_), repIdx(repIdx_), dupSet(dupSet_) {}
bool operator<(const DupInfo &o) const {
return idx < o.idx;
}
bool operator>(const DupInfo &o) const {
return idx > o.idx;
}
operator int64_t() const {
return idx;
}
bool operator<(const DupInfo &o) const { return idx < o.idx; }
bool operator>(const DupInfo &o) const { return idx > o.idx; }
operator int64_t() const { return idx; }
};
struct DupInfoHash {
@ -70,10 +64,10 @@ struct DupInfoHash {
struct DupInfoEqual {
bool operator()(const DupInfo &o1, const DupInfo &o2) const { return o1.idx == o2.idx; }
bool operator()(const DupInfo &o1, const int64_t &o2) const { return o1.idx == o2; }
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
bool operator()(const int64_t &o1, const DupInfo &o2) const { return o1 == o2.idx; }
};
template<typename T>
template <typename T>
// using MDSet = set<T>;
// using MDSet = unordered_set<T>;
using MDSet = tsl::robin_set<T>;
@ -84,7 +78,7 @@ template <typename T>
using DPSet = tsl::robin_set<T, DupInfoHash, DupInfoEqual>;
template <typename T>
//using CalcSet = set<T>;
// using CalcSet = set<T>;
using CalcSet = tsl::robin_set<T, CalcKeyHash>;
/* 当遗留数据在当前任务找到了pair read后进行冗余计算时候存放结果的数据结构 */
@ -108,21 +102,22 @@ struct UnpairedPosInfo {
// typedef unordered_map<int64_t, UnpairedPosInfo> UnpairedPositionMap;
typedef tsl::robin_map<string, UnpairedREInfo> UnpairedNameMap; // 以read name为索引保存未匹配的pair read
typedef tsl::robin_map<int64_t, UnpairedPosInfo> UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
typedef tsl::robin_map<int64_t, UnpairedPosInfo>
UnpairedPositionMap; // 以位点为索引保存该位点包含的对应的所有read和该位点包含的剩余未匹配的read的数量
/* 单线程处理冗余参数结构体 */
struct SerailMarkDupArg {
int64_t taskSeq; // 任务序号
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs; // 成对的reads
vector<ReadEnds> frags; // 暂未找到配对的reads
int64_t taskSeq; // 任务序号
int64_t bamStartIdx; // 当前vBam数组中第一个bam记录在整体bam中所处的位置
vector<BamWrap *> bams; // 存放待处理的bam read
vector<ReadEnds> pairs; // 成对的reads
vector<ReadEnds> frags; // 暂未找到配对的reads
DPSet<DupInfo> pairDupIdx; // pair的冗余read的索引
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
};
/* 全局保留的数据因为有些paired数据比对到了不同的染色体相距甚远 */

48
todo.md 100644
View File

@ -0,0 +1,48 @@
# 各种统计信息的计算metrics文件中记录的信息
- htsjdk.samtools.metrics.StringHeader
- 命令行参数
- 执行命令的时间
- METRICS CLASS picard.sam.DuplicationMetrics
```
LIBRARY 样本ID(建库制备的样本id)
UNPAIRED_READS_EXAMINED 未匹配的reads数量
READ_PAIRS_EXAMINED 匹配的reads数量
SECONDARY_OR_SUPPLEMENTARY_RDS 非主要匹配的reads个数
UNMAPPED_READS 未匹配的reads个数
UNPAIRED_READ_DUPLICATES 未匹配的reads冗余的个数
READ_PAIR_DUPLICATES 匹配的reads冗余个数
READ_PAIR_OPTICAL_DUPLICATES 匹配的reads光学原因造成的冗余个数
PERCENT_DUPLICATION 冗余reads占比
ESTIMATED_LIBRARY_SIZE 估计的样本reads总量
normal 1205 498430 729 1205 763 117212 6877 0.235643 924111
```
- HISTOGRAM java.lang.Double
```
BIN 坐标值
CoverageMult
all_sets
optical_sets
non_optical_sets
1.0 1.010558 287416 0 291706
2.0 1.599836 72561 6664 70093
3.0 1.943455 15542 105 14299
4.0 2.143827 3274 1 2831
5.0 2.260667 708 0 607
6.0 2.3288 136 0 114
7.0 2.368529 28 0 17
8.0 2.391696 7 0 5
9.0 2.405205 2 0 2
10.0 2.413082 0 0 0
11.0 2.417676 0 0 0
12.0 2.420354 0 0 0
13.0 2.421916 0 0 0
14.0 2.422827 0 0 0
15.0 2.423358 0 0 0
...
...
98.0 2.424101 0 0 0
99.0 2.424101 0 0 0
100.0 2.424101 0 0 0
```