解决了 singleton 的 bug,serial 结果一致了

This commit is contained in:
zzh 2024-11-14 10:48:27 +08:00
parent 4fd267c4db
commit fb1bd812ce
8 changed files with 1284 additions and 103 deletions

4
.vscode/launch.json vendored
View File

@ -13,11 +13,11 @@
"program": "${workspaceRoot}/build/bin/picard_cpp", "program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [ "args": [
"MarkDuplicates", "MarkDuplicates",
"--INPUT", "~/data/bam/1k.sam", "--INPUT", "~/data/bam/100w.sam",
"--OUTPUT", "./out.sam", "--OUTPUT", "./out.sam",
"--METRICS_FILE", "metrics.txt", "--METRICS_FILE", "metrics.txt",
"--num_threads", "1", "--num_threads", "1",
"--max_mem", "2G", "--max_mem", "2M",
"--verbosity", "DEBUG", "--verbosity", "DEBUG",
"--asyncio", "true", "--asyncio", "true",
"--READ_NAME_REGEX", "" "--READ_NAME_REGEX", ""

9
run.sh
View File

@ -1,7 +1,8 @@
#input=~/data/bam/zy_normal.bam #input=~/data/bam/zy_normal.bam
#input=~/data/bam/zy_tumor.bam input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam #input=~/data/bam/100w.bam
input=~/data/bam/1k.sam #input=~/data/bam/t100w.sam
#input=~/data/bam/1k.sam
#input=~/data/bam/1kw.sam #input=~/data/bam/1kw.sam
#input=~/data/bam/1kw.bam #input=~/data/bam/1kw.bam
#input=~/data/bam/n1kw.sam #input=~/data/bam/n1kw.sam
@ -11,12 +12,12 @@ cd ./build/ && make -j 8 && cd ..
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \ MarkDuplicates \
--INPUT $input \ --INPUT $input \
--OUTPUT ./out.sam \ --OUTPUT ./out.bam \
--INDEX_FORMAT BAI \ --INDEX_FORMAT BAI \
--num_threads 1 \ --num_threads 1 \
--max_mem 2G \ --max_mem 2G \
--verbosity DEBUG \ --verbosity DEBUG \
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null --asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null #
#--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX "" #--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ""
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
#--READ_NAME_REGEX "" #--READ_NAME_REGEX ""

View File

@ -62,6 +62,8 @@ struct DuplicationMetrics {
MDMap NonOpticalDuplicateCountHist; MDMap NonOpticalDuplicateCountHist;
MDMap OpticalDuplicatesCountHist; MDMap OpticalDuplicatesCountHist;
// 没有冗余的位置总数量 // 没有冗余的位置总数量 for test
MDSet<int64_t> singletonReads; MDSet<int64_t> singletonReads;
MDSet<int64_t> dupReads; // for test
MDSet<int64_t> bestReads;
}; };

View File

@ -40,7 +40,7 @@ using std::string;
#define SMA_TAG_PG "PG" #define SMA_TAG_PG "PG"
#define BAM_BLOCK_SIZE 16L * 1024 * 1024 #define BAM_BLOCK_SIZE 32L * 1024 * 1024
#define NO_SUCH_INDEX INT64_MAX #define NO_SUCH_INDEX INT64_MAX
Timer tm_arr[20]; // 用来测试性能 Timer tm_arr[20]; // 用来测试性能
@ -153,8 +153,8 @@ int MarkDuplicates(int argc, char *argv[]) {
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池 htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads); // htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads); // htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
htsPoolRead.pool = hts_tpool_init(32); htsPoolRead.pool = hts_tpool_init(64);
htsPoolWrite.pool = hts_tpool_init(32); htsPoolWrite.pool = hts_tpool_init(64);
if (!htsPoolRead.pool || !htsPoolWrite.pool) { if (!htsPoolRead.pool || !htsPoolWrite.pool) {
Error("[%d] failed to set up thread pool", __LINE__); Error("[%d] failed to set up thread pool", __LINE__);
sam_close(g_inBamFp); sam_close(g_inBamFp);
@ -240,11 +240,13 @@ int MarkDuplicates(int argc, char *argv[]) {
bool isInDuplicateSet = false; bool isInDuplicateSet = false;
uint32_t representativeReadIndexInFile = 0; uint32_t representativeReadIndexInFile = 0;
uint32_t duplicateSetSize = 0; uint32_t duplicateSetSize = 0;
int64_t realDupSize = 0;
// exit(0); // exit(0);
while (inBuf.ReadStat() >= 0) { while (inBuf.ReadStat() >= 0) {
Timer tw1; Timer tw1;
size_t readNum = inBuf.ReadBam(); size_t readNum = inBuf.ReadBam();
// cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i) { for (size_t i = 0; i < inBuf.Size(); ++i) {
BamWrap *bw = inBuf[i]; BamWrap *bw = inBuf[i];
if (bam_copy1(bp, bw->b) == nullptr) { if (bam_copy1(bp, bw->b) == nullptr) {
@ -255,12 +257,13 @@ int MarkDuplicates(int argc, char *argv[]) {
isDup = false; isDup = false;
isOpticalDup = false; isOpticalDup = false;
isInDuplicateSet = false; isInDuplicateSet = false;
// 删除原来的duplicate tag // 删除原来的duplicate tag
if (g_mdArg.CLEAR_DT) { if (g_mdArg.CLEAR_DT) {
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str()); uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str());
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
} }
++bam_num2;
// 统计信息 // 统计信息
if (bw->GetReadUnmappedFlag()) { if (bw->GetReadUnmappedFlag()) {
++gMetrics.UNMAPPED_READS; ++gMetrics.UNMAPPED_READS;
@ -274,14 +277,14 @@ int MarkDuplicates(int argc, char *argv[]) {
/* 判断是否冗余 */ /* 判断是否冗余 */
if (bamIdx == dupIdx) { if (bamIdx == dupIdx) {
++realDupSize; // for test
isDup = true; isDup = true;
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) { if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && dupIdx.dupSet != 0) {
// cerr << bamIdx << " " << dupIdx.repIdx << " " << dupIdx.dupSet << endl;
isInDuplicateSet = true; isInDuplicateSet = true;
representativeReadIndexInFile = dupIdx.repIdx; representativeReadIndexInFile = dupIdx.repIdx;
duplicateSetSize = dupIdx.dupSet; duplicateSetSize = dupIdx.dupSet;
} }
// 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同 // 为了防止小内存运行的时候有重复的dupidx这时候dup的repIdx和dupSetSize可能会有不同
while ((dupIdx = dupIdxQue.Pop()) == bamIdx); while ((dupIdx = dupIdxQue.Pop()) == bamIdx);
if (opticalIdx == bamIdx) if (opticalIdx == bamIdx)
@ -293,10 +296,6 @@ int MarkDuplicates(int argc, char *argv[]) {
} }
// 添加冗余标识 // 添加冗余标识
bw->SetDuplicateReadFlag(true); bw->SetDuplicateReadFlag(true);
// if (isOpticalDup)
// cout << bamIdx << " optical" << endl;
// else
// cout << bamIdx << " not opt" << endl;
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str()); uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_TYPE_TAG.c_str());
if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal); if (oldTagVal != NULL) bam_aux_del(bw->b, oldTagVal);
@ -309,13 +308,6 @@ int MarkDuplicates(int argc, char *argv[]) {
g_mdArg.DUPLICATE_TYPE_LIBRARY.size() + 1, g_mdArg.DUPLICATE_TYPE_LIBRARY.size() + 1,
(const uint8_t *)g_mdArg.DUPLICATE_TYPE_LIBRARY.c_str()); (const uint8_t *)g_mdArg.DUPLICATE_TYPE_LIBRARY.c_str());
// cout << bamIdx << endl;
// int ival = 2;
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
// if (bam_aux_append(bw->b, "DT", 'Z', 3, (const uint8_t*)"LB") < 0) Error("add tag error!");
// // if (bam_aux_append(bw->b, "DT", 'i', sizeof(ival), (uint8_t *)&ival) < 0) Error("add tag error!");
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
// 计算统计信息 // 计算统计信息
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) { if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
// Update the duplication metrics // Update the duplication metrics
@ -329,7 +321,6 @@ int MarkDuplicates(int argc, char *argv[]) {
bw->SetDuplicateReadFlag(false); bw->SetDuplicateReadFlag(false);
} }
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && bamIdx == repIdx) { // repressent
// cerr << bamIdx << " " << repIdx.repIdx << " " << repIdx.dupSet << endl;
isInDuplicateSet = true; isInDuplicateSet = true;
representativeReadIndexInFile = repIdx.repIdx; representativeReadIndexInFile = repIdx.repIdx;
duplicateSetSize = repIdx.dupSet; duplicateSetSize = repIdx.dupSet;
@ -354,9 +345,7 @@ int MarkDuplicates(int argc, char *argv[]) {
bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize), bam_aux_append(bw->b, g_mdArg.DUPLICATE_SET_SIZE_TAG.c_str(), 'i', sizeof(duplicateSetSize),
(const uint8_t *)&duplicateSetSize); (const uint8_t *)&duplicateSetSize);
} }
// 每个read都要写到output只是添加标识 // 每个read都要写到output只是添加标识或者删掉这些冗余record
// cout << "bam rec size: " << bw->b->l_data << "\t" << bw->b->m_data << endl;
// if (bamIdx == 922) break;
++bamIdx; ++bamIdx;
if (isDup && g_mdArg.REMOVE_DUPLICATES) { if (isDup && g_mdArg.REMOVE_DUPLICATES) {
continue; continue;
@ -381,7 +370,7 @@ int MarkDuplicates(int argc, char *argv[]) {
cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl; cout << "write round time: " << tw1.seconds_elapsed() << " s" << endl;
} }
bam_destroy1(bp); bam_destroy1(bp);
cout << "bam_Num: " << bam_num1 << " : " << bam_num2 << endl; cout << "Final dup size: " << realDupSize << "\t" << dupIdxQue.Size() << endl;
// 计算统计信息 // 计算统计信息
gMetrics.READ_PAIRS_EXAMINED /= 2; gMetrics.READ_PAIRS_EXAMINED /= 2;

View File

@ -17,7 +17,7 @@ using std::vector;
/* 存放未匹配readend相同位点的所有readend */ /* 存放未匹配readend相同位点的所有readend */
struct UnpairedREInfo { struct UnpairedREInfo {
int64_t taskSeq; int64_t taskSeq; // 对应第几轮计算
ReadEnds unpairedRE; ReadEnds unpairedRE;
}; };
@ -90,9 +90,11 @@ struct TaskSeqDupInfo {
DPSet<DupInfo> dupIdx; DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx; MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx; DPSet<DupInfo> repIdx;
MDSet<int64_t> singletonIdx;
MDSet<int64_t> notDupIdx; MDSet<int64_t> notDupIdx;
MDSet<int64_t> notOpticalDupIdx; MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notRepIdx; MDSet<int64_t> notRepIdx;
MDSet<int64_t> notSingletonIdx;
}; };
/* 保存有未匹配pair位点的信息包括read end数组和有几个未匹配的read end */ /* 保存有未匹配pair位点的信息包括read end数组和有几个未匹配的read end */
@ -120,7 +122,7 @@ struct MarkDupDataArg {
MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引 MDSet<int64_t> pairOpticalDupIdx; // optical冗余read的索引
DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引 DPSet<DupInfo> fragDupIdx; // frag的冗余read的索引
DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引 DPSet<DupInfo> pairRepIdx; // pair的dupset代表read的索引
MDSet<int64_t> pairSingletonIdxArr; // 某位置只有一对read的所有read pair个数 MDSet<int64_t> pairSingletonIdx; // 某位置只有一对read的所有read pair个数
UnpairedNameMap unpairedDic; // 用来寻找pair end UnpairedNameMap unpairedDic; // 用来寻找pair end
UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储 UnpairedPositionMap unpairedPosArr; // 存放未匹配的ReadEnd对应位点的所有ReadEnd为了避免重复存储
}; };

File diff suppressed because it is too large. Load Diff

View File

@ -1,4 +1,6 @@
void parallelMarkDups() #pragma once
{
} #include "md_types.h"
// 并行运行mark duplicate
void parallelMarkDups();

View File

@ -64,16 +64,27 @@ static inline void sortReadEndsArr(vector<ReadEnds> &arr) {
/* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/ /* 处理一组pairend的readends标记冗余, 这个函数需要串行运行,因为需要做一些统计*/
static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx, static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr, DPSet<DupInfo> *repIdx, MDSet<int64_t> *singletonIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) { MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr,
MDSet<int64_t> *notSingletonIdx = nullptr) {
if (vpRe.size() < 2) { if (vpRe.size() < 2) {
if (vpRe.size() == 1) { if (vpRe.size() == 1) {
// 这个统计可能会有误差因为当前位点可能还有没匹配上的read导致当前位点的readpaired数量为1 // 这个统计可能会有误差因为当前位点可能还有没匹配上的read导致当前位点的readpaired数量为1
// 可以通过后续的补充计算来解决这个问题,有必要么?好像没必要 // 可以通过后续的补充计算来解决这个问题,有必要么?好像没必要
gMetrics.singletonReads.insert(vpRe[0]->read1IndexInFile); //gMetrics.singletonReads.insert(vpRe[0]->read1IndexInFile);
singletonIdx->insert(vpRe[0]->read1IndexInFile);
} }
return; return;
} }
for (auto pe : vpRe) {
// gMetrics.singletonReads.erase(pe->read1IndexInFile);
}
if (notSingletonIdx != nullptr) {
for (auto pe : vpRe) {
notSingletonIdx->insert(pe->read1IndexInFile);
}
}
int maxScore = 0; int maxScore = 0;
const ReadEnds *pBest = nullptr; const ReadEnds *pBest = nullptr;
@ -133,6 +144,8 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
} }
for (auto pe : vpRe) { // 对非best read标记冗余 for (auto pe : vpRe) { // 对非best read标记冗余
if (pe != pBest) { // 非best if (pe != pBest) { // 非best
gMetrics.dupReads.insert(pe->read1IndexInFile);
gMetrics.dupReads.insert(pe->read2IndexInFile);
dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1 dupIdx->insert(DupInfo(pe->read1IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile) if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2, dupIdx->insert(DupInfo(pe->read2IndexInFile, pBest->read1IndexInFile, (int16_t)vpRe.size())); // read2,
@ -143,6 +156,9 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, DPSet<DupInfo> *dup
if (pe->read2IndexInFile != pe->read1IndexInFile) if (pe->read2IndexInFile != pe->read1IndexInFile)
opticalDupIdx->insert(pe->read2IndexInFile); opticalDupIdx->insert(pe->read2IndexInFile);
} }
} else {
gMetrics.dupReads.erase(pe->read1IndexInFile); // for test
gMetrics.dupReads.erase(pe->read2IndexInFile);
} }
} }
// 在输出的bam文件中添加tag, best作为dupset的代表 // 在输出的bam文件中添加tag, best作为dupset的代表
@ -168,6 +184,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
for (auto pe : vpRe) { for (auto pe : vpRe) {
if (!pe->IsPaired()) { if (!pe->IsPaired()) {
dupIdx->insert(pe->read1IndexInFile); dupIdx->insert(pe->read1IndexInFile);
gMetrics.dupReads.insert(pe->read1IndexInFile);
} }
} }
} else { } else {
@ -185,6 +202,9 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
for (auto pe : vpRe) { for (auto pe : vpRe) {
if (pe != pBest) { if (pe != pBest) {
dupIdx->insert(pe->read1IndexInFile); dupIdx->insert(pe->read1IndexInFile);
gMetrics.dupReads.insert(pe->read1IndexInFile);
} else {
gMetrics.dupReads.erase(pe->read1IndexInFile);
} }
} }
} }
@ -265,8 +285,9 @@ static void generateReadEnds(MarkDupDataArg *arg) {
/* 处理pairs */ /* 处理pairs */
static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx, static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDSet<int64_t> *opticalDupIdx,
DPSet<DupInfo> *repIdx, MDSet<int64_t> *notDupIdx = nullptr, DPSet<DupInfo> *repIdx, MDSet<int64_t> *singletonIdx, MDSet<int64_t> *notDupIdx = nullptr,
MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr) { MDSet<int64_t> *notOpticalDupIdx = nullptr, MDSet<int64_t> *notRepIdx = nullptr,
MDSet<int64_t> *notSingletonIdx = nullptr) {
// return; // return;
vector<const ReadEnds *> vpCache; // 有可能是冗余的reads vector<const ReadEnds *> vpCache; // 有可能是冗余的reads
const ReadEnds *pReadEnd = nullptr; const ReadEnds *pReadEnd = nullptr;
@ -274,13 +295,15 @@ static void processPairs(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组 vpCache.push_back(&re); // 处理一个潜在的冗余组
else { else {
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx); // 不一样 markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx,
notRepIdx, notSingletonIdx); // 不一样
vpCache.clear(); vpCache.clear();
vpCache.push_back(&re); vpCache.push_back(&re);
pReadEnd = &re; pReadEnd = &re;
} }
} }
markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx); markDupsForPairs(vpCache, dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, notRepIdx,
notSingletonIdx);
} }
/* 处理frags */ /* 处理frags */
@ -313,13 +336,14 @@ static void processFrags(vector<ReadEnds> &readEnds, DPSet<DupInfo> *dupIdx, MDS
/* 单线程markdup (第二步)*/ /* 单线程markdup (第二步)*/
static void markdups(MarkDupDataArg *arg) { static void markdups(MarkDupDataArg *arg) {
auto &p = *arg; auto &p = *arg;
p.fragDupIdx.clear();
p.pairDupIdx.clear(); p.pairDupIdx.clear();
p.pairOpticalDupIdx.clear(); p.pairOpticalDupIdx.clear();
p.fragDupIdx.clear();
p.pairRepIdx.clear(); p.pairRepIdx.clear();
p.pairSingletonIdx.clear();
/* generateDuplicateIndexes计算冗余read在所有read中的位置索引 */ /* generateDuplicateIndexes计算冗余read在所有read中的位置索引 */
// 先处理 pair // 先处理 pair
processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx, &p.pairRepIdx); processPairs(p.pairs, &p.pairDupIdx, &p.pairOpticalDupIdx, &p.pairRepIdx, &p.pairSingletonIdx);
// cout << p.pairDupIdx.size() << "\t" << p.pairRepIdx.size() << endl; // cout << p.pairDupIdx.size() << "\t" << p.pairRepIdx.size() << endl;
// 再处理frag // 再处理frag
@ -373,8 +397,9 @@ static inline void refreshFragDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &not
/* 将pairs重叠部分的dup idx放进数据中 */ /* 将pairs重叠部分的dup idx放进数据中 */
static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &singletonIdx, MDSet<int64_t> &notDupIdx,
MDSet<int64_t> &notRepIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) { MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx,
MDSet<int64_t> &notSingletonIdx, MarkDupDataArg *lastArg, MarkDupDataArg *curArg) {
auto &lp = *lastArg; auto &lp = *lastArg;
auto &p = *curArg; auto &p = *curArg;
for (auto idx : dupIdx) { for (auto idx : dupIdx) {
@ -389,13 +414,18 @@ static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opt
lp.pairRepIdx.insert(idx); lp.pairRepIdx.insert(idx);
p.pairRepIdx.erase(idx); p.pairRepIdx.erase(idx);
} }
// for (auto idx : notDupIdx) { for (auto idx : singletonIdx) {
lp.pairSingletonIdx.insert(idx);
p.pairSingletonIdx.erase(idx);
}
for (auto idx : notDupIdx) {
// if (lp.pairDupIdx.find(idx) != lp.pairDupIdx.end()) cout << "find-1: " << idx << endl; // if (lp.pairDupIdx.find(idx) != lp.pairDupIdx.end()) cout << "find-1: " << idx << endl;
// if (lp.pairDupIdx.find({idx}) != lp.pairDupIdx.end()) cout << "find-2: " << idx << endl; // if (lp.pairDupIdx.find({idx}) != lp.pairDupIdx.end()) cout << "find-2: " << idx << endl;
// if (p.pairDupIdx.find(idx) != p.pairDupIdx.end()) cout << "find-3: " << idx << endl; // if (p.pairDupIdx.find(idx) != p.pairDupIdx.end()) cout << "find-3: " << idx << endl;
// if (p.pairDupIdx.find({idx}) != p.pairDupIdx.end()) cout << "find-4: " << idx << endl; // if (p.pairDupIdx.find({idx}) != p.pairDupIdx.end()) cout << "find-4: " << idx << endl;
// lp.pairDupIdx.erase(idx); p.pairDupIdx.erase(idx); lp.pairDupIdx.erase(idx);
// } p.pairDupIdx.erase(idx);
}
for (auto idx : notOpticalDupIdx) { for (auto idx : notOpticalDupIdx) {
lp.pairOpticalDupIdx.erase(idx); lp.pairOpticalDupIdx.erase(idx);
p.pairOpticalDupIdx.erase(idx); p.pairOpticalDupIdx.erase(idx);
@ -404,23 +434,31 @@ static inline void refreshPairDupIdx(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opt
lp.pairRepIdx.erase(idx); lp.pairRepIdx.erase(idx);
p.pairRepIdx.erase(idx); p.pairRepIdx.erase(idx);
} }
for (auto idx : notSingletonIdx) {
lp.pairSingletonIdx.erase(idx);
p.pairSingletonIdx.erase(idx);
}
} }
// 用来分别处理dup和optical dup // 用来分别处理dup和optical dup
static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx, static void refeshTaskDupInfo(DPSet<DupInfo> &dupIdx, MDSet<int64_t> &opticalDupIdx, DPSet<DupInfo> &repIdx,
MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx, MDSet<int64_t> &notRepIdx, MDSet<int64_t> &singletonIdx, MDSet<int64_t> &notDupIdx, MDSet<int64_t> &notOpticalDupIdx,
DPSet<DupInfo> &latterDupIdx, MDSet<int64_t> &latterOpticalDupIdx, MDSet<int64_t> &notRepIdx, MDSet<int64_t> &notSingletonIdx, DPSet<DupInfo> &latterDupIdx,
DPSet<DupInfo> &latterRepIdx, MDSet<int64_t> &latterNotDupIdx, MDSet<int64_t> &latterOpticalDupIdx, DPSet<DupInfo> &latterRepIdx,
MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx) { MDSet<int64_t> &latterSingletonIdx, MDSet<int64_t> &latterNotDupIdx,
MDSet<int64_t> &latterNotOpticalDupIdx, MDSet<int64_t> &latterNotRepIdx,
MDSet<int64_t> &latterNotSingletonIdx) {
for (auto idx : dupIdx) { for (auto idx : dupIdx) {
latterDupIdx.insert(idx); latterDupIdx.insert(idx);
// latterNotDupIdx.erase(idx); // 后来的更新为准 // latterNotDupIdx.erase(idx); // 后来的更新为准
} }
for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx); for (auto idx : opticalDupIdx) latterOpticalDupIdx.insert(idx);
for (auto idx : repIdx) latterRepIdx.insert(idx); for (auto idx : repIdx) latterRepIdx.insert(idx);
for (auto idx : singletonIdx) latterSingletonIdx.insert(idx);
for (auto idx : notDupIdx) latterNotDupIdx.insert(idx); for (auto idx : notDupIdx) latterNotDupIdx.insert(idx);
for (auto idx : notOpticalDupIdx) latterNotOpticalDupIdx.insert(idx); for (auto idx : notOpticalDupIdx) latterNotOpticalDupIdx.insert(idx);
for (auto idx : notRepIdx) latterNotRepIdx.insert(idx); for (auto idx : notRepIdx) latterNotRepIdx.insert(idx);
for (auto idx : notSingletonIdx) latterNotSingletonIdx.insert(idx);
} }
/* 最后合并数据并排序 */ /* 最后合并数据并排序 */
@ -468,6 +506,9 @@ static void refeshFinalTaskDupInfo(DupContainer &dupIdx, MDSet<int64_t> &notDupI
dupArr = midArr; dupArr = midArr;
} }
static int64_t llp_frag_end_pos = 0;
static int64_t llp_pair_end_pos = 0;
static int64_t gSingletonNum = 0;
/* 处理相邻的两个任务,有相交叉的数据 */ /* 处理相邻的两个任务,有相交叉的数据 */
static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg, GlobalDataArg *gDataArg) { static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg, GlobalDataArg *gDataArg) {
auto &lp = *lastArg; auto &lp = *lastArg;
@ -478,21 +519,36 @@ static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg,
DPSet<DupInfo> dupIdx; DPSet<DupInfo> dupIdx;
MDSet<int64_t> opticalDupIdx; MDSet<int64_t> opticalDupIdx;
DPSet<DupInfo> repIdx; DPSet<DupInfo> repIdx;
MDSet<int64_t> singletonIdx;
MDSet<int64_t> notOpticalDupIdx; MDSet<int64_t> notOpticalDupIdx;
MDSet<int64_t> notDupIdx; MDSet<int64_t> notDupIdx;
MDSet<int64_t> notRepIdx; MDSet<int64_t> notRepIdx;
MDSet<int64_t> notSingletonIdx;
// 先处理重叠的frags // 先处理重叠的frags
getIntersectData(lp.frags, p.frags, &reArr); getIntersectData(lp.frags, p.frags, &reArr);
processFrags(reArr, &dupIdx, &notDupIdx); processFrags(reArr, &dupIdx, &notDupIdx);
refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p); refreshFragDupIdx(dupIdx, notDupIdx, &lp, &p);
if (!p.frags.empty())
if (llp_frag_end_pos >= p.frags.begin()->posKey) {
cout << "frags : " << llp_frag_end_pos << "\t" << p.frags[0].posKey << "\t" << p.frags.rbegin()->posKey << endl;
}
if (!p.pairs.empty())
if (llp_pair_end_pos >= p.pairs.begin()->posKey) {
cout << "pairs : " << llp_pair_end_pos << "\t" << p.pairs[0].posKey << "\t" << p.pairs.rbegin()->posKey << endl;
}
if (!p.frags.empty())llp_frag_end_pos = lp.frags.rbegin()->posKey;
if (!p.pairs.empty()) llp_pair_end_pos = lp.pairs.rbegin()->posKey;
// cout << "frags lp: " << lp.frags[0].posKey << "\t" << lp.frags.rbegin()->posKey << endl;
// cout << "frags p : " << p.frags[0].posKey << "\t" << p.frags.rbegin()->posKey << endl;
// cout << "pairs lp: " << lp.pairs[0].posKey << "\t" << lp.pairs.rbegin()->posKey << endl;
// cout << "pairs p : " << p.pairs[0].posKey << "\t" << p.pairs.rbegin()->posKey << endl;
// 再处理重叠的pairs // 再处理重叠的pairs
reArr.clear(); reArr.clear();
dupIdx.clear(); dupIdx.clear();
notDupIdx.clear(); notDupIdx.clear();
getIntersectData(lp.pairs, p.pairs, &reArr, true); getIntersectData(lp.pairs, p.pairs, &reArr, true);
processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx); processPairs(reArr, &dupIdx, &opticalDupIdx, &repIdx, &singletonIdx, &notDupIdx, &notOpticalDupIdx, &notRepIdx, &notSingletonIdx);
refreshPairDupIdx(dupIdx, opticalDupIdx, repIdx, notDupIdx, notOpticalDupIdx, notRepIdx, &lp, &p); refreshPairDupIdx(dupIdx, opticalDupIdx, repIdx, singletonIdx, notDupIdx, notOpticalDupIdx, notRepIdx, notSingletonIdx, & lp, &p);
// cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl; // cout << (g.unpairedDic.find("A01415:368:HL7NTDSX3:3:1104:5195:34757") != g.unpairedDic.end()) << endl;
// cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl; // cout << (g.unpairedPosArr.find(14293757783047) != g.unpairedPosArr.end()) << endl;
@ -676,8 +732,8 @@ static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg,
// if (p.taskSeq == 163) { // if (p.taskSeq == 163) {
// cout << "final" << endl; // cout << "final" << endl;
// } // }
processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notOpticalDupIdx, processPairs(*pairArrP, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.singletonIdx, &t.notDupIdx, &t.notOpticalDupIdx,
&t.notRepIdx); &t.notRepIdx, &t.notSingletonIdx);
if (taskSeq < lp.taskSeq) if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey); g.unpairedPosArr.erase(posKey);
} }
@ -705,16 +761,18 @@ static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg,
auto &t = e.second; auto &t = e.second;
// cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl; // cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl;
if (taskSeq < lp.taskSeq) { if (taskSeq < lp.taskSeq) {
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], t.notRepIdx, t.notSingletonIdx, g.latterDupIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]); g.latterSingletonIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq],
g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq],
g.latterNotSingletonIdxArr[taskSeq]);
} else if (taskSeq == lp.taskSeq) { } else if (taskSeq == lp.taskSeq) {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &lp, refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, t.notSingletonIdx, &lp,
&p); &p);
} else { } else {
refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, &p, refreshPairDupIdx(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx,
&lp); // 把结果放到p中 t.notRepIdx, t.notSingletonIdx, &p, &lp); // 把结果放到p中
} }
} }
@ -724,9 +782,11 @@ static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg,
g.latterDupIdxArr.push_back(DPSet<DupInfo>()); g.latterDupIdxArr.push_back(DPSet<DupInfo>());
g.latterOpticalDupIdxArr.push_back(MDSet<int64_t>()); g.latterOpticalDupIdxArr.push_back(MDSet<int64_t>());
g.latterRepIdxArr.push_back(DPSet<DupInfo>()); g.latterRepIdxArr.push_back(DPSet<DupInfo>());
g.latterSingletonIdxArr.push_back(MDSet<int64_t>());
g.latterNotDupIdxArr.push_back(MDSet<int64_t>()); g.latterNotDupIdxArr.push_back(MDSet<int64_t>());
g.latterNotOpticalDupIdxArr.push_back(MDSet<int64_t>()); g.latterNotOpticalDupIdxArr.push_back(MDSet<int64_t>());
g.latterNotRepIdxArr.push_back(MDSet<int64_t>()); g.latterNotRepIdxArr.push_back(MDSet<int64_t>());
g.latterNotSingletonIdxArr.push_back(MDSet<int64_t>());
g.dupIdxArr.push_back(vector<DupInfo>()); g.dupIdxArr.push_back(vector<DupInfo>());
auto &vIdx = g.dupIdxArr.back(); auto &vIdx = g.dupIdxArr.back();
@ -745,6 +805,12 @@ static void handleIntersectData(MarkDupDataArg *lastArg, MarkDupDataArg *curArg,
auto &vRepIdx = g.repIdxArr.back(); auto &vRepIdx = g.repIdxArr.back();
vRepIdx.insert(vRepIdx.end(), lp.pairRepIdx.begin(), lp.pairRepIdx.end()); vRepIdx.insert(vRepIdx.end(), lp.pairRepIdx.begin(), lp.pairRepIdx.end());
std::sort(vRepIdx.begin(), vRepIdx.end()); std::sort(vRepIdx.begin(), vRepIdx.end());
g.singletonIdxArr.push_back(vector<int64_t>());
auto &vSingletonIdx = g.singletonIdxArr.back();
vSingletonIdx.insert(vSingletonIdx.end(), lp.pairSingletonIdx.begin(), lp.pairSingletonIdx.end());
std::sort(vSingletonIdx.begin(), vSingletonIdx.end());
gSingletonNum += lp.pairSingletonIdx.size();
} }
/* 当所有任务结束后global data里还有未处理的数据 */ /* 当所有任务结束后global data里还有未处理的数据 */
@ -782,9 +848,10 @@ static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) {
if (arr.size() > 1) { if (arr.size() > 1) {
std::sort(arr.begin(), arr.end()); std::sort(arr.begin(), arr.end());
// cout << "last task before mark pair" << endl; // cout << "last task before mark pair" << endl;
processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.notDupIdx, &t.notRepIdx); processPairs(arr, &t.dupIdx, &t.opticalDupIdx, &t.repIdx, &t.singletonIdx, &t.notDupIdx, &t.notOpticalDupIdx, &t.notRepIdx, &t.notSingletonIdx);
} else if (arr.size() == 1) { } else if (arr.size() == 1) {
gMetrics.singletonReads.insert(arr[0].read1IndexInFile); // gMetrics.singletonReads.insert(arr[0].read1IndexInFile);
t.singletonIdx.insert(arr[0].read1IndexInFile);
} }
} }
// 更新结果 // 更新结果
@ -794,10 +861,11 @@ static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) {
auto taskSeq = e.first; auto taskSeq = e.first;
auto &t = e.second; auto &t = e.second;
// cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl; // cout << t.dupIdx.size() << "\t" << t.notDupIdx.size() << endl;
refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.notDupIdx, t.notOpticalDupIdx, t.notRepIdx, refeshTaskDupInfo(t.dupIdx, t.opticalDupIdx, t.repIdx, t.singletonIdx, t.notDupIdx, t.notOpticalDupIdx,
g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], t.notRepIdx, t.notSingletonIdx, g.latterDupIdxArr[taskSeq], g.latterOpticalDupIdxArr[taskSeq],
g.latterNotDupIdxArr[taskSeq], g.latterNotOpticalDupIdxArr[taskSeq], g.latterRepIdxArr[taskSeq], g.latterSingletonIdxArr[taskSeq], g.latterNotDupIdxArr[taskSeq],
g.latterNotRepIdxArr[taskSeq]); g.latterNotOpticalDupIdxArr[taskSeq], g.latterNotRepIdxArr[taskSeq],
g.latterNotSingletonIdxArr[taskSeq]);
} }
// cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl; // cout << "last unpair info: " << g.unpairedDic.size() << '\t' << g.unpairedPosArr.size() << endl;
@ -871,6 +939,9 @@ static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) {
intCacheDupIdx, intMidArr); intCacheDupIdx, intMidArr);
for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i) for (int i = 0; i < (int)g.repIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr); refeshFinalTaskDupInfo(g.latterRepIdxArr[i], g.latterNotRepIdxArr[i], g.repIdxArr[i], cacheDupIdx, midArr);
for (int i = 0; i < (int)g.singletonIdxArr.size() - 1; ++i)
refeshFinalTaskDupInfo(g.latterSingletonIdxArr[i], g.latterNotSingletonIdxArr[i], g.singletonIdxArr[i],
intCacheDupIdx, intMidArr);
g.dupIdxArr.push_back(vector<DupInfo>()); g.dupIdxArr.push_back(vector<DupInfo>());
auto &vIdx = g.dupIdxArr.back(); auto &vIdx = g.dupIdxArr.back();
@ -895,18 +966,12 @@ static void handleLastTask(MarkDupDataArg *task, GlobalDataArg *gDataArg) {
auto &vRepIdx = g.repIdxArr.back(); auto &vRepIdx = g.repIdxArr.back();
vRepIdx.insert(vRepIdx.end(), lp.pairRepIdx.begin(), lp.pairRepIdx.end()); vRepIdx.insert(vRepIdx.end(), lp.pairRepIdx.begin(), lp.pairRepIdx.end());
std::sort(vRepIdx.begin(), vRepIdx.end()); std::sort(vRepIdx.begin(), vRepIdx.end());
}
void calculateMetrics(MarkDupDataArg &lp, MarkDupDataArg &p, GlobalDataArg &g, DuplicationMetrics *pgMetrics) { g.singletonIdxArr.push_back(vector<int64_t>());
DuplicationMetrics &gMetrics = *pgMetrics; auto &vSingletonIdx = g.singletonIdxArr.back();
vSingletonIdx.insert(vSingletonIdx.end(), lp.pairSingletonIdx.begin(), lp.pairSingletonIdx.end());
// gMetrics.READ_PAIRS_EXAMINED /= 2; std::sort(vSingletonIdx.begin(), vSingletonIdx.end());
gSingletonNum += lp.pairSingletonIdx.size();
cout << "calculateMetrics start: " << endl;
cout << lp.unpairedDic.size() << "\t" << p.unpairedDic.size() << "\t" << g.unpairedDic.size() << endl;
cout << "all: " << (lp.unpairedDic.size() + p.unpairedDic.size() + g.unpairedDic.size()) << endl;
cout << "calculateMetrics end" << endl;
} }
/* 串行处理数据,标记冗余 */ /* 串行处理数据,标记冗余 */
@ -932,6 +997,8 @@ void serialMarkDups() {
// 读取bam文件中的read // 读取bam文件中的read
tm_arr[4].acc_start(); tm_arr[4].acc_start();
size_t readNum = inBamBuf.ReadBam(); size_t readNum = inBamBuf.ReadBam();
if (readNum < 1)
break;
readNumSum += readNum; readNumSum += readNum;
// readNumSum += inBamBuf.GetBamArr().size(); // readNumSum += inBamBuf.GetBamArr().size();
tm_arr[4].acc_end(); tm_arr[4].acc_end();
@ -986,9 +1053,6 @@ void serialMarkDups() {
// 处理剩下的全局数据 // 处理剩下的全局数据
handleLastTask(lastArgP, &gData); handleLastTask(lastArgP, &gData);
// 计算各种统计指标
// calculateMetrics(*lastArgP, *curArgP, gData, &gMetrics);
// cout << "here 2" << endl; // cout << "here 2" << endl;
tm_arr[3].acc_end(); tm_arr[3].acc_end();
@ -996,22 +1060,47 @@ void serialMarkDups() {
// 统计所有冗余index数量 // 统计所有冗余index数量
int64_t dupNum = 0; int64_t dupNum = 0;
int64_t opticalDupNum = 0; int64_t opticalDupNum = 0;
int64_t singletonNum = 0;
int64_t dupNumDic = 0;
int64_t singletonNumDic = 0;
map<int64_t, int> dup; map<int64_t, int> dup;
int taskSeq = 0; int taskSeq = 0;
/* for (auto &arr : gData.dupIdxArr) { // for (auto &arr : gData.dupIdxArr) {
for (auto idx : arr) { // for (auto idx : arr) {
if (dup.find(idx.idx) != dup.end()) { // if (dup.find(idx.idx) != dup.end()) {
// cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t' // // cout << "dup index: " << dup[idx] << '\t' << taskSeq << '\t'
// << idx << endl; // // << idx << endl;
} // }
dup[idx.idx] = taskSeq; // dup[idx.idx] = taskSeq;
} // }
// cout << taskSeq << "\t" << arr.size() << endl; // // cout << taskSeq << "\t" << arr.size() << endl;
taskSeq++; // taskSeq++;
} // }
*/ // dupNumDic = dup.size();
// dup.clear();
// int notInMetrics = 0;
// cout << "gmetrics single count: " << gMetrics.singletonReads.size() << endl;
// for (auto &arr : gData.singletonIdxArr) {
// for (auto idx : arr) {
// dup[idx] = 1;
// if (gMetrics.singletonReads.find(idx) == gMetrics.singletonReads.end()) {
//// cout << "not in gmetrics: " << idx << endl;
// ++notInMetrics;
// } else {
// gMetrics.singletonReads.erase(idx);
// }
// }
// }
// singletonNumDic = dup.size();
// cout << "not in arr: " << endl;
// for (auto idx : gMetrics.singletonReads) {
//// cout << idx << endl;
// }
// cout << "count: " << notInMetrics << "\t" << gMetrics.singletonReads.size() << endl;
// #include <fstream> // #include <fstream>
// ofstream out("tumor_dup.txt"); // ofstream out("tumor_dup.txt");
// for (auto idx : dup) // for (auto idx : dup)
@ -1022,8 +1111,12 @@ void serialMarkDups() {
for (auto &arr : gData.dupIdxArr) dupNum += arr.size(); for (auto &arr : gData.dupIdxArr) dupNum += arr.size();
for (auto &arr : gData.opticalDupIdxArr) opticalDupNum += arr.size(); for (auto &arr : gData.opticalDupIdxArr) opticalDupNum += arr.size();
for (auto &arr : gData.singletonIdxArr) singletonNum += arr.size();
cout << "dup num : " << dupNum << '\t' << dup.size() << "\t" << zzhtestnum << endl; cout << "dup num : " << dupNum << '\t' << dupNumDic << "\t" << zzhtestnum << endl;
cout << "singleton : " << singletonNum << "\t" << singletonNumDic << "\t" << gSingletonNum << endl;
cout << "singleton size: " << gMetrics.singletonReads.size() << endl;
cout << "dup read size: " << gMetrics.dupReads.size() << endl;
cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl; cout << "calc readend: " << tm_arr[0].acc_seconds_elapsed() << endl;
cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl; cout << "markdup : " << tm_arr[1].acc_seconds_elapsed() << endl;
@ -1036,8 +1129,8 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "optical size: " << opticalDupNum << endl; cout << "optical size: " << opticalDupNum << "\t" << opticalDupNum / 2 << endl;
cout << "singleton size: " << gMetrics.singletonReads.size() << endl;
Timer::log_time("serial end "); Timer::log_time("serial end ");
cout << "read num sum: " << readNumSum << endl; cout << "read num sum: " << readNumSum << endl;