需要完善optical dup idx计算

This commit is contained in:
zzh 2024-08-28 12:00:23 +08:00
parent 86954ffa85
commit 7352a00f2c
10 changed files with 97 additions and 63 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
# for fast-markdup # for fast-markdup
*.sam *.sam
*.bam
*.log *.log
# ---> C++ # ---> C++
# Prerequisites # Prerequisites

4
.vscode/launch.json vendored
View File

@ -13,11 +13,11 @@
"program": "${workspaceRoot}/build/bin/picard_cpp", "program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [ "args": [
"MarkDuplicates", "MarkDuplicates",
"--INPUT", "~/data/bam/100w.bam", "--INPUT", "~/data/bam/zy_normal.bam",
"--OUTPUT", "out.bam", "--OUTPUT", "out.bam",
"--METRICS_FILE", "metrics.txt", "--METRICS_FILE", "metrics.txt",
"--num_threads", "1", "--num_threads", "1",
"--max_mem", "4G", "--max_mem", "100G",
"--verbosity", "DEBUG", "--verbosity", "DEBUG",
"--asyncio", "true", "--asyncio", "true",
], ],

8
run.sh
View File

@ -1,6 +1,8 @@
#input=~/data/bam/zy_normal.bam input=~/data/bam/zy_normal.bam
input=~/data/bam/zy_tumor.bam #input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam #input=~/data/bam/100w.bam
#input=~/data/bam/1kw.sam
#input=~/data/bam/n1kw.sam
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \ MarkDuplicates \
@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
--OUTPUT ~/data/bam/out.bam \ --OUTPUT ~/data/bam/out.bam \
--INDEX_FORMAT BAI \ --INDEX_FORMAT BAI \
--num_threads 1 \ --num_threads 1 \
--max_mem 2G \ --max_mem 1G \
--verbosity DEBUG \ --verbosity DEBUG \
--asyncio true #\ --asyncio true #\
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"

View File

@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) {
mem_arg <<= 20; mem_arg <<= 20;
else if (*q == 'g' || *q == 'G') else if (*q == 'g' || *q == 'G')
mem_arg <<= 30; mem_arg <<= 30;
if (mem_arg >= max_mem) //if (mem_arg >= max_mem)
if (true)
max_mem = mem_arg; max_mem = mem_arg;
else { else {
std::cerr << "[Warn] Too small mem size, use default" << std::endl; std::cerr << "[Warn] Too small mem size, use default" << std::endl;

View File

@ -57,6 +57,10 @@ static GlobalDataArg gData_;
GlobalDataArg &gData = gData_; GlobalDataArg &gData = gData_;
DuplicationMetrics gMetrics_; DuplicationMetrics gMetrics_;
DuplicationMetrics &gMetrics = gMetrics_; DuplicationMetrics &gMetrics = gMetrics_;
int zzhtestnum = 0;
set<int64_t> zzhopticalSet;
vector<int64_t> zzhopticalArr;
/* /*
* mark duplicate * mark duplicate
* bambarcode * bambarcode
@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) {
while (inBuf.ReadStat() >= 0) { while (inBuf.ReadStat() >= 0) {
Timer tw1; Timer tw1;
size_t readNum = inBuf.ReadBam(); size_t readNum = inBuf.ReadBam();
cout << "read: " << readNum << endl; // cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i) { for (size_t i = 0; i < inBuf.Size(); ++i) {
/* 判断是否冗余 */ /* 判断是否冗余 */
if (bamIdx == dupIdx) { if (bamIdx == dupIdx) {
// cout << "冗余" << bamIdx << endl; // cerr << bamIdx << endl;
dupIdx = idxQue.Pop(); dupIdx = idxQue.Pop();
} }
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) { if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {

View File

@ -213,7 +213,7 @@ struct MarkDupsArg
ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted; ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
/* "The scoring strategy for choosing the non-duplicate among candidates." */ /* "The scoring strategy for choosing the non-duplicate among candidates." */
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH; ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
/* "The program record ID for the @PG record(s) created by this program. Set to null to disable " + /* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
"PG record creation. This string may have a suffix appended to avoid collision with other " + "PG record creation. This string may have a suffix appended to avoid collision with other " +

View File

@ -28,8 +28,6 @@ using std::set;
using std::unordered_map; using std::unordered_map;
using std::vector; using std::vector;
static int zzhtestnum = 0;
/* 清除key位置的数据 */ /* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) { void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx; auto &msIdx = *pmsIdx;
@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
/* 对找到的pairend read end添加一些信息 */ /* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds; auto &pairedEnds = *pPairedEnds;
int64_t bamIdx = fragEnd.read1IndexInFile; int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex; const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate; const int matesCoordinate = fragEnd.read1Coordinate;
@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags); findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
int opticalDuplicates = 0; int opticalDuplicates = 0;
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) { for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
if (opticalDuplicateFlags[i]) { if (opticalDuplicateFlags[i]) {
++opticalDuplicates; ++opticalDuplicates;
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]); // if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
// cout << "val: " << pRe->isOpticalDuplicate << endl;
// }
pRe->isOpticalDuplicate = true; pRe->isOpticalDuplicate = true;
zzhopticalSet.insert(pRe->read1IndexInFile);
zzhopticalSet.insert(pRe->read2IndexInFile);
zzhopticalArr.push_back(pRe->read1IndexInFile);
zzhopticalArr.push_back(pRe->read2IndexInFile);
} else {
pRe->isOpticalDuplicate = false;
zzhopticalSet.erase(pRe->read1IndexInFile);
zzhopticalSet.erase(pRe->read2IndexInFile);
} }
} }
if (opticalDuplicates > 0)
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
return opticalDuplicates; return opticalDuplicates;
} }
@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
*/ */
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) { void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false; bool hasFR = false, hasRF = false;
int prevOpticalDupNum = 0;
// Check to see if we have a mixture of FR/RF // Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) { for (auto pRe : readEndsArr) {
if (pRe->isOpticalDuplicate)
++prevOpticalDupNum;
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true; hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
if (nOpticalDup) if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1; gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl; // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
} }

View File

@ -13,10 +13,10 @@
#include <set> #include <set>
#include <vector> #include <vector>
#include "dup_metrics.h"
#include "markdups_arg.h" #include "markdups_arg.h"
#include "md_funcs.h" #include "md_funcs.h"
#include "shared_args.h" #include "shared_args.h"
#include "dup_metrics.h"
using std::cout; using std::cout;
using std::set; using std::set;
@ -78,25 +78,28 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
} }
int maxScore = 0; int maxScore = 0;
const ReadEnds *pBest = nullptr; const ReadEnds *pBest = nullptr;
int maxOperateTime = 0;
/** All read ends should have orientation FF, FR, RF, or RR **/ /** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) { // 找分数最高的readend for (auto pe : vpRe) { // 找分数最高的readend
maxOperateTime = max(maxOperateTime, pe->oprateTime);
(const_cast<ReadEnds *>(pe))->oprateTime ++;
if (pe->score > maxScore || pBest == nullptr) { if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score; maxScore = pe->score;
pBest = pe; pBest = pe;
} }
} }
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
// if (maxOperateTime == 0) ++zzhtestnum;
if (notDupIdx != nullptr) { if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile); notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile); notDupIdx->insert(pBest->read2IndexInFile);
} }
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余 if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
// trackOpticalDuplicates // trackOpticalDuplicates
trackOpticalDuplicates(vpRe, pBest); trackOpticalDuplicates(vpRe, pBest);
} }
for (auto pe : vpRe) // 对非best read标记冗余 for (auto pe : vpRe) { // 对非best read标记冗余
{ if (pe != pBest) { // 非best
if (pe != pBest) // 非best
{
dupIdx->insert(pe->read1IndexInFile); // 添加read1 dupIdx->insert(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile) if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(pe->read2IndexInFile); // 添加read2 dupIdx->insert(pe->read2IndexInFile); // 添加read2
@ -139,8 +142,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
/* 找到与readend pos相等的所有readend */ /* 找到与readend pos相等的所有readend */
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) { static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
auto range = std::equal_range(src.begin(), src.end(), re, auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点
ReadEnds::PairsLittleThan); // 只比对位点
dst->insert(dst->end(), range.first, range.second); dst->insert(dst->end(), range.first, range.second);
} }
@ -158,24 +160,24 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
// set<ReadEnds> reSet; // set<ReadEnds> reSet;
// ReadEnds lastRe; // ReadEnds lastRe;
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
BamWrap *bw = p.bams[i]; BamWrap *bw = p.bams[i];
const int64_t bamIdx = p.bamStartIdx + i; const int64_t bamIdx = p.bamStartIdx + i;
if (bw->GetReadUnmappedFlag()) { if (bw->GetReadUnmappedFlag()) {
if (bw->b->core.tid == -1) if (bw->b->core.tid == -1)
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort). // When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
break; break;
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对 } else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
ReadEnds fragEnd; ReadEnds fragEnd;
tm_arr[8].acc_start(); tm_arr[8].acc_start();
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd); buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
tm_arr[8].acc_end(); tm_arr[8].acc_end();
p.frags.push_back(fragEnd); // 添加进frag集合 p.frags.push_back(fragEnd); // 添加进frag集合
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了 if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
string key = bw->query_name(); string key = bw->query_name();
if (p.unpairedDic.find(key) == p.unpairedDic.end()) { if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
p.unpairedDic[key] = {p.taskSeq, fragEnd}; p.unpairedDic[key] = {p.taskSeq, fragEnd};
} else { // 找到了pairend } else { // 找到了pairend
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE; auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
modifyPairedEnds(fragEnd, &pairedEnds); modifyPairedEnds(fragEnd, &pairedEnds);
p.pairs.push_back(pairedEnds); p.pairs.push_back(pairedEnds);
@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
} }
} }
tm_arr[9].acc_start(); tm_arr[9].acc_start();
//sortReadEndsArr(p.frags); sortReadEndsArr(p.frags);
sort(p.frags.begin(), p.frags.end()); // sort(p.frags.begin(), p.frags.end());
tm_arr[9].acc_end(); tm_arr[9].acc_end();
// cout << "sort pairs" << endl; // cout << "sort pairs" << endl;
tm_arr[10].acc_start(); tm_arr[10].acc_start();
@ -277,7 +279,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) { while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
leftSpan += 1; leftSpan += 1;
if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了可能会有bug上上个也与下一个有交集呢 if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了可能会有bug上上个也与下一个有交集呢
leftSpan = leftArr.size() - 1; leftSpan = leftArr.size() - 1;
break; break;
} }
@ -285,7 +287,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) { while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
rightSpan += 1; rightSpan += 1;
if (rightSpan == rightArr.size() - 1) // 同上可能会有bug if (rightSpan == rightArr.size() - 1) // 同上可能会有bug
break; break;
} }
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end()); dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// 1. // 1.
// prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据; // prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据;
// 2. // 2.
// prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read // prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read pair进行重新计算
// pair进行重新计算 // 复杂情况1. g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 复杂情况1.
// g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 3. // 3.
// prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read // prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read pair进行重新计算
// pair进行重新计算 // 复杂情况2. p中是否包含prevPosKey对应的unpair
// 复杂情况2. p中是否包含prevPosKey对应的unpair
// 4. // 4.
// prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read // prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read pair进行重新计算
// pair进行重新计算
bool addDataToPos = true; bool addDataToPos = true;
if (alreadyAdd.find(ck) != alreadyAdd.end()) { if (alreadyAdd.find(ck) != alreadyAdd.end()) {
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了 // 之前已经添加过了后面就不用再添加数据了因为同一个位置可能找到两个及以上的unpair数据处理之前的数据时候可能已经添加了这些数据
addDataToPos = false;
} else } else
alreadyAdd.insert(ck); alreadyAdd.insert(ck);
@ -478,9 +477,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
} }
recalcPos[ck] = prevPosInfo.taskSeq; recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end()); std::sort(prevPairArr.begin(), prevPairArr.end());
} else { // prevpos在交叉部分 } else { // prevpos在交叉部分
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况 if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
if (nextUnpairInfoP != nullptr) { // 且在pos点next task有unpair这样才把这些数据放到next task里 if (nextUnpairInfoP != nullptr) { // 且在pos点next task有unpair这样才把这些数据放到next task里
auto &nextPairArr = nextUnpairInfoP->pairArr; auto &nextPairArr = nextUnpairInfoP->pairArr;
nextPairArr.push_back(prevFragEnd); nextPairArr.push_back(prevFragEnd);
auto &prevPairArr = prevUnpairInfoP->pairArr; auto &prevPairArr = prevUnpairInfoP->pairArr;
@ -492,7 +491,7 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
recalcPos[ck] = nextPosInfo.taskSeq; recalcPos[ck] = nextPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end()); std::sort(prevPairArr.begin(), prevPairArr.end());
} else { // next task在该位点没有unpair那就把数据放到prev task里 } else { // next task在该位点没有unpair那就把数据放到prev task里
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
prevPairArr.push_back(prevFragEnd); prevPairArr.push_back(prevFragEnd);
if (addDataToPos) // 第二种情况 if (addDataToPos) // 第二种情况
@ -515,8 +514,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
std::sort(prevPairArr.begin(), prevPairArr.end()); std::sort(prevPairArr.begin(), prevPairArr.end());
} }
} }
p.unpairedDic.erase(readName); // 在next task里删除该read p.unpairedDic.erase(readName); // 在next task里删除该read
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read } else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
auto &remainPosInfo = g.unpairedDic[readName]; auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE; auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey; int64_t remainPosKey = remainFragEnd.posKey;
@ -529,14 +528,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
std::sort(remainPairArr.begin(), remainPairArr.end()); std::sort(remainPairArr.begin(), remainPairArr.end());
g.unpairedDic.erase(readName); g.unpairedDic.erase(readName);
} else { // 都没找到,那就保存到遗留数据里 } else { // 都没找到,那就保存到遗留数据里
int64_t prevPosKey = prevFragEnd.posKey; int64_t prevPosKey = prevFragEnd.posKey;
g.unpairedDic.insert(prevUnpair); g.unpairedDic.insert(prevUnpair);
addToGlobal.insert(prevPosKey); addToGlobal.insert(prevPosKey);
} }
} }
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
map<int64_t, TaskSeqDupInfo> taskChanged; map<int64_t, TaskSeqDupInfo> taskChanged;
set<int64_t> posProcessed; set<int64_t> posProcessed;
@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
if (taskSeq < lp.taskSeq) if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey); g.unpairedPosArr.erase(posKey);
} }
// 更新结果
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
// 放在这里因为lp中的unpairedPosArr中的readends可能会被修改比如optical duplicate
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
// 更新结果
for (auto &e : taskChanged) { for (auto &e : taskChanged) {
auto taskSeq = e.first; auto taskSeq = e.first;
auto &t = e.second; auto &t = e.second;
@ -594,12 +595,12 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
auto &lp = *task; auto &lp = *task;
auto &g = *gDataArg; auto &g = *gDataArg;
// 遗留的未匹配的pair // 遗留的未匹配的pair
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
auto &readName = prevUnpair.first; auto &readName = prevUnpair.first;
auto &prevPosInfo = prevUnpair.second; auto &prevPosInfo = prevUnpair.second;
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
auto &remainPosInfo = g.unpairedDic[readName]; auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE; auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey; int64_t remainPosKey = remainFragEnd.posKey;
@ -719,8 +720,8 @@ void serialMarkDups() {
// cout << "round time: " << t_round.seconds_elapsed() << endl; // cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++; roundNum++;
if (roundNum % 100 == 0) { if (roundNum % 100 == 0) {
cout << "read sum: " << readNumSum << endl; //cout << "read sum: " << readNumSum << endl;
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; //cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
} }
} }
// cout << "here" << endl; // cout << "here" << endl;
@ -769,10 +770,10 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
<< gMetrics.NonOpticalDuplicateCountHist << "\t" << gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
<< gMetrics.OpticalDuplicatesCountHist << "\t" cout << "optical dup: " << zzhopticalSet.size() << endl;
<< gMetrics.OpticalDuplicatesByLibraryId << endl; cout << "optical arr dup: " << zzhopticalArr.size() << endl;
Timer::log_time("serial end "); Timer::log_time("serial end ");

View File

@ -6,6 +6,8 @@
#include <htslib/thread_pool.h> #include <htslib/thread_pool.h>
#include <sam/utils/read_ends.h> #include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h> #include <sam/utils/read_name_parser.h>
#include <set>
using std::set;
extern Timer tm_arr[20]; // 用来测试性能 extern Timer tm_arr[20]; // 用来测试性能
/* 全局本地变量 */ /* 全局本地变量 */
@ -24,3 +26,7 @@ class GlobalDataArg;
extern GlobalDataArg &gData; extern GlobalDataArg &gData;
class DuplicationMetrics; class DuplicationMetrics;
extern DuplicationMetrics &gMetrics; extern DuplicationMetrics &gMetrics;
extern int zzhtestnum;
extern set<int64_t> zzhopticalSet;
extern vector<int64_t> zzhopticalArr;

View File

@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation {
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos; // MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
/* 用来做一些判断因为一些readends会做多次操作比如task之间有重叠等等 */
int oprateTime = 0;
/* 根据pairend read的比对方向来确定整体的比对方向 */ /* 根据pairend read的比对方向来确定整体的比对方向 */
static int8_t GetOrientationByte(bool read1NegativeStrand, static int8_t GetOrientationByte(bool read1NegativeStrand,
bool read2NegativeStrand) { bool read2NegativeStrand) {