需要完善optical dup idx计算

This commit is contained in:
zzh 2024-08-28 12:00:23 +08:00
parent 86954ffa85
commit 7352a00f2c
10 changed files with 97 additions and 63 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
# for fast-markdup
*.sam
*.bam
*.log
# ---> C++
# Prerequisites

4
.vscode/launch.json vendored
View File

@ -13,11 +13,11 @@
"program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [
"MarkDuplicates",
"--INPUT", "~/data/bam/100w.bam",
"--INPUT", "~/data/bam/zy_normal.bam",
"--OUTPUT", "out.bam",
"--METRICS_FILE", "metrics.txt",
"--num_threads", "1",
"--max_mem", "4G",
"--max_mem", "100G",
"--verbosity", "DEBUG",
"--asyncio", "true",
],

8
run.sh
View File

@ -1,6 +1,8 @@
#input=~/data/bam/zy_normal.bam
input=~/data/bam/zy_tumor.bam
input=~/data/bam/zy_normal.bam
#input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam
#input=~/data/bam/1kw.sam
#input=~/data/bam/n1kw.sam
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \
@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
--OUTPUT ~/data/bam/out.bam \
--INDEX_FORMAT BAI \
--num_threads 1 \
--max_mem 2G \
--max_mem 1G \
--verbosity DEBUG \
--asyncio true #\
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"

View File

@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) {
mem_arg <<= 20;
else if (*q == 'g' || *q == 'G')
mem_arg <<= 30;
if (mem_arg >= max_mem)
//if (mem_arg >= max_mem)
if (true)
max_mem = mem_arg;
else {
std::cerr << "[Warn] Too small mem size, use default" << std::endl;

View File

@ -57,6 +57,10 @@ static GlobalDataArg gData_;
GlobalDataArg &gData = gData_;
DuplicationMetrics gMetrics_;
DuplicationMetrics &gMetrics = gMetrics_;
int zzhtestnum = 0;
set<int64_t> zzhopticalSet;
vector<int64_t> zzhopticalArr;
/*
* mark duplicate
* bambarcode
@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) {
while (inBuf.ReadStat() >= 0) {
Timer tw1;
size_t readNum = inBuf.ReadBam();
cout << "read: " << readNum << endl;
// cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i) {
/* 判断是否冗余 */
if (bamIdx == dupIdx) {
// cout << "冗余" << bamIdx << endl;
// cerr << bamIdx << endl;
dupIdx = idxQue.Pop();
}
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {

View File

@ -213,7 +213,7 @@ struct MarkDupsArg
ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
/* "The scoring strategy for choosing the non-duplicate among candidates." */
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
/* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
"PG record creation. This string may have a suffix appended to avoid collision with other " +

View File

@ -28,8 +28,6 @@ using std::set;
using std::unordered_map;
using std::vector;
static int zzhtestnum = 0;
/* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
/* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds;
int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate;
@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
int opticalDuplicates = 0;
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
if (opticalDuplicateFlags[i]) {
++opticalDuplicates;
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
// cout << "val: " << pRe->isOpticalDuplicate << endl;
// }
pRe->isOpticalDuplicate = true;
zzhopticalSet.insert(pRe->read1IndexInFile);
zzhopticalSet.insert(pRe->read2IndexInFile);
zzhopticalArr.push_back(pRe->read1IndexInFile);
zzhopticalArr.push_back(pRe->read2IndexInFile);
} else {
pRe->isOpticalDuplicate = false;
zzhopticalSet.erase(pRe->read1IndexInFile);
zzhopticalSet.erase(pRe->read2IndexInFile);
}
}
if (opticalDuplicates > 0)
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
return opticalDuplicates;
}
@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
*/
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false;
int prevOpticalDupNum = 0;
// Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) {
if (pRe->isOpticalDuplicate)
++prevOpticalDupNum;
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
}

View File

@ -13,10 +13,10 @@
#include <set>
#include <vector>
#include "dup_metrics.h"
#include "markdups_arg.h"
#include "md_funcs.h"
#include "shared_args.h"
#include "dup_metrics.h"
using std::cout;
using std::set;
@ -78,25 +78,28 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
}
int maxScore = 0;
const ReadEnds *pBest = nullptr;
int maxOperateTime = 0;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) { // 找分数最高的readend
for (auto pe : vpRe) { // 找分数最高的readend
maxOperateTime = max(maxOperateTime, pe->oprateTime);
(const_cast<ReadEnds *>(pe))->oprateTime ++;
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
// if (maxOperateTime == 0) ++zzhtestnum;
if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile);
}
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
// trackOpticalDuplicates
trackOpticalDuplicates(vpRe, pBest);
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
for (auto pe : vpRe) { // 对非best read标记冗余
if (pe != pBest) { // 非best
dupIdx->insert(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(pe->read2IndexInFile); // 添加read2
@ -139,8 +142,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
/* 找到与readend pos相等的所有readend */
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
auto range = std::equal_range(src.begin(), src.end(), re,
ReadEnds::PairsLittleThan); // 只比对位点
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点
dst->insert(dst->end(), range.first, range.second);
}
@ -158,24 +160,24 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
// set<ReadEnds> reSet;
// ReadEnds lastRe;
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
BamWrap *bw = p.bams[i];
const int64_t bamIdx = p.bamStartIdx + i;
if (bw->GetReadUnmappedFlag()) {
if (bw->b->core.tid == -1)
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
break;
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
ReadEnds fragEnd;
tm_arr[8].acc_start();
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
tm_arr[8].acc_end();
p.frags.push_back(fragEnd); // 添加进frag集合
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
p.frags.push_back(fragEnd); // 添加进frag集合
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
string key = bw->query_name();
if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
p.unpairedDic[key] = {p.taskSeq, fragEnd};
} else { // 找到了pairend
} else { // 找到了pairend
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
modifyPairedEnds(fragEnd, &pairedEnds);
p.pairs.push_back(pairedEnds);
@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
}
}
tm_arr[9].acc_start();
//sortReadEndsArr(p.frags);
sort(p.frags.begin(), p.frags.end());
sortReadEndsArr(p.frags);
// sort(p.frags.begin(), p.frags.end());
tm_arr[9].acc_end();
// cout << "sort pairs" << endl;
tm_arr[10].acc_start();
@ -277,7 +279,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
leftSpan += 1;
if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了可能会有bug上上个也与下一个有交集呢
if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了可能会有bug上上个也与下一个有交集呢
leftSpan = leftArr.size() - 1;
break;
}
@ -285,7 +287,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
rightSpan += 1;
if (rightSpan == rightArr.size() - 1) // 同上可能会有bug
if (rightSpan == rightArr.size() - 1) // 同上可能会有bug
break;
}
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// 1.
// prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据;
// 2.
// prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read
// pair进行重新计算
// 复杂情况1.
// g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read pair进行重新计算
// 复杂情况1. g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 3.
// prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read
// pair进行重新计算
// 复杂情况2. p中是否包含prevPosKey对应的unpair
// prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read pair进行重新计算
// 复杂情况2. p中是否包含prevPosKey对应的unpair
// 4.
// prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read
// pair进行重新计算
// prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read pair进行重新计算
bool addDataToPos = true;
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
// 之前已经添加过了后面就不用再添加数据了因为同一个位置可能找到两个及以上的unpair数据处理之前的数据时候可能已经添加了这些数据
addDataToPos = false;
} else
alreadyAdd.insert(ck);
@ -478,9 +477,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
}
recalcPos[ck] = prevPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
} else { // prevpos在交叉部分
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
if (nextUnpairInfoP != nullptr) { // 且在pos点next task有unpair这样才把这些数据放到next task里
} else { // prevpos在交叉部分
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
if (nextUnpairInfoP != nullptr) { // 且在pos点next task有unpair这样才把这些数据放到next task里
auto &nextPairArr = nextUnpairInfoP->pairArr;
nextPairArr.push_back(prevFragEnd);
auto &prevPairArr = prevUnpairInfoP->pairArr;
@ -492,7 +491,7 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
recalcPos[ck] = nextPosInfo.taskSeq;
std::sort(prevPairArr.begin(), prevPairArr.end());
} else { // next task在该位点没有unpair那就把数据放到prev task里
} else { // next task在该位点没有unpair那就把数据放到prev task里
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
prevPairArr.push_back(prevFragEnd);
if (addDataToPos) // 第二种情况
@ -515,8 +514,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
std::sort(prevPairArr.begin(), prevPairArr.end());
}
}
p.unpairedDic.erase(readName); // 在next task里删除该read
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
p.unpairedDic.erase(readName); // 在next task里删除该read
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey;
@ -529,14 +528,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
std::sort(remainPairArr.begin(), remainPairArr.end());
g.unpairedDic.erase(readName);
} else { // 都没找到,那就保存到遗留数据里
} else { // 都没找到,那就保存到遗留数据里
int64_t prevPosKey = prevFragEnd.posKey;
g.unpairedDic.insert(prevUnpair);
addToGlobal.insert(prevPosKey);
}
}
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
map<int64_t, TaskSeqDupInfo> taskChanged;
set<int64_t> posProcessed;
@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey);
}
// 更新结果
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
// 放在这里因为lp中的unpairedPosArr中的readends可能会被修改比如optical duplicate
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
// 更新结果
for (auto &e : taskChanged) {
auto taskSeq = e.first;
auto &t = e.second;
@ -594,12 +595,12 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
auto &lp = *task;
auto &g = *gDataArg;
// 遗留的未匹配的pair
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
auto &readName = prevUnpair.first;
auto &prevPosInfo = prevUnpair.second;
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
auto &remainPosInfo = g.unpairedDic[readName];
auto remainFragEnd = remainPosInfo.unpairedRE;
int64_t remainPosKey = remainFragEnd.posKey;
@ -719,8 +720,8 @@ void serialMarkDups() {
// cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++;
if (roundNum % 100 == 0) {
cout << "read sum: " << readNumSum << endl;
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
//cout << "read sum: " << readNumSum << endl;
//cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
}
}
// cout << "here" << endl;
@ -769,10 +770,10 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t"
<< gMetrics.NonOpticalDuplicateCountHist << "\t"
<< gMetrics.OpticalDuplicatesCountHist << "\t"
<< gMetrics.OpticalDuplicatesByLibraryId << endl;
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
<< gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
cout << "optical dup: " << zzhopticalSet.size() << endl;
cout << "optical arr dup: " << zzhopticalArr.size() << endl;
Timer::log_time("serial end ");

View File

@ -6,6 +6,8 @@
#include <htslib/thread_pool.h>
#include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h>
#include <set>
using std::set;
extern Timer tm_arr[20]; // 用来测试性能
/* 全局本地变量 */
@ -24,3 +26,7 @@ class GlobalDataArg;
extern GlobalDataArg &gData;
class DuplicationMetrics;
extern DuplicationMetrics &gMetrics;
extern int zzhtestnum;
extern set<int64_t> zzhopticalSet;
extern vector<int64_t> zzhopticalArr;

View File

@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation {
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
/* 用来做一些判断因为一些readends会做多次操作比如task之间有重叠等等 */
int oprateTime = 0;
/* 根据pairend read的比对方向来确定整体的比对方向 */
static int8_t GetOrientationByte(bool read1NegativeStrand,
bool read2NegativeStrand) {