需要完善optical dup idx计算

This commit is contained in:
zzh 2024-08-28 12:00:23 +08:00
parent 86954ffa85
commit 7352a00f2c
10 changed files with 97 additions and 63 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
# for fast-markdup # for fast-markdup
*.sam *.sam
*.bam
*.log *.log
# ---> C++ # ---> C++
# Prerequisites # Prerequisites

4
.vscode/launch.json vendored
View File

@ -13,11 +13,11 @@
"program": "${workspaceRoot}/build/bin/picard_cpp", "program": "${workspaceRoot}/build/bin/picard_cpp",
"args": [ "args": [
"MarkDuplicates", "MarkDuplicates",
"--INPUT", "~/data/bam/100w.bam", "--INPUT", "~/data/bam/zy_normal.bam",
"--OUTPUT", "out.bam", "--OUTPUT", "out.bam",
"--METRICS_FILE", "metrics.txt", "--METRICS_FILE", "metrics.txt",
"--num_threads", "1", "--num_threads", "1",
"--max_mem", "4G", "--max_mem", "100G",
"--verbosity", "DEBUG", "--verbosity", "DEBUG",
"--asyncio", "true", "--asyncio", "true",
], ],

8
run.sh
View File

@ -1,6 +1,8 @@
#input=~/data/bam/zy_normal.bam input=~/data/bam/zy_normal.bam
input=~/data/bam/zy_tumor.bam #input=~/data/bam/zy_tumor.bam
#input=~/data/bam/100w.bam #input=~/data/bam/100w.bam
#input=~/data/bam/1kw.sam
#input=~/data/bam/n1kw.sam
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
MarkDuplicates \ MarkDuplicates \
@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
--OUTPUT ~/data/bam/out.bam \ --OUTPUT ~/data/bam/out.bam \
--INDEX_FORMAT BAI \ --INDEX_FORMAT BAI \
--num_threads 1 \ --num_threads 1 \
--max_mem 2G \ --max_mem 1G \
--verbosity DEBUG \ --verbosity DEBUG \
--asyncio true #\ --asyncio true #\
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$" #--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"

View File

@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) {
mem_arg <<= 20; mem_arg <<= 20;
else if (*q == 'g' || *q == 'G') else if (*q == 'g' || *q == 'G')
mem_arg <<= 30; mem_arg <<= 30;
if (mem_arg >= max_mem) //if (mem_arg >= max_mem)
if (true)
max_mem = mem_arg; max_mem = mem_arg;
else { else {
std::cerr << "[Warn] Too small mem size, use default" << std::endl; std::cerr << "[Warn] Too small mem size, use default" << std::endl;

View File

@ -57,6 +57,10 @@ static GlobalDataArg gData_;
GlobalDataArg &gData = gData_; GlobalDataArg &gData = gData_;
DuplicationMetrics gMetrics_; DuplicationMetrics gMetrics_;
DuplicationMetrics &gMetrics = gMetrics_; DuplicationMetrics &gMetrics = gMetrics_;
int zzhtestnum = 0;
set<int64_t> zzhopticalSet;
vector<int64_t> zzhopticalArr;
/* /*
* mark duplicate * mark duplicate
* bambarcode * bambarcode
@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) {
while (inBuf.ReadStat() >= 0) { while (inBuf.ReadStat() >= 0) {
Timer tw1; Timer tw1;
size_t readNum = inBuf.ReadBam(); size_t readNum = inBuf.ReadBam();
cout << "read: " << readNum << endl; // cout << "read: " << readNum << endl;
for (size_t i = 0; i < inBuf.Size(); ++i) { for (size_t i = 0; i < inBuf.Size(); ++i) {
/* 判断是否冗余 */ /* 判断是否冗余 */
if (bamIdx == dupIdx) { if (bamIdx == dupIdx) {
// cout << "冗余" << bamIdx << endl; // cerr << bamIdx << endl;
dupIdx = idxQue.Pop(); dupIdx = idxQue.Pop();
} }
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) { if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {

View File

@ -213,7 +213,7 @@ struct MarkDupsArg
ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted; ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
/* "The scoring strategy for choosing the non-duplicate among candidates." */ /* "The scoring strategy for choosing the non-duplicate among candidates." */
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH; ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
/* "The program record ID for the @PG record(s) created by this program. Set to null to disable " + /* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
"PG record creation. This string may have a suffix appended to avoid collision with other " + "PG record creation. This string may have a suffix appended to avoid collision with other " +

View File

@ -28,8 +28,6 @@ using std::set;
using std::unordered_map; using std::unordered_map;
using std::vector; using std::vector;
static int zzhtestnum = 0;
/* 清除key位置的数据 */ /* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) { void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx; auto &msIdx = *pmsIdx;
@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
/* 对找到的pairend read end添加一些信息 */ /* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds; auto &pairedEnds = *pPairedEnds;
int64_t bamIdx = fragEnd.read1IndexInFile; int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex; const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate; const int matesCoordinate = fragEnd.read1Coordinate;
@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags); findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
int opticalDuplicates = 0; int opticalDuplicates = 0;
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) { for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
if (opticalDuplicateFlags[i]) { if (opticalDuplicateFlags[i]) {
++opticalDuplicates; ++opticalDuplicates;
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]); // if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
// cout << "val: " << pRe->isOpticalDuplicate << endl;
// }
pRe->isOpticalDuplicate = true; pRe->isOpticalDuplicate = true;
zzhopticalSet.insert(pRe->read1IndexInFile);
zzhopticalSet.insert(pRe->read2IndexInFile);
zzhopticalArr.push_back(pRe->read1IndexInFile);
zzhopticalArr.push_back(pRe->read2IndexInFile);
} else {
pRe->isOpticalDuplicate = false;
zzhopticalSet.erase(pRe->read1IndexInFile);
zzhopticalSet.erase(pRe->read2IndexInFile);
} }
} }
if (opticalDuplicates > 0)
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
return opticalDuplicates; return opticalDuplicates;
} }
@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
*/ */
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) { void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false; bool hasFR = false, hasRF = false;
int prevOpticalDupNum = 0;
// Check to see if we have a mixture of FR/RF // Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) { for (auto pRe : readEndsArr) {
if (pRe->isOpticalDuplicate)
++prevOpticalDupNum;
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true; hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
if (nOpticalDup) if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1; gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl; // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
} }

View File

@ -13,10 +13,10 @@
#include <set> #include <set>
#include <vector> #include <vector>
#include "dup_metrics.h"
#include "markdups_arg.h" #include "markdups_arg.h"
#include "md_funcs.h" #include "md_funcs.h"
#include "shared_args.h" #include "shared_args.h"
#include "dup_metrics.h"
using std::cout; using std::cout;
using std::set; using std::set;
@ -78,13 +78,18 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
} }
int maxScore = 0; int maxScore = 0;
const ReadEnds *pBest = nullptr; const ReadEnds *pBest = nullptr;
int maxOperateTime = 0;
/** All read ends should have orientation FF, FR, RF, or RR **/ /** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) { // 找分数最高的readend for (auto pe : vpRe) { // 找分数最高的readend
maxOperateTime = max(maxOperateTime, pe->oprateTime);
(const_cast<ReadEnds *>(pe))->oprateTime ++;
if (pe->score > maxScore || pBest == nullptr) { if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score; maxScore = pe->score;
pBest = pe; pBest = pe;
} }
} }
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
// if (maxOperateTime == 0) ++zzhtestnum;
if (notDupIdx != nullptr) { if (notDupIdx != nullptr) {
notDupIdx->insert(pBest->read1IndexInFile); notDupIdx->insert(pBest->read1IndexInFile);
notDupIdx->insert(pBest->read2IndexInFile); notDupIdx->insert(pBest->read2IndexInFile);
@ -93,10 +98,8 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
// trackOpticalDuplicates // trackOpticalDuplicates
trackOpticalDuplicates(vpRe, pBest); trackOpticalDuplicates(vpRe, pBest);
} }
for (auto pe : vpRe) // 对非best read标记冗余 for (auto pe : vpRe) { // 对非best read标记冗余
{ if (pe != pBest) { // 非best
if (pe != pBest) // 非best
{
dupIdx->insert(pe->read1IndexInFile); // 添加read1 dupIdx->insert(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile) if (pe->read2IndexInFile != pe->read1IndexInFile)
dupIdx->insert(pe->read2IndexInFile); // 添加read2 dupIdx->insert(pe->read2IndexInFile); // 添加read2
@ -139,8 +142,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
/* 找到与readend pos相等的所有readend */ /* 找到与readend pos相等的所有readend */
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) { static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
auto range = std::equal_range(src.begin(), src.end(), re, auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点
ReadEnds::PairsLittleThan); // 只比对位点
dst->insert(dst->end(), range.first, range.second); dst->insert(dst->end(), range.first, range.second);
} }
@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
} }
} }
tm_arr[9].acc_start(); tm_arr[9].acc_start();
//sortReadEndsArr(p.frags); sortReadEndsArr(p.frags);
sort(p.frags.begin(), p.frags.end()); // sort(p.frags.begin(), p.frags.end());
tm_arr[9].acc_end(); tm_arr[9].acc_end();
// cout << "sort pairs" << endl; // cout << "sort pairs" << endl;
tm_arr[10].acc_start(); tm_arr[10].acc_start();
@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
// 1. // 1.
// prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据; // prevpos在交叉部分之前nextpos在交叉部分之后这种情况不需要获取pairarr中的数据;
// 2. // 2.
// prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read // prevpos在交叉部分之前nextpos在交叉部分需要获取lp中的相等read pair进行重新计算
// pair进行重新计算 // 复杂情况1. g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 复杂情况1.
// g中包含prevPosKey对应的unpairp中有对应的pair此时应该把这些pair考虑进去
// 3. // 3.
// prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read // prevpos在交叉部分nextpos在交叉部分之后需要获取p中的相等read pair进行重新计算
// pair进行重新计算
// 复杂情况2. p中是否包含prevPosKey对应的unpair // 复杂情况2. p中是否包含prevPosKey对应的unpair
// 4. // 4.
// prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read // prevpos在交叉部分nextpos在交叉部分需要获取lp和p中的相等read pair进行重新计算
// pair进行重新计算
bool addDataToPos = true; bool addDataToPos = true;
if (alreadyAdd.find(ck) != alreadyAdd.end()) { if (alreadyAdd.find(ck) != alreadyAdd.end()) {
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了 // 之前已经添加过了,后面就不用再添加数据了,因为同一个位置可能找到两个及以上的unpair数据,处理之前的数据时候可能已经添加了这些数据
addDataToPos = false;
} else } else
alreadyAdd.insert(ck); alreadyAdd.insert(ck);
@ -535,8 +534,6 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
addToGlobal.insert(prevPosKey); addToGlobal.insert(prevPosKey);
} }
} }
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
map<int64_t, TaskSeqDupInfo> taskChanged; map<int64_t, TaskSeqDupInfo> taskChanged;
set<int64_t> posProcessed; set<int64_t> posProcessed;
@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
if (taskSeq < lp.taskSeq) if (taskSeq < lp.taskSeq)
g.unpairedPosArr.erase(posKey); g.unpairedPosArr.erase(posKey);
} }
// 更新结果
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
// 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改,比如optical duplicate
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
// 更新结果
for (auto &e : taskChanged) { for (auto &e : taskChanged) {
auto taskSeq = e.first; auto taskSeq = e.first;
auto &t = e.second; auto &t = e.second;
@ -719,8 +720,8 @@ void serialMarkDups() {
// cout << "round time: " << t_round.seconds_elapsed() << endl; // cout << "round time: " << t_round.seconds_elapsed() << endl;
roundNum++; roundNum++;
if (roundNum % 100 == 0) { if (roundNum % 100 == 0) {
cout << "read sum: " << readNumSum << endl; //cout << "read sum: " << readNumSum << endl;
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl; //cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
} }
} }
// cout << "here" << endl; // cout << "here" << endl;
@ -769,10 +770,10 @@ void serialMarkDups() {
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl; cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl; cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl; cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
<< gMetrics.NonOpticalDuplicateCountHist << "\t" << gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
<< gMetrics.OpticalDuplicatesCountHist << "\t" cout << "optical dup: " << zzhopticalSet.size() << endl;
<< gMetrics.OpticalDuplicatesByLibraryId << endl; cout << "optical arr dup: " << zzhopticalArr.size() << endl;
Timer::log_time("serial end "); Timer::log_time("serial end ");

View File

@ -6,6 +6,8 @@
#include <htslib/thread_pool.h> #include <htslib/thread_pool.h>
#include <sam/utils/read_ends.h> #include <sam/utils/read_ends.h>
#include <sam/utils/read_name_parser.h> #include <sam/utils/read_name_parser.h>
#include <set>
using std::set;
extern Timer tm_arr[20]; // 用来测试性能 extern Timer tm_arr[20]; // 用来测试性能
/* 全局本地变量 */ /* 全局本地变量 */
@ -24,3 +26,7 @@ class GlobalDataArg;
extern GlobalDataArg &gData; extern GlobalDataArg &gData;
class DuplicationMetrics; class DuplicationMetrics;
extern DuplicationMetrics &gMetrics; extern DuplicationMetrics &gMetrics;
extern int zzhtestnum;
extern set<int64_t> zzhopticalSet;
extern vector<int64_t> zzhopticalArr;

View File

@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation {
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid << int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos; // MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
/* 用来做一些判断,因为一些readends会做多次操作,比如task之间有重叠等等 */
int oprateTime = 0;
/* 根据pairend read的比对方向来确定整体的比对方向 */ /* 根据pairend read的比对方向来确定整体的比对方向 */
static int8_t GetOrientationByte(bool read1NegativeStrand, static int8_t GetOrientationByte(bool read1NegativeStrand,
bool read2NegativeStrand) { bool read2NegativeStrand) {