需要完善optical dup idx计算
This commit is contained in:
parent
86954ffa85
commit
7352a00f2c
|
|
@ -1,5 +1,6 @@
|
|||
# for fast-markdup
|
||||
*.sam
|
||||
*.bam
|
||||
*.log
|
||||
# ---> C++
|
||||
# Prerequisites
|
||||
|
|
|
|||
|
|
@ -13,11 +13,11 @@
|
|||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||
"args": [
|
||||
"MarkDuplicates",
|
||||
"--INPUT", "~/data/bam/100w.bam",
|
||||
"--INPUT", "~/data/bam/zy_normal.bam",
|
||||
"--OUTPUT", "out.bam",
|
||||
"--METRICS_FILE", "metrics.txt",
|
||||
"--num_threads", "1",
|
||||
"--max_mem", "4G",
|
||||
"--max_mem", "100G",
|
||||
"--verbosity", "DEBUG",
|
||||
"--asyncio", "true",
|
||||
],
|
||||
|
|
|
|||
8
run.sh
8
run.sh
|
|
@ -1,6 +1,8 @@
|
|||
#input=~/data/bam/zy_normal.bam
|
||||
input=~/data/bam/zy_tumor.bam
|
||||
input=~/data/bam/zy_normal.bam
|
||||
#input=~/data/bam/zy_tumor.bam
|
||||
#input=~/data/bam/100w.bam
|
||||
#input=~/data/bam/1kw.sam
|
||||
#input=~/data/bam/n1kw.sam
|
||||
|
||||
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
||||
MarkDuplicates \
|
||||
|
|
@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
|||
--OUTPUT ~/data/bam/out.bam \
|
||||
--INDEX_FORMAT BAI \
|
||||
--num_threads 1 \
|
||||
--max_mem 2G \
|
||||
--max_mem 1G \
|
||||
--verbosity DEBUG \
|
||||
--asyncio true #\
|
||||
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||
|
|
|
|||
|
|
@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) {
|
|||
mem_arg <<= 20;
|
||||
else if (*q == 'g' || *q == 'G')
|
||||
mem_arg <<= 30;
|
||||
if (mem_arg >= max_mem)
|
||||
//if (mem_arg >= max_mem)
|
||||
if (true)
|
||||
max_mem = mem_arg;
|
||||
else {
|
||||
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||
|
|
|
|||
|
|
@ -57,6 +57,10 @@ static GlobalDataArg gData_;
|
|||
GlobalDataArg &gData = gData_;
|
||||
DuplicationMetrics gMetrics_;
|
||||
DuplicationMetrics &gMetrics = gMetrics_;
|
||||
|
||||
int zzhtestnum = 0;
|
||||
set<int64_t> zzhopticalSet;
|
||||
vector<int64_t> zzhopticalArr;
|
||||
/*
|
||||
* mark duplicate
|
||||
* 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||
|
|
@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
while (inBuf.ReadStat() >= 0) {
|
||||
Timer tw1;
|
||||
size_t readNum = inBuf.ReadBam();
|
||||
cout << "read: " << readNum << endl;
|
||||
// cout << "read: " << readNum << endl;
|
||||
for (size_t i = 0; i < inBuf.Size(); ++i) {
|
||||
/* 判断是否冗余 */
|
||||
if (bamIdx == dupIdx) {
|
||||
// cout << "冗余" << bamIdx << endl;
|
||||
// cerr << bamIdx << endl;
|
||||
dupIdx = idxQue.Pop();
|
||||
}
|
||||
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
|
||||
|
|
|
|||
|
|
@ -213,7 +213,7 @@ struct MarkDupsArg
|
|||
ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
|
||||
|
||||
/* "The scoring strategy for choosing the non-duplicate among candidates." */
|
||||
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
|
||||
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
|
||||
|
||||
/* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
|
||||
"PG record creation. This string may have a suffix appended to avoid collision with other " +
|
||||
|
|
|
|||
|
|
@ -28,8 +28,6 @@ using std::set;
|
|||
using std::unordered_map;
|
||||
using std::vector;
|
||||
|
||||
static int zzhtestnum = 0;
|
||||
|
||||
/* 清除key位置的数据 */
|
||||
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
||||
auto &msIdx = *pmsIdx;
|
||||
|
|
@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
|
|||
/* 对找到的pairend read end添加一些信息 */
|
||||
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
||||
auto &pairedEnds = *pPairedEnds;
|
||||
|
||||
int64_t bamIdx = fragEnd.read1IndexInFile;
|
||||
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
||||
const int matesCoordinate = fragEnd.read1Coordinate;
|
||||
|
|
@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
|
|||
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
|
||||
int opticalDuplicates = 0;
|
||||
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
|
||||
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
||||
if (opticalDuplicateFlags[i]) {
|
||||
++opticalDuplicates;
|
||||
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
||||
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
|
||||
// cout << "val: " << pRe->isOpticalDuplicate << endl;
|
||||
// }
|
||||
pRe->isOpticalDuplicate = true;
|
||||
zzhopticalSet.insert(pRe->read1IndexInFile);
|
||||
zzhopticalSet.insert(pRe->read2IndexInFile);
|
||||
zzhopticalArr.push_back(pRe->read1IndexInFile);
|
||||
zzhopticalArr.push_back(pRe->read2IndexInFile);
|
||||
} else {
|
||||
pRe->isOpticalDuplicate = false;
|
||||
zzhopticalSet.erase(pRe->read1IndexInFile);
|
||||
zzhopticalSet.erase(pRe->read2IndexInFile);
|
||||
}
|
||||
}
|
||||
if (opticalDuplicates > 0)
|
||||
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
|
||||
return opticalDuplicates;
|
||||
}
|
||||
|
||||
|
|
@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
|
|||
*/
|
||||
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||
bool hasFR = false, hasRF = false;
|
||||
int prevOpticalDupNum = 0;
|
||||
// Check to see if we have a mixture of FR/RF
|
||||
for (auto pRe : readEndsArr) {
|
||||
if (pRe->isOpticalDuplicate)
|
||||
++prevOpticalDupNum;
|
||||
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||
hasFR = true;
|
||||
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||
|
|
@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
|
|||
if (nOpticalDup)
|
||||
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
|
||||
|
||||
|
||||
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
|
||||
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
|
||||
|
||||
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
|
||||
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
|
||||
}
|
||||
|
|
@ -13,10 +13,10 @@
|
|||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "dup_metrics.h"
|
||||
#include "markdups_arg.h"
|
||||
#include "md_funcs.h"
|
||||
#include "shared_args.h"
|
||||
#include "dup_metrics.h"
|
||||
|
||||
using std::cout;
|
||||
using std::set;
|
||||
|
|
@ -78,13 +78,18 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
|
|||
}
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
int maxOperateTime = 0;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) { // 找分数最高的readend
|
||||
maxOperateTime = max(maxOperateTime, pe->oprateTime);
|
||||
(const_cast<ReadEnds *>(pe))->oprateTime ++;
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
|
||||
// if (maxOperateTime == 0) ++zzhtestnum;
|
||||
if (notDupIdx != nullptr) {
|
||||
notDupIdx->insert(pBest->read1IndexInFile);
|
||||
notDupIdx->insert(pBest->read2IndexInFile);
|
||||
|
|
@ -93,10 +98,8 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
|
|||
// trackOpticalDuplicates
|
||||
trackOpticalDuplicates(vpRe, pBest);
|
||||
}
|
||||
for (auto pe : vpRe) // 对非best read标记冗余
|
||||
{
|
||||
if (pe != pBest) // 非best
|
||||
{
|
||||
for (auto pe : vpRe) { // 对非best read标记冗余
|
||||
if (pe != pBest) { // 非best
|
||||
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
||||
|
|
@ -139,8 +142,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
|
|||
|
||||
/* 找到与readend pos相等的所有readend */
|
||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
||||
auto range = std::equal_range(src.begin(), src.end(), re,
|
||||
ReadEnds::PairsLittleThan); // 只比对位点
|
||||
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点
|
||||
dst->insert(dst->end(), range.first, range.second);
|
||||
}
|
||||
|
||||
|
|
@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
|
|||
}
|
||||
}
|
||||
tm_arr[9].acc_start();
|
||||
//sortReadEndsArr(p.frags);
|
||||
sort(p.frags.begin(), p.frags.end());
|
||||
sortReadEndsArr(p.frags);
|
||||
// sort(p.frags.begin(), p.frags.end());
|
||||
tm_arr[9].acc_end();
|
||||
// cout << "sort pairs" << endl;
|
||||
tm_arr[10].acc_start();
|
||||
|
|
@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
|||
// 1.
|
||||
// prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
||||
// 2.
|
||||
// prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read
|
||||
// pair进行重新计算
|
||||
// 复杂情况1.
|
||||
// g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||
// prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算
|
||||
// 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||
// 3.
|
||||
// prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read
|
||||
// pair进行重新计算
|
||||
// prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算
|
||||
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
||||
// 4.
|
||||
// prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read
|
||||
// pair进行重新计算
|
||||
// prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算
|
||||
|
||||
bool addDataToPos = true;
|
||||
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
|
||||
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
||||
// 之前已经添加过了,后面就不用再添加数据了,因为同一个位置可能找到两个及以上的unpair数据,处理之前的数据时候可能已经添加了这些数据
|
||||
addDataToPos = false;
|
||||
} else
|
||||
alreadyAdd.insert(ck);
|
||||
|
||||
|
|
@ -535,8 +534,6 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
|||
addToGlobal.insert(prevPosKey);
|
||||
}
|
||||
}
|
||||
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
set<int64_t> posProcessed;
|
||||
|
|
@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
|||
if (taskSeq < lp.taskSeq)
|
||||
g.unpairedPosArr.erase(posKey);
|
||||
}
|
||||
// 更新结果
|
||||
|
||||
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||
// 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改(比如optical duplicate)
|
||||
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||
|
||||
// 更新结果
|
||||
for (auto &e : taskChanged) {
|
||||
auto taskSeq = e.first;
|
||||
auto &t = e.second;
|
||||
|
|
@ -719,8 +720,8 @@ void serialMarkDups() {
|
|||
// cout << "round time: " << t_round.seconds_elapsed() << endl;
|
||||
roundNum++;
|
||||
if (roundNum % 100 == 0) {
|
||||
cout << "read sum: " << readNumSum << endl;
|
||||
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
|
||||
//cout << "read sum: " << readNumSum << endl;
|
||||
//cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
|
||||
}
|
||||
}
|
||||
// cout << "here" << endl;
|
||||
|
|
@ -769,10 +770,10 @@ void serialMarkDups() {
|
|||
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t"
|
||||
<< gMetrics.NonOpticalDuplicateCountHist << "\t"
|
||||
<< gMetrics.OpticalDuplicatesCountHist << "\t"
|
||||
<< gMetrics.OpticalDuplicatesByLibraryId << endl;
|
||||
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
|
||||
<< gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
|
||||
cout << "optical dup: " << zzhopticalSet.size() << endl;
|
||||
cout << "optical arr dup: " << zzhopticalArr.size() << endl;
|
||||
|
||||
Timer::log_time("serial end ");
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@
|
|||
#include <htslib/thread_pool.h>
|
||||
#include <sam/utils/read_ends.h>
|
||||
#include <sam/utils/read_name_parser.h>
|
||||
#include <set>
|
||||
using std::set;
|
||||
|
||||
extern Timer tm_arr[20]; // 用来测试性能
|
||||
/* 全局本地变量 */
|
||||
|
|
@ -24,3 +26,7 @@ class GlobalDataArg;
|
|||
extern GlobalDataArg &gData;
|
||||
class DuplicationMetrics;
|
||||
extern DuplicationMetrics &gMetrics;
|
||||
|
||||
extern int zzhtestnum;
|
||||
extern set<int64_t> zzhopticalSet;
|
||||
extern vector<int64_t> zzhopticalArr;
|
||||
|
|
@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation {
|
|||
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
|
||||
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
||||
|
||||
/* 用来做一些判断,因为一些readends会做多次操作,比如task之间有重叠等等 */
|
||||
int oprateTime = 0;
|
||||
|
||||
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
||||
static int8_t GetOrientationByte(bool read1NegativeStrand,
|
||||
bool read2NegativeStrand) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue