需要完善optical dup idx计算
This commit is contained in:
parent
86954ffa85
commit
7352a00f2c
|
|
@ -1,5 +1,6 @@
|
||||||
# for fast-markdup
|
# for fast-markdup
|
||||||
*.sam
|
*.sam
|
||||||
|
*.bam
|
||||||
*.log
|
*.log
|
||||||
# ---> C++
|
# ---> C++
|
||||||
# Prerequisites
|
# Prerequisites
|
||||||
|
|
@ -36,4 +37,4 @@
|
||||||
*.app
|
*.app
|
||||||
|
|
||||||
lib/
|
lib/
|
||||||
build/
|
build/
|
||||||
|
|
|
||||||
|
|
@ -13,11 +13,11 @@
|
||||||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||||
"args": [
|
"args": [
|
||||||
"MarkDuplicates",
|
"MarkDuplicates",
|
||||||
"--INPUT", "~/data/bam/100w.bam",
|
"--INPUT", "~/data/bam/zy_normal.bam",
|
||||||
"--OUTPUT", "out.bam",
|
"--OUTPUT", "out.bam",
|
||||||
"--METRICS_FILE", "metrics.txt",
|
"--METRICS_FILE", "metrics.txt",
|
||||||
"--num_threads", "1",
|
"--num_threads", "1",
|
||||||
"--max_mem", "4G",
|
"--max_mem", "100G",
|
||||||
"--verbosity", "DEBUG",
|
"--verbosity", "DEBUG",
|
||||||
"--asyncio", "true",
|
"--asyncio", "true",
|
||||||
],
|
],
|
||||||
|
|
|
||||||
8
run.sh
8
run.sh
|
|
@ -1,6 +1,8 @@
|
||||||
#input=~/data/bam/zy_normal.bam
|
input=~/data/bam/zy_normal.bam
|
||||||
input=~/data/bam/zy_tumor.bam
|
#input=~/data/bam/zy_tumor.bam
|
||||||
#input=~/data/bam/100w.bam
|
#input=~/data/bam/100w.bam
|
||||||
|
#input=~/data/bam/1kw.sam
|
||||||
|
#input=~/data/bam/n1kw.sam
|
||||||
|
|
||||||
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
||||||
MarkDuplicates \
|
MarkDuplicates \
|
||||||
|
|
@ -8,7 +10,7 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
||||||
--OUTPUT ~/data/bam/out.bam \
|
--OUTPUT ~/data/bam/out.bam \
|
||||||
--INDEX_FORMAT BAI \
|
--INDEX_FORMAT BAI \
|
||||||
--num_threads 1 \
|
--num_threads 1 \
|
||||||
--max_mem 2G \
|
--max_mem 1G \
|
||||||
--verbosity DEBUG \
|
--verbosity DEBUG \
|
||||||
--asyncio true #\
|
--asyncio true #\
|
||||||
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,8 @@ void GlobalArg::parseArgument(int argNum) {
|
||||||
mem_arg <<= 20;
|
mem_arg <<= 20;
|
||||||
else if (*q == 'g' || *q == 'G')
|
else if (*q == 'g' || *q == 'G')
|
||||||
mem_arg <<= 30;
|
mem_arg <<= 30;
|
||||||
if (mem_arg >= max_mem)
|
//if (mem_arg >= max_mem)
|
||||||
|
if (true)
|
||||||
max_mem = mem_arg;
|
max_mem = mem_arg;
|
||||||
else {
|
else {
|
||||||
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,10 @@ static GlobalDataArg gData_;
|
||||||
GlobalDataArg &gData = gData_;
|
GlobalDataArg &gData = gData_;
|
||||||
DuplicationMetrics gMetrics_;
|
DuplicationMetrics gMetrics_;
|
||||||
DuplicationMetrics &gMetrics = gMetrics_;
|
DuplicationMetrics &gMetrics = gMetrics_;
|
||||||
|
|
||||||
|
int zzhtestnum = 0;
|
||||||
|
set<int64_t> zzhopticalSet;
|
||||||
|
vector<int64_t> zzhopticalArr;
|
||||||
/*
|
/*
|
||||||
* mark duplicate
|
* mark duplicate
|
||||||
* 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
* 入口,假定bam是按照比对后的坐标排序的,同一个样本的话不需要考虑barcode的问题
|
||||||
|
|
@ -161,11 +165,11 @@ int MarkDuplicates(int argc, char *argv[]) {
|
||||||
while (inBuf.ReadStat() >= 0) {
|
while (inBuf.ReadStat() >= 0) {
|
||||||
Timer tw1;
|
Timer tw1;
|
||||||
size_t readNum = inBuf.ReadBam();
|
size_t readNum = inBuf.ReadBam();
|
||||||
cout << "read: " << readNum << endl;
|
// cout << "read: " << readNum << endl;
|
||||||
for (size_t i = 0; i < inBuf.Size(); ++i) {
|
for (size_t i = 0; i < inBuf.Size(); ++i) {
|
||||||
/* 判断是否冗余 */
|
/* 判断是否冗余 */
|
||||||
if (bamIdx == dupIdx) {
|
if (bamIdx == dupIdx) {
|
||||||
// cout << "冗余" << bamIdx << endl;
|
// cerr << bamIdx << endl;
|
||||||
dupIdx = idxQue.Pop();
|
dupIdx = idxQue.Pop();
|
||||||
}
|
}
|
||||||
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
|
if (sam_write1(g_outBamFp, g_outBamHeader, inBuf[i]->b) < 0) {
|
||||||
|
|
|
||||||
|
|
@ -213,7 +213,7 @@ struct MarkDupsArg
|
||||||
ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
|
ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
|
||||||
|
|
||||||
/* "The scoring strategy for choosing the non-duplicate among candidates." */
|
/* "The scoring strategy for choosing the non-duplicate among candidates." */
|
||||||
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
|
ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
|
||||||
|
|
||||||
/* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
|
/* "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
|
||||||
"PG record creation. This string may have a suffix appended to avoid collision with other " +
|
"PG record creation. This string may have a suffix appended to avoid collision with other " +
|
||||||
|
|
|
||||||
|
|
@ -28,8 +28,6 @@ using std::set;
|
||||||
using std::unordered_map;
|
using std::unordered_map;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
static int zzhtestnum = 0;
|
|
||||||
|
|
||||||
/* 清除key位置的数据 */
|
/* 清除key位置的数据 */
|
||||||
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
||||||
auto &msIdx = *pmsIdx;
|
auto &msIdx = *pmsIdx;
|
||||||
|
|
@ -247,6 +245,7 @@ void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||||
/* 对找到的pairend read end添加一些信息 */
|
/* 对找到的pairend read end添加一些信息 */
|
||||||
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
||||||
auto &pairedEnds = *pPairedEnds;
|
auto &pairedEnds = *pPairedEnds;
|
||||||
|
|
||||||
int64_t bamIdx = fragEnd.read1IndexInFile;
|
int64_t bamIdx = fragEnd.read1IndexInFile;
|
||||||
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
const int matesRefIndex = fragEnd.read1ReferenceIndex;
|
||||||
const int matesCoordinate = fragEnd.read1Coordinate;
|
const int matesCoordinate = fragEnd.read1Coordinate;
|
||||||
|
|
@ -459,14 +458,23 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
|
||||||
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
|
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
|
||||||
int opticalDuplicates = 0;
|
int opticalDuplicates = 0;
|
||||||
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
|
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
|
||||||
|
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
||||||
if (opticalDuplicateFlags[i]) {
|
if (opticalDuplicateFlags[i]) {
|
||||||
++opticalDuplicates;
|
++opticalDuplicates;
|
||||||
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
|
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
|
||||||
|
// cout << "val: " << pRe->isOpticalDuplicate << endl;
|
||||||
|
// }
|
||||||
pRe->isOpticalDuplicate = true;
|
pRe->isOpticalDuplicate = true;
|
||||||
|
zzhopticalSet.insert(pRe->read1IndexInFile);
|
||||||
|
zzhopticalSet.insert(pRe->read2IndexInFile);
|
||||||
|
zzhopticalArr.push_back(pRe->read1IndexInFile);
|
||||||
|
zzhopticalArr.push_back(pRe->read2IndexInFile);
|
||||||
|
} else {
|
||||||
|
pRe->isOpticalDuplicate = false;
|
||||||
|
zzhopticalSet.erase(pRe->read1IndexInFile);
|
||||||
|
zzhopticalSet.erase(pRe->read2IndexInFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (opticalDuplicates > 0)
|
|
||||||
gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates;
|
|
||||||
return opticalDuplicates;
|
return opticalDuplicates;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -475,8 +483,11 @@ static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const R
|
||||||
*/
|
*/
|
||||||
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
|
||||||
bool hasFR = false, hasRF = false;
|
bool hasFR = false, hasRF = false;
|
||||||
|
int prevOpticalDupNum = 0;
|
||||||
// Check to see if we have a mixture of FR/RF
|
// Check to see if we have a mixture of FR/RF
|
||||||
for (auto pRe : readEndsArr) {
|
for (auto pRe : readEndsArr) {
|
||||||
|
if (pRe->isOpticalDuplicate)
|
||||||
|
++prevOpticalDupNum;
|
||||||
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
|
||||||
hasFR = true;
|
hasFR = true;
|
||||||
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
|
||||||
|
|
@ -513,5 +524,10 @@ void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnd
|
||||||
if (nOpticalDup)
|
if (nOpticalDup)
|
||||||
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
|
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
|
||||||
|
|
||||||
|
|
||||||
|
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
|
||||||
|
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
|
||||||
|
|
||||||
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
|
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
|
||||||
|
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
|
||||||
}
|
}
|
||||||
|
|
@ -13,10 +13,10 @@
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "dup_metrics.h"
|
||||||
#include "markdups_arg.h"
|
#include "markdups_arg.h"
|
||||||
#include "md_funcs.h"
|
#include "md_funcs.h"
|
||||||
#include "shared_args.h"
|
#include "shared_args.h"
|
||||||
#include "dup_metrics.h"
|
|
||||||
|
|
||||||
using std::cout;
|
using std::cout;
|
||||||
using std::set;
|
using std::set;
|
||||||
|
|
@ -78,25 +78,28 @@ static void markDupsForPairs(vector<const ReadEnds *> &vpRe, set<int64_t> *dupId
|
||||||
}
|
}
|
||||||
int maxScore = 0;
|
int maxScore = 0;
|
||||||
const ReadEnds *pBest = nullptr;
|
const ReadEnds *pBest = nullptr;
|
||||||
|
int maxOperateTime = 0;
|
||||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||||
for (auto pe : vpRe) { // 找分数最高的readend
|
for (auto pe : vpRe) { // 找分数最高的readend
|
||||||
|
maxOperateTime = max(maxOperateTime, pe->oprateTime);
|
||||||
|
(const_cast<ReadEnds *>(pe))->oprateTime ++;
|
||||||
if (pe->score > maxScore || pBest == nullptr) {
|
if (pe->score > maxScore || pBest == nullptr) {
|
||||||
maxScore = pe->score;
|
maxScore = pe->score;
|
||||||
pBest = pe;
|
pBest = pe;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// cerr << zzhtestnum << " best: " << vpRe.size() << " " << pBest->read1IndexInFile << "-" << pBest->read2IndexInFile << endl;
|
||||||
|
// if (maxOperateTime == 0) ++zzhtestnum;
|
||||||
if (notDupIdx != nullptr) {
|
if (notDupIdx != nullptr) {
|
||||||
notDupIdx->insert(pBest->read1IndexInFile);
|
notDupIdx->insert(pBest->read1IndexInFile);
|
||||||
notDupIdx->insert(pBest->read2IndexInFile);
|
notDupIdx->insert(pBest->read2IndexInFile);
|
||||||
}
|
}
|
||||||
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
|
if (!g_mdArg.READ_NAME_REGEX.empty()) { // 检查光学冗余
|
||||||
// trackOpticalDuplicates
|
// trackOpticalDuplicates
|
||||||
trackOpticalDuplicates(vpRe, pBest);
|
trackOpticalDuplicates(vpRe, pBest);
|
||||||
}
|
}
|
||||||
for (auto pe : vpRe) // 对非best read标记冗余
|
for (auto pe : vpRe) { // 对非best read标记冗余
|
||||||
{
|
if (pe != pBest) { // 非best
|
||||||
if (pe != pBest) // 非best
|
|
||||||
{
|
|
||||||
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
dupIdx->insert(pe->read1IndexInFile); // 添加read1
|
||||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||||
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
dupIdx->insert(pe->read2IndexInFile); // 添加read2
|
||||||
|
|
@ -139,8 +142,7 @@ static void markDupsForFrags(vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||||
|
|
||||||
/* 找到与readend pos相等的所有readend */
|
/* 找到与readend pos相等的所有readend */
|
||||||
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
static void getEqualRE(const ReadEnds &re, vector<ReadEnds> &src, vector<ReadEnds> *dst) {
|
||||||
auto range = std::equal_range(src.begin(), src.end(), re,
|
auto range = std::equal_range(src.begin(), src.end(), re, ReadEnds::PairsLittleThan); // 只比对位点
|
||||||
ReadEnds::PairsLittleThan); // 只比对位点
|
|
||||||
dst->insert(dst->end(), range.first, range.second);
|
dst->insert(dst->end(), range.first, range.second);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -158,24 +160,24 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
|
||||||
// set<ReadEnds> reSet;
|
// set<ReadEnds> reSet;
|
||||||
// ReadEnds lastRe;
|
// ReadEnds lastRe;
|
||||||
|
|
||||||
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
|
for (int i = 0; i < p.bams.size(); ++i) { // 循环处理每个read
|
||||||
BamWrap *bw = p.bams[i];
|
BamWrap *bw = p.bams[i];
|
||||||
const int64_t bamIdx = p.bamStartIdx + i;
|
const int64_t bamIdx = p.bamStartIdx + i;
|
||||||
if (bw->GetReadUnmappedFlag()) {
|
if (bw->GetReadUnmappedFlag()) {
|
||||||
if (bw->b->core.tid == -1)
|
if (bw->b->core.tid == -1)
|
||||||
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
|
||||||
break;
|
break;
|
||||||
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
|
} else if (!bw->IsSecondaryOrSupplementary()) { // 是主要比对
|
||||||
ReadEnds fragEnd;
|
ReadEnds fragEnd;
|
||||||
tm_arr[8].acc_start();
|
tm_arr[8].acc_start();
|
||||||
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
buildReadEnds(*bw, bamIdx, rnParser, &fragEnd);
|
||||||
tm_arr[8].acc_end();
|
tm_arr[8].acc_end();
|
||||||
p.frags.push_back(fragEnd); // 添加进frag集合
|
p.frags.push_back(fragEnd); // 添加进frag集合
|
||||||
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
|
if (bw->GetReadPairedFlag() && !bw->GetMateUnmappedFlag()) { // 是pairend而且互补的read也比对上了
|
||||||
string key = bw->query_name();
|
string key = bw->query_name();
|
||||||
if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
|
if (p.unpairedDic.find(key) == p.unpairedDic.end()) {
|
||||||
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
p.unpairedDic[key] = {p.taskSeq, fragEnd};
|
||||||
} else { // 找到了pairend
|
} else { // 找到了pairend
|
||||||
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
auto &pairedEnds = p.unpairedDic.at(key).unpairedRE;
|
||||||
modifyPairedEnds(fragEnd, &pairedEnds);
|
modifyPairedEnds(fragEnd, &pairedEnds);
|
||||||
p.pairs.push_back(pairedEnds);
|
p.pairs.push_back(pairedEnds);
|
||||||
|
|
@ -185,8 +187,8 @@ static void generateReadEnds(SerailMarkDupArg *arg) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tm_arr[9].acc_start();
|
tm_arr[9].acc_start();
|
||||||
//sortReadEndsArr(p.frags);
|
sortReadEndsArr(p.frags);
|
||||||
sort(p.frags.begin(), p.frags.end());
|
// sort(p.frags.begin(), p.frags.end());
|
||||||
tm_arr[9].acc_end();
|
tm_arr[9].acc_end();
|
||||||
// cout << "sort pairs" << endl;
|
// cout << "sort pairs" << endl;
|
||||||
tm_arr[10].acc_start();
|
tm_arr[10].acc_start();
|
||||||
|
|
@ -277,7 +279,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
|
||||||
|
|
||||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
|
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx - leftSpan], rightArr[rightStartIdx], isPairCmp)) {
|
||||||
leftSpan += 1;
|
leftSpan += 1;
|
||||||
if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了(可能会有bug,上上个也与下一个有交集呢?)
|
if (leftSpan > leftEndIdx) { // 上一个的范围被下一个全部包围了(可能会有bug,上上个也与下一个有交集呢?)
|
||||||
leftSpan = leftArr.size() - 1;
|
leftSpan = leftArr.size() - 1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
@ -285,7 +287,7 @@ static inline void getIntersectData(vector<ReadEnds> &leftArr, vector<ReadEnds>
|
||||||
|
|
||||||
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
|
while (!ReadEnds::ReadLittleThan(leftArr[leftEndIdx], rightArr[rightSpan], isPairCmp)) {
|
||||||
rightSpan += 1;
|
rightSpan += 1;
|
||||||
if (rightSpan == rightArr.size() - 1) // 同上,可能会有bug
|
if (rightSpan == rightArr.size() - 1) // 同上,可能会有bug
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
dst->insert(dst->end(), leftArr.end() - leftSpan, leftArr.end());
|
||||||
|
|
@ -432,21 +434,18 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
// 1.
|
// 1.
|
||||||
// prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
// prevpos在交叉部分之前,nextpos在交叉部分之后,这种情况不需要获取pairarr中的数据;
|
||||||
// 2.
|
// 2.
|
||||||
// prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read
|
// prevpos在交叉部分之前,nextpos在交叉部分,需要获取lp中的相等read pair进行重新计算
|
||||||
// pair进行重新计算
|
// 复杂情况1. g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
||||||
// 复杂情况1.
|
|
||||||
// g中包含prevPosKey对应的unpair,p中有对应的pair,此时应该把这些pair考虑进去
|
|
||||||
// 3.
|
// 3.
|
||||||
// prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read
|
// prevpos在交叉部分,nextpos在交叉部分之后,需要获取p中的相等read pair进行重新计算
|
||||||
// pair进行重新计算
|
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
||||||
// 复杂情况2. p中是否包含prevPosKey对应的unpair
|
|
||||||
// 4.
|
// 4.
|
||||||
// prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read
|
// prevpos在交叉部分,nextpos在交叉部分,需要获取lp和p中的相等read pair进行重新计算
|
||||||
// pair进行重新计算
|
|
||||||
|
|
||||||
bool addDataToPos = true;
|
bool addDataToPos = true;
|
||||||
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
|
if (alreadyAdd.find(ck) != alreadyAdd.end()) {
|
||||||
addDataToPos = false; // 之前已经添加过了,后面就不用再添加数据了
|
// 之前已经添加过了,后面就不用再添加数据了,因为同一个位置可能找到两个及以上的unpair数据,处理之前的数据时候可能已经添加了这些数据
|
||||||
|
addDataToPos = false;
|
||||||
} else
|
} else
|
||||||
alreadyAdd.insert(ck);
|
alreadyAdd.insert(ck);
|
||||||
|
|
||||||
|
|
@ -478,9 +477,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
}
|
}
|
||||||
recalcPos[ck] = prevPosInfo.taskSeq;
|
recalcPos[ck] = prevPosInfo.taskSeq;
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
} else { // prevpos在交叉部分
|
} else { // prevpos在交叉部分
|
||||||
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
|
if (nextPosKey > prevLastPos) { // nextpos在交叉部分之后 第三种情况
|
||||||
if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
if (nextUnpairInfoP != nullptr) { // 且在pos点,next task有unpair,这样才把这些数据放到next task里
|
||||||
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
auto &nextPairArr = nextUnpairInfoP->pairArr;
|
||||||
nextPairArr.push_back(prevFragEnd);
|
nextPairArr.push_back(prevFragEnd);
|
||||||
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
auto &prevPairArr = prevUnpairInfoP->pairArr;
|
||||||
|
|
@ -490,9 +489,9 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
}
|
}
|
||||||
// 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
// 将数据放到next task里,(这个位点以后会可能还会计算到,目前方案是都计算,只是把冗余剔除)
|
||||||
recalcPos[ck] = nextPosInfo.taskSeq;
|
recalcPos[ck] = nextPosInfo.taskSeq;
|
||||||
|
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
} else { // next task在该位点没有unpair,那就把数据放到prev task里
|
} else { // next task在该位点没有unpair,那就把数据放到prev task里
|
||||||
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
auto &prevPairArr = prevUnpairInfoP->pairArr; // prevUnpairInfoP肯定不是nullptr
|
||||||
prevPairArr.push_back(prevFragEnd);
|
prevPairArr.push_back(prevFragEnd);
|
||||||
if (addDataToPos) // 第二种情况
|
if (addDataToPos) // 第二种情况
|
||||||
|
|
@ -515,8 +514,8 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
std::sort(prevPairArr.begin(), prevPairArr.end());
|
std::sort(prevPairArr.begin(), prevPairArr.end());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
p.unpairedDic.erase(readName); // 在next task里删除该read
|
p.unpairedDic.erase(readName); // 在next task里删除该read
|
||||||
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
} else if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
||||||
auto &remainPosInfo = g.unpairedDic[readName];
|
auto &remainPosInfo = g.unpairedDic[readName];
|
||||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||||
int64_t remainPosKey = remainFragEnd.posKey;
|
int64_t remainPosKey = remainFragEnd.posKey;
|
||||||
|
|
@ -529,14 +528,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
std::sort(remainPairArr.begin(), remainPairArr.end());
|
std::sort(remainPairArr.begin(), remainPairArr.end());
|
||||||
|
|
||||||
g.unpairedDic.erase(readName);
|
g.unpairedDic.erase(readName);
|
||||||
} else { // 都没找到,那就保存到遗留数据里
|
} else { // 都没找到,那就保存到遗留数据里
|
||||||
int64_t prevPosKey = prevFragEnd.posKey;
|
int64_t prevPosKey = prevFragEnd.posKey;
|
||||||
g.unpairedDic.insert(prevUnpair);
|
g.unpairedDic.insert(prevUnpair);
|
||||||
addToGlobal.insert(prevPosKey);
|
addToGlobal.insert(prevPosKey);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
|
||||||
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
|
||||||
|
|
||||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||||
set<int64_t> posProcessed;
|
set<int64_t> posProcessed;
|
||||||
|
|
@ -557,8 +554,12 @@ static void handleIntersectData(SerailMarkDupArg *lastArg, SerailMarkDupArg *cur
|
||||||
if (taskSeq < lp.taskSeq)
|
if (taskSeq < lp.taskSeq)
|
||||||
g.unpairedPosArr.erase(posKey);
|
g.unpairedPosArr.erase(posKey);
|
||||||
}
|
}
|
||||||
// 更新结果
|
|
||||||
|
|
||||||
|
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||||
|
// 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改(比如optical duplicate)
|
||||||
|
for (auto posKey : addToGlobal) g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||||
|
|
||||||
|
// 更新结果
|
||||||
for (auto &e : taskChanged) {
|
for (auto &e : taskChanged) {
|
||||||
auto taskSeq = e.first;
|
auto taskSeq = e.first;
|
||||||
auto &t = e.second;
|
auto &t = e.second;
|
||||||
|
|
@ -594,12 +595,12 @@ static void handleLastTask(SerailMarkDupArg *task, GlobalDataArg *gDataArg) {
|
||||||
auto &lp = *task;
|
auto &lp = *task;
|
||||||
auto &g = *gDataArg;
|
auto &g = *gDataArg;
|
||||||
// 遗留的未匹配的pair
|
// 遗留的未匹配的pair
|
||||||
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
|
for (auto &prevUnpair : lp.unpairedDic) { // 遍历上一个任务中的每个未匹配的read
|
||||||
auto &readName = prevUnpair.first;
|
auto &readName = prevUnpair.first;
|
||||||
auto &prevPosInfo = prevUnpair.second;
|
auto &prevPosInfo = prevUnpair.second;
|
||||||
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
auto prevFragEnd = prevPosInfo.unpairedRE; // 未匹配的read end
|
||||||
|
|
||||||
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
if (g.unpairedDic.find(readName) != g.unpairedDic.end()) { // 在遗留数据中找到了匹配的read
|
||||||
auto &remainPosInfo = g.unpairedDic[readName];
|
auto &remainPosInfo = g.unpairedDic[readName];
|
||||||
auto remainFragEnd = remainPosInfo.unpairedRE;
|
auto remainFragEnd = remainPosInfo.unpairedRE;
|
||||||
int64_t remainPosKey = remainFragEnd.posKey;
|
int64_t remainPosKey = remainFragEnd.posKey;
|
||||||
|
|
@ -719,8 +720,8 @@ void serialMarkDups() {
|
||||||
// cout << "round time: " << t_round.seconds_elapsed() << endl;
|
// cout << "round time: " << t_round.seconds_elapsed() << endl;
|
||||||
roundNum++;
|
roundNum++;
|
||||||
if (roundNum % 100 == 0) {
|
if (roundNum % 100 == 0) {
|
||||||
cout << "read sum: " << readNumSum << endl;
|
//cout << "read sum: " << readNumSum << endl;
|
||||||
cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
|
//cout << "round time: " << t_round.seconds_elapsed() * 100 << " s" << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// cout << "here" << endl;
|
// cout << "here" << endl;
|
||||||
|
|
@ -769,10 +770,10 @@ void serialMarkDups() {
|
||||||
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
cout << "sort frags : " << tm_arr[9].acc_seconds_elapsed() << endl;
|
||||||
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
cout << "sort pairs : " << tm_arr[10].acc_seconds_elapsed() << endl;
|
||||||
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
cout << "all : " << tm_arr[5].acc_seconds_elapsed() << endl;
|
||||||
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t"
|
cout << "metrics: " << gMetrics.DuplicateCountHist << "\t" << gMetrics.NonOpticalDuplicateCountHist << "\t"
|
||||||
<< gMetrics.NonOpticalDuplicateCountHist << "\t"
|
<< gMetrics.OpticalDuplicatesCountHist << "\t" << gMetrics.OpticalDuplicatesByLibraryId << endl;
|
||||||
<< gMetrics.OpticalDuplicatesCountHist << "\t"
|
cout << "optical dup: " << zzhopticalSet.size() << endl;
|
||||||
<< gMetrics.OpticalDuplicatesByLibraryId << endl;
|
cout << "optical arr dup: " << zzhopticalArr.size() << endl;
|
||||||
|
|
||||||
Timer::log_time("serial end ");
|
Timer::log_time("serial end ");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,8 @@
|
||||||
#include <htslib/thread_pool.h>
|
#include <htslib/thread_pool.h>
|
||||||
#include <sam/utils/read_ends.h>
|
#include <sam/utils/read_ends.h>
|
||||||
#include <sam/utils/read_name_parser.h>
|
#include <sam/utils/read_name_parser.h>
|
||||||
|
#include <set>
|
||||||
|
using std::set;
|
||||||
|
|
||||||
extern Timer tm_arr[20]; // 用来测试性能
|
extern Timer tm_arr[20]; // 用来测试性能
|
||||||
/* 全局本地变量 */
|
/* 全局本地变量 */
|
||||||
|
|
@ -23,4 +25,8 @@ extern MarkDupsArg &g_mdArg;
|
||||||
class GlobalDataArg;
|
class GlobalDataArg;
|
||||||
extern GlobalDataArg &gData;
|
extern GlobalDataArg &gData;
|
||||||
class DuplicationMetrics;
|
class DuplicationMetrics;
|
||||||
extern DuplicationMetrics &gMetrics;
|
extern DuplicationMetrics &gMetrics;
|
||||||
|
|
||||||
|
extern int zzhtestnum;
|
||||||
|
extern set<int64_t> zzhopticalSet;
|
||||||
|
extern vector<int64_t> zzhopticalArr;
|
||||||
|
|
@ -77,6 +77,9 @@ struct ReadEnds : PhysicalLocation {
|
||||||
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
|
int64_t posKey = -1; // 根据位置信息生成的关键字 return (int64_t)tid <<
|
||||||
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
// MAX_CONTIG_LEN_SHIFT | (int64_t)pos;
|
||||||
|
|
||||||
|
/* 用来做一些判断,因为一些readends会做多次操作,比如task之间有重叠等等 */
|
||||||
|
int oprateTime = 0;
|
||||||
|
|
||||||
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
/* 根据pairend read的比对方向,来确定整体的比对方向 */
|
||||||
static int8_t GetOrientationByte(bool read1NegativeStrand,
|
static int8_t GetOrientationByte(bool read1NegativeStrand,
|
||||||
bool read2NegativeStrand) {
|
bool read2NegativeStrand) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue