删除了串行相关的代码,修复了read名字解析坐标时候出异常的catch问题
This commit is contained in:
parent
6de538670f
commit
7352eb9070
10
run.sh
10
run.sh
|
|
@ -1,14 +1,14 @@
|
||||||
#nthread=1
|
nthread=1
|
||||||
#nthread=2
|
#nthread=2
|
||||||
#nthread=4
|
#nthread=4
|
||||||
#nthread=8
|
#nthread=8
|
||||||
#nthread=16
|
#nthread=16
|
||||||
nthread=32
|
#nthread=32
|
||||||
#nthread=64
|
#nthread=32
|
||||||
#nthread=128
|
#nthread=128
|
||||||
#input=/home/zzh/data/bam/zy_normal.bam
|
#input=/home/zzh/data/bam/zy_normal.bam
|
||||||
input=/home/zzh/data/bam/zy_tumor.bam
|
#input=/home/zzh/data/bam/zy_tumor.bam
|
||||||
#input=/home/zzh/data/wgs_na12878.bam
|
input=/home/zzh/data/wgs_na12878.bam
|
||||||
#input=~/data/bam/100w.bam
|
#input=~/data/bam/100w.bam
|
||||||
#input=~/data/bam/t100w.sam
|
#input=~/data/bam/t100w.sam
|
||||||
#input=~/data/bam/1k.sam
|
#input=~/data/bam/1k.sam
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include "serial_md.h"
|
|
||||||
|
#include "md_types.h"
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
|
||||||
|
|
@ -30,9 +30,7 @@ Date : 2023/10/23
|
||||||
#include "dup_metrics.h"
|
#include "dup_metrics.h"
|
||||||
#include "markdups_arg.h"
|
#include "markdups_arg.h"
|
||||||
#include "md_funcs.h"
|
#include "md_funcs.h"
|
||||||
#include "parallel_md.h"
|
|
||||||
#include "pipeline_md.h"
|
#include "pipeline_md.h"
|
||||||
#include "serial_md.h"
|
|
||||||
#include "shared_args.h"
|
#include "shared_args.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
@ -177,14 +175,7 @@ int MarkDuplicates(int argc, char *argv[]) {
|
||||||
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||||
|
|
||||||
/* 冗余检查和标记 */
|
/* 冗余检查和标记 */
|
||||||
// if (g_gArg.num_threads == 1) {
|
|
||||||
// // serialMarkDups(); // 串行运行
|
|
||||||
// parallelMarkDups(); // 并行运行
|
|
||||||
// } else {
|
|
||||||
// parallelMarkDups(); // 并行运行
|
|
||||||
// }
|
|
||||||
pipelineMarkDups();
|
pipelineMarkDups();
|
||||||
//parallelMarkDups();
|
|
||||||
|
|
||||||
/* 标记冗余, 将处理后的结果写入文件 */
|
/* 标记冗余, 将处理后的结果写入文件 */
|
||||||
sam_close(g_inBamFp); // 重新打开bam文件
|
sam_close(g_inBamFp); // 重新打开bam文件
|
||||||
|
|
|
||||||
|
|
@ -28,20 +28,6 @@ using std::set;
|
||||||
using std::unordered_map;
|
using std::unordered_map;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
/* 清除key位置的数据 */
|
|
||||||
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
|
||||||
auto &msIdx = *pmsIdx;
|
|
||||||
if (msIdx.find(key) != msIdx.end())
|
|
||||||
msIdx[key].clear(); // 清除该位点的冗余结果
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 删除key位置的数据 */
|
|
||||||
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
|
||||||
auto &msIdx = *pmsIdx;
|
|
||||||
if (msIdx.find(key) != msIdx.end())
|
|
||||||
msIdx.erase(key);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 计算read的分数
|
* 计算read的分数
|
||||||
*/
|
*/
|
||||||
|
|
@ -108,140 +94,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd
|
||||||
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
|
|
||||||
* representative read based on quality score. For all members of the duplicate
|
|
||||||
* set, add the read1 index-in-file of the representative read to the records of
|
|
||||||
* the first and second in a pair. This value becomes is used for the 'DI' tag.
|
|
||||||
*/
|
|
||||||
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
|
|
||||||
|
|
||||||
/* 处理一组pairend的readends,标记冗余 */
|
|
||||||
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
|
||||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
|
|
||||||
if (vpRe.size() < 2) {
|
|
||||||
if (vpRe.size() == 1) {
|
|
||||||
// addSingletonToCount(libraryIdGenerator);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
|
|
||||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
|
||||||
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
|
|
||||||
|
|
||||||
int maxScore = 0;
|
|
||||||
const ReadEnds *pBest = nullptr;
|
|
||||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
|
||||||
for (auto pe : vpRe) // 找分数最高的readend
|
|
||||||
{
|
|
||||||
if (pe->score > maxScore || pBest == nullptr) {
|
|
||||||
maxScore = pe->score;
|
|
||||||
pBest = pe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
|
||||||
{
|
|
||||||
// trackOpticalDuplicates
|
|
||||||
}
|
|
||||||
for (auto pe : vpRe) // 对非best read标记冗余
|
|
||||||
{
|
|
||||||
if (pe != pBest) // 非best
|
|
||||||
{
|
|
||||||
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
|
|
||||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
|
||||||
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
|
|
||||||
addRepresentativeReadIndex(vpRe);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理一组非paired的readends,标记冗余 */
|
|
||||||
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
|
||||||
DupContainer<int64_t> *dupIdx) {
|
|
||||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
|
||||||
|
|
||||||
if (containsPairs) {
|
|
||||||
for (auto pe : vpRe) {
|
|
||||||
if (!pe->IsPaired()) {
|
|
||||||
vDupIdx.push_back(pe->read1IndexInFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
int maxScore = 0;
|
|
||||||
const ReadEnds *pBest = nullptr;
|
|
||||||
for (auto pe : vpRe) {
|
|
||||||
if (pe->score > maxScore || pBest == nullptr) {
|
|
||||||
maxScore = pe->score;
|
|
||||||
pBest = pe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto pe : vpRe) {
|
|
||||||
if (pe != pBest) {
|
|
||||||
vDupIdx.push_back(pe->read1IndexInFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理位于某个坐标的pairend reads */
|
|
||||||
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
|
|
||||||
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
|
|
||||||
DupContainer<int64_t> *opticalDupIdx) {
|
|
||||||
if (readEnds.size() > 1) { // 有潜在的冗余
|
|
||||||
vpCache.clear();
|
|
||||||
// std::sort(readEnds.begin(), readEnds.end());
|
|
||||||
const ReadEnds *pReadEnd = nullptr;
|
|
||||||
for (auto &re : readEnds) {
|
|
||||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
|
||||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
|
||||||
else {
|
|
||||||
markDuplicatePairs(posKey, vpCache, dupIdx,
|
|
||||||
opticalDupIdx); // 不一样
|
|
||||||
vpCache.clear();
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
pReadEnd = &re;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 处理位于某个坐标的 reads */
|
|
||||||
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
|
|
||||||
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
|
|
||||||
if (readEnds.size() > 1) // 有潜在的冗余
|
|
||||||
{
|
|
||||||
vpCache.clear();
|
|
||||||
// std::sort(readEnds.begin(), readEnds.end());
|
|
||||||
const ReadEnds *pReadEnd = nullptr;
|
|
||||||
bool containsPairs = false;
|
|
||||||
bool containsFrags = false;
|
|
||||||
for (auto &re : readEnds) {
|
|
||||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
containsPairs = containsPairs || re.IsPaired();
|
|
||||||
containsFrags = containsFrags || !re.IsPaired();
|
|
||||||
} else {
|
|
||||||
if (vpCache.size() > 1 && containsFrags) {
|
|
||||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
|
||||||
}
|
|
||||||
vpCache.clear();
|
|
||||||
vpCache.push_back(&re);
|
|
||||||
pReadEnd = &re;
|
|
||||||
containsPairs = re.IsPaired();
|
|
||||||
containsFrags = !re.IsPaired();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (vpCache.size() > 1 && containsFrags) {
|
|
||||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 对找到的pairend read end添加一些信息 */
|
/* 对找到的pairend read end添加一些信息 */
|
||||||
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
||||||
auto &pairedEnds = *pPairedEnds;
|
auto &pairedEnds = *pPairedEnds;
|
||||||
|
|
|
||||||
|
|
@ -224,30 +224,6 @@ int16_t computeDuplicateScore(BamWrap &bw);
|
||||||
*/
|
*/
|
||||||
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
|
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
|
||||||
|
|
||||||
/*
|
|
||||||
* 处理一组pairend的readends,标记冗余
|
|
||||||
*/
|
|
||||||
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
|
||||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 处理一组非paired的readends,标记冗余
|
|
||||||
*/
|
|
||||||
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
|
||||||
DupContainer<int64_t> *dupIdx);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 处理位于某个坐标的pairend reads
|
|
||||||
*/
|
|
||||||
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
|
||||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 处理位于某个坐标的非配对的frag reads
|
|
||||||
*/
|
|
||||||
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
|
||||||
DupContainer<int64_t> *dupIdx);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 对找到的pairend read end添加一些信息
|
* 对找到的pairend read end添加一些信息
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "md_types.h"
|
|
||||||
|
|
||||||
// 并行运行mark duplicate
|
|
||||||
void parallelMarkDups();
|
|
||||||
|
|
@ -916,9 +916,7 @@ static void *pipeIntersect(void *data) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* 交叉数据处理 readends */
|
/* 交叉数据处理 readends */
|
||||||
// cout << "intersect order: " << pipeArg.intersectOrder << endl;
|
|
||||||
tm_arr[4].acc_start();
|
tm_arr[4].acc_start();
|
||||||
// cout << "intersect markdup size: " << PEEK_LOCK(pipeArg.markDupSig) << endl;
|
|
||||||
doIntersect(pipeArg);
|
doIntersect(pipeArg);
|
||||||
tm_arr[4].acc_end();
|
tm_arr[4].acc_end();
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "md_types.h"
|
|
||||||
|
|
||||||
// 串行运行mark duplicate
|
|
||||||
void serialMarkDups();
|
|
||||||
|
|
@ -139,6 +139,7 @@ struct ReadNameParser {
|
||||||
} else {
|
} else {
|
||||||
// Standard version that will use the regex
|
// Standard version that will use the regex
|
||||||
cmatch m;
|
cmatch m;
|
||||||
|
// cout << "here1" << endl;
|
||||||
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
||||||
loc->tile = std::stoi(m[1].str());
|
loc->tile = std::stoi(m[1].str());
|
||||||
loc->x = std::stoi(m[2].str());
|
loc->x = std::stoi(m[2].str());
|
||||||
|
|
@ -166,6 +167,15 @@ struct ReadNameParser {
|
||||||
readNameRegex.c_str(), readName.c_str(), e.what());
|
readNameRegex.c_str(), readName.c_str(), e.what());
|
||||||
warnedAboutRegexNotMatching = false;
|
warnedAboutRegexNotMatching = false;
|
||||||
}
|
}
|
||||||
|
} catch (...) {
|
||||||
|
if (warnedAboutRegexNotMatching) {
|
||||||
|
Warn(
|
||||||
|
"A field parsed out of a read name was expected to contain "
|
||||||
|
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
||||||
|
"%s",
|
||||||
|
readNameRegex.c_str(), readName.c_str());
|
||||||
|
warnedAboutRegexNotMatching = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -190,6 +200,7 @@ struct ReadNameParser {
|
||||||
if (readName.at(i) == delim || 0 == i) {
|
if (readName.at(i) == delim || 0 == i) {
|
||||||
numFields++;
|
numFields++;
|
||||||
const int startIdx = (0 == i) ? 0 : (i + 1);
|
const int startIdx = (0 == i) ? 0 : (i + 1);
|
||||||
|
// cout << readName << endl;
|
||||||
tmpLocationFields[tokensIdx] =
|
tmpLocationFields[tokensIdx] =
|
||||||
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||||||
tokensIdx--;
|
tokensIdx--;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue