删除了串行相关的代码,修复了read名字解析坐标时候出异常的catch问题
This commit is contained in:
parent
6de538670f
commit
7352eb9070
10
run.sh
10
run.sh
|
|
@ -1,14 +1,14 @@
|
|||
#nthread=1
|
||||
nthread=1
|
||||
#nthread=2
|
||||
#nthread=4
|
||||
#nthread=8
|
||||
#nthread=16
|
||||
nthread=32
|
||||
#nthread=64
|
||||
#nthread=32
|
||||
#nthread=32
|
||||
#nthread=128
|
||||
#input=/home/zzh/data/bam/zy_normal.bam
|
||||
input=/home/zzh/data/bam/zy_tumor.bam
|
||||
#input=/home/zzh/data/wgs_na12878.bam
|
||||
#input=/home/zzh/data/bam/zy_tumor.bam
|
||||
input=/home/zzh/data/wgs_na12878.bam
|
||||
#input=~/data/bam/100w.bam
|
||||
#input=~/data/bam/t100w.sam
|
||||
#input=~/data/bam/1k.sam
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "serial_md.h"
|
||||
|
||||
#include "md_types.h"
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
|
|
|||
|
|
@ -30,9 +30,7 @@ Date : 2023/10/23
|
|||
#include "dup_metrics.h"
|
||||
#include "markdups_arg.h"
|
||||
#include "md_funcs.h"
|
||||
#include "parallel_md.h"
|
||||
#include "pipeline_md.h"
|
||||
#include "serial_md.h"
|
||||
#include "shared_args.h"
|
||||
|
||||
using namespace std;
|
||||
|
|
@ -177,14 +175,7 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||
|
||||
/* 冗余检查和标记 */
|
||||
// if (g_gArg.num_threads == 1) {
|
||||
// // serialMarkDups(); // 串行运行
|
||||
// parallelMarkDups(); // 并行运行
|
||||
// } else {
|
||||
// parallelMarkDups(); // 并行运行
|
||||
// }
|
||||
pipelineMarkDups();
|
||||
//parallelMarkDups();
|
||||
|
||||
/* 标记冗余, 将处理后的结果写入文件 */
|
||||
sam_close(g_inBamFp); // 重新打开bam文件
|
||||
|
|
|
|||
|
|
@ -28,20 +28,6 @@ using std::set;
|
|||
using std::unordered_map;
|
||||
using std::vector;
|
||||
|
||||
/* 清除key位置的数据 */
|
||||
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
||||
auto &msIdx = *pmsIdx;
|
||||
if (msIdx.find(key) != msIdx.end())
|
||||
msIdx[key].clear(); // 清除该位点的冗余结果
|
||||
}
|
||||
|
||||
/* 删除key位置的数据 */
|
||||
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
|
||||
auto &msIdx = *pmsIdx;
|
||||
if (msIdx.find(key) != msIdx.end())
|
||||
msIdx.erase(key);
|
||||
}
|
||||
|
||||
/*
|
||||
* 计算read的分数
|
||||
*/
|
||||
|
|
@ -108,140 +94,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd
|
|||
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
|
||||
* representative read based on quality score. For all members of the duplicate
|
||||
* set, add the read1 index-in-file of the representative read to the records of
|
||||
* the first and second in a pair. This value becomes is used for the 'DI' tag.
|
||||
*/
|
||||
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
|
||||
|
||||
/* 处理一组pairend的readends,标记冗余 */
|
||||
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
|
||||
if (vpRe.size() < 2) {
|
||||
if (vpRe.size() == 1) {
|
||||
// addSingletonToCount(libraryIdGenerator);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
|
||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
|
||||
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
/** All read ends should have orientation FF, FR, RF, or RR **/
|
||||
for (auto pe : vpRe) // 找分数最高的readend
|
||||
{
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
|
||||
{
|
||||
// trackOpticalDuplicates
|
||||
}
|
||||
for (auto pe : vpRe) // 对非best read标记冗余
|
||||
{
|
||||
if (pe != pBest) // 非best
|
||||
{
|
||||
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
|
||||
if (pe->read2IndexInFile != pe->read1IndexInFile)
|
||||
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
|
||||
}
|
||||
}
|
||||
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
|
||||
addRepresentativeReadIndex(vpRe);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理一组非paired的readends,标记冗余 */
|
||||
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||
DupContainer<int64_t> *dupIdx) {
|
||||
auto &vDupIdx = dupIdx->AtPos(posKey);
|
||||
|
||||
if (containsPairs) {
|
||||
for (auto pe : vpRe) {
|
||||
if (!pe->IsPaired()) {
|
||||
vDupIdx.push_back(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int maxScore = 0;
|
||||
const ReadEnds *pBest = nullptr;
|
||||
for (auto pe : vpRe) {
|
||||
if (pe->score > maxScore || pBest == nullptr) {
|
||||
maxScore = pe->score;
|
||||
pBest = pe;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto pe : vpRe) {
|
||||
if (pe != pBest) {
|
||||
vDupIdx.push_back(pe->read1IndexInFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理位于某个坐标的pairend reads */
|
||||
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
|
||||
DupContainer<int64_t> *opticalDupIdx) {
|
||||
if (readEnds.size() > 1) { // 有潜在的冗余
|
||||
vpCache.clear();
|
||||
// std::sort(readEnds.begin(), readEnds.end());
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
for (auto &re : readEnds) {
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
|
||||
vpCache.push_back(&re); // 处理一个潜在的冗余组
|
||||
else {
|
||||
markDuplicatePairs(posKey, vpCache, dupIdx,
|
||||
opticalDupIdx); // 不一样
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
}
|
||||
}
|
||||
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
|
||||
}
|
||||
}
|
||||
|
||||
/* 处理位于某个坐标的 reads */
|
||||
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
|
||||
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
|
||||
if (readEnds.size() > 1) // 有潜在的冗余
|
||||
{
|
||||
vpCache.clear();
|
||||
// std::sort(readEnds.begin(), readEnds.end());
|
||||
const ReadEnds *pReadEnd = nullptr;
|
||||
bool containsPairs = false;
|
||||
bool containsFrags = false;
|
||||
for (auto &re : readEnds) {
|
||||
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
|
||||
vpCache.push_back(&re);
|
||||
containsPairs = containsPairs || re.IsPaired();
|
||||
containsFrags = containsFrags || !re.IsPaired();
|
||||
} else {
|
||||
if (vpCache.size() > 1 && containsFrags) {
|
||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||
}
|
||||
vpCache.clear();
|
||||
vpCache.push_back(&re);
|
||||
pReadEnd = &re;
|
||||
containsPairs = re.IsPaired();
|
||||
containsFrags = !re.IsPaired();
|
||||
}
|
||||
}
|
||||
if (vpCache.size() > 1 && containsFrags) {
|
||||
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* 对找到的pairend read end添加一些信息 */
|
||||
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
|
||||
auto &pairedEnds = *pPairedEnds;
|
||||
|
|
|
|||
|
|
@ -224,30 +224,6 @@ int16_t computeDuplicateScore(BamWrap &bw);
|
|||
*/
|
||||
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
|
||||
|
||||
/*
|
||||
* 处理一组pairend的readends,标记冗余
|
||||
*/
|
||||
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
|
||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
||||
|
||||
/*
|
||||
* 处理一组非paired的readends,标记冗余
|
||||
*/
|
||||
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
|
||||
DupContainer<int64_t> *dupIdx);
|
||||
|
||||
/*
|
||||
* 处理位于某个坐标的pairend reads
|
||||
*/
|
||||
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
||||
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
|
||||
|
||||
/*
|
||||
* 处理位于某个坐标的非配对的frag reads
|
||||
*/
|
||||
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
|
||||
DupContainer<int64_t> *dupIdx);
|
||||
|
||||
/*
|
||||
* 对找到的pairend read end添加一些信息
|
||||
*/
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "md_types.h"
|
||||
|
||||
// 并行运行mark duplicate
|
||||
void parallelMarkDups();
|
||||
|
|
@ -916,9 +916,7 @@ static void *pipeIntersect(void *data) {
|
|||
break;
|
||||
}
|
||||
/* 交叉数据处理 readends */
|
||||
// cout << "intersect order: " << pipeArg.intersectOrder << endl;
|
||||
tm_arr[4].acc_start();
|
||||
// cout << "intersect markdup size: " << PEEK_LOCK(pipeArg.markDupSig) << endl;
|
||||
doIntersect(pipeArg);
|
||||
tm_arr[4].acc_end();
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "md_types.h"
|
||||
|
||||
// 串行运行mark duplicate
|
||||
void serialMarkDups();
|
||||
|
|
@ -139,6 +139,7 @@ struct ReadNameParser {
|
|||
} else {
|
||||
// Standard version that will use the regex
|
||||
cmatch m;
|
||||
// cout << "here1" << endl;
|
||||
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
||||
loc->tile = std::stoi(m[1].str());
|
||||
loc->x = std::stoi(m[2].str());
|
||||
|
|
@ -166,6 +167,15 @@ struct ReadNameParser {
|
|||
readNameRegex.c_str(), readName.c_str(), e.what());
|
||||
warnedAboutRegexNotMatching = false;
|
||||
}
|
||||
} catch (...) {
|
||||
if (warnedAboutRegexNotMatching) {
|
||||
Warn(
|
||||
"A field parsed out of a read name was expected to contain "
|
||||
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
||||
"%s",
|
||||
readNameRegex.c_str(), readName.c_str());
|
||||
warnedAboutRegexNotMatching = false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
@ -190,6 +200,7 @@ struct ReadNameParser {
|
|||
if (readName.at(i) == delim || 0 == i) {
|
||||
numFields++;
|
||||
const int startIdx = (0 == i) ? 0 : (i + 1);
|
||||
// cout << readName << endl;
|
||||
tmpLocationFields[tokensIdx] =
|
||||
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||||
tokensIdx--;
|
||||
|
|
|
|||
Loading…
Reference in New Issue