删除了串行相关的代码,修复了read名字解析坐标时候出异常的catch问题

This commit is contained in:
zzh 2024-11-22 00:46:11 +08:00
parent 6de538670f
commit 7352eb9070
11 changed files with 20 additions and 2572 deletions

10
run.sh
View File

@ -1,14 +1,14 @@
#nthread=1
nthread=1
#nthread=2
#nthread=4
#nthread=8
#nthread=16
nthread=32
#nthread=64
#nthread=32
#nthread=32
#nthread=128
#input=/home/zzh/data/bam/zy_normal.bam
input=/home/zzh/data/bam/zy_tumor.bam
#input=/home/zzh/data/wgs_na12878.bam
#input=/home/zzh/data/bam/zy_tumor.bam
input=/home/zzh/data/wgs_na12878.bam
#input=~/data/bam/100w.bam
#input=~/data/bam/t100w.sam
#input=~/data/bam/1k.sam

View File

@ -1,9 +1,11 @@
#pragma once
#include <string>
#include <stdint.h>
#include <string>
#include <vector>
#include "serial_md.h"
#include "md_types.h"
using std::string;
using std::vector;

View File

@ -30,9 +30,7 @@ Date : 2023/10/23
#include "dup_metrics.h"
#include "markdups_arg.h"
#include "md_funcs.h"
#include "parallel_md.h"
#include "pipeline_md.h"
#include "serial_md.h"
#include "shared_args.h"
using namespace std;
@ -177,14 +175,7 @@ int MarkDuplicates(int argc, char *argv[]) {
hts_set_opt(g_outBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
/* 冗余检查和标记 */
// if (g_gArg.num_threads == 1) {
// // serialMarkDups(); // 串行运行
// parallelMarkDups(); // 并行运行
// } else {
// parallelMarkDups(); // 并行运行
// }
pipelineMarkDups();
//parallelMarkDups();
/* 标记冗余, 将处理后的结果写入文件 */
sam_close(g_inBamFp); // 重新打开bam文件

View File

@ -28,20 +28,6 @@ using std::set;
using std::unordered_map;
using std::vector;
/* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx[key].clear(); // 清除该位点的冗余结果
}
/* 删除key位置的数据 */
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx.erase(key);
}
/*
* read
*/
@ -108,140 +94,6 @@ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnd
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
* representative read based on quality score. For all members of the duplicate
* set, add the read1 index-in-file of the representative read to the records of
* the first and second in a pair. This value becomes is used for the 'DI' tag.
*/
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
/* 处理一组pairend的readends标记冗余 */
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
if (vpRe.size() < 2) {
if (vpRe.size() == 1) {
// addSingletonToCount(libraryIdGenerator);
}
return;
}
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
auto &vDupIdx = dupIdx->AtPos(posKey);
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) // 找分数最高的readend
{
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
{
// trackOpticalDuplicates
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
}
}
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
addRepresentativeReadIndex(vpRe);
}
}
/* 处理一组非paired的readends标记冗余 */
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
DupContainer<int64_t> *dupIdx) {
auto &vDupIdx = dupIdx->AtPos(posKey);
if (containsPairs) {
for (auto pe : vpRe) {
if (!pe->IsPaired()) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
} else {
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe) {
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
for (auto pe : vpRe) {
if (pe != pBest) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
}
}
/* 处理位于某个坐标的pairend reads */
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
DupContainer<int64_t> *opticalDupIdx) {
if (readEnds.size() > 1) { // 有潜在的冗余
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else {
markDuplicatePairs(posKey, vpCache, dupIdx,
opticalDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
}
}
/* 处理位于某个坐标的 reads */
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
if (readEnds.size() > 1) // 有潜在的冗余
{
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
bool containsPairs = false;
bool containsFrags = false;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
} else {
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
}
}
/* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds;

View File

@ -224,30 +224,6 @@ int16_t computeDuplicateScore(BamWrap &bw);
*/
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey);
/*
* pairendreadends
*/
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
/*
* pairedreadends
*/
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
DupContainer<int64_t> *dupIdx);
/*
* pairend reads
*/
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx);
/*
* frag reads
*/
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds, vector<const ReadEnds *> &vpCache,
DupContainer<int64_t> *dupIdx);
/*
* pairend read end
*/

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
#pragma once
#include "md_types.h"
// 并行运行mark duplicate
void parallelMarkDups();

View File

@ -916,9 +916,7 @@ static void *pipeIntersect(void *data) {
break;
}
/* 交叉数据处理 readends */
// cout << "intersect order: " << pipeArg.intersectOrder << endl;
tm_arr[4].acc_start();
// cout << "intersect markdup size: " << PEEK_LOCK(pipeArg.markDupSig) << endl;
doIntersect(pipeArg);
tm_arr[4].acc_end();

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
#pragma once
#include "md_types.h"
// 串行运行mark duplicate
void serialMarkDups();

View File

@ -139,6 +139,7 @@ struct ReadNameParser {
} else {
// Standard version that will use the regex
cmatch m;
// cout << "here1" << endl;
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
loc->tile = std::stoi(m[1].str());
loc->x = std::stoi(m[2].str());
@ -166,6 +167,15 @@ struct ReadNameParser {
readNameRegex.c_str(), readName.c_str(), e.what());
warnedAboutRegexNotMatching = false;
}
} catch (...) {
if (warnedAboutRegexNotMatching) {
Warn(
"A field parsed out of a read name was expected to contain "
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
"%s",
readNameRegex.c_str(), readName.c_str());
warnedAboutRegexNotMatching = false;
}
}
return true;
@ -190,6 +200,7 @@ struct ReadNameParser {
if (readName.at(i) == delim || 0 == i) {
numFields++;
const int startIdx = (0 == i) ? 0 : (i + 1);
// cout << readName << endl;
tmpLocationFields[tokensIdx] =
std::stoi(readName.substr(startIdx, endIdx - startIdx));
tokensIdx--;