统计信息写入文件,完整版1.0
This commit is contained in:
parent
0ee22ca331
commit
223d16f2eb
|
|
@ -4,6 +4,7 @@ core.*
|
|||
*.sam
|
||||
*.bam
|
||||
*.log
|
||||
log/
|
||||
# ---> C++
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@
|
|||
"__bit_reference": "cpp",
|
||||
"__string": "cpp",
|
||||
"filesystem": "cpp",
|
||||
"ios": "cpp"
|
||||
"ios": "cpp",
|
||||
"locale": "cpp"
|
||||
}
|
||||
}
|
||||
17
run.sh
17
run.sh
|
|
@ -6,11 +6,12 @@
|
|||
nthread=32
|
||||
#nthread=64
|
||||
#nthread=128
|
||||
#input=/home/zzh/data/bam/zy_normal.bam
|
||||
#input=/home/zzh/data/bam/normal_all.sam
|
||||
input=/home/zzh/data/bam/zy_normal.bam
|
||||
#input=/home/zzh/data/bam/normal_all.sam
|
||||
#input=/home/zzh/data/bam/zy_tumor.bam
|
||||
#input=/home/zzh/data/wgs_na12878.bam
|
||||
input=~/data/bam/100w.bam
|
||||
#input=~/data/bam/100w.bam
|
||||
#input=~/data/bam/t100w.sam
|
||||
#input=~/data/bam/1k.sam
|
||||
#input=~/data/bam/1kw.sam
|
||||
|
|
@ -19,7 +20,9 @@ input=~/data/bam/100w.bam
|
|||
#input=~/data/bam/tumor_small.bam
|
||||
#input=~/data/bam/normal_small.bam
|
||||
#input=~/data/bam/normal_30.bam
|
||||
output=/home/zzh/data1/na12878_out.bam
|
||||
#output=/home/zzh/data1/na12878_out.bam
|
||||
output=./out.bam
|
||||
metrics=./metrics.txt
|
||||
|
||||
cd ./build/ && make -j 8 && cd ..
|
||||
#./out.sam \
|
||||
|
|
@ -27,11 +30,11 @@ time /home/zzh/work/ngs/picard_cpp/build/bin/picard_cpp \
|
|||
MarkDuplicates \
|
||||
--INPUT $input \
|
||||
--OUTPUT $output \
|
||||
--INDEX_FORMAT BAI \
|
||||
--num_threads $nthread \
|
||||
--max_mem 2G \
|
||||
--METRICS_FILE $metrics \
|
||||
--INDEX_FORMAT CSI \
|
||||
--num_threads $nthread \
|
||||
--verbosity DEBUG \
|
||||
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true --READ_NAME_REGEX null #
|
||||
--asyncio true -TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX null #
|
||||
#--TAG_DUPLICATE_SET_MEMBERS true #--READ_NAME_REGEX ""
|
||||
#--READ_NAME_REGEX ".*?([0-9]+):([0-9]+):([0-9]+)$"
|
||||
#--READ_NAME_REGEX ""
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ void GlobalArg::initGlobalOptions() {
|
|||
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
|
||||
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
|
||||
v.push_back({"num_threads", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_NUM_THREADS});
|
||||
v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
|
||||
// v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
|
||||
v.push_back({"verbosity", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_LOG_LEVEL});
|
||||
v.push_back({"asyncio", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_ASYNCIO});
|
||||
v.push_back({"version", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_VERSION});
|
||||
|
|
@ -43,7 +43,7 @@ void GlobalArg::initGlobalOptions() {
|
|||
vArgInfo.push_back("--INPUT Input file path (bam, vcf ...)\n");
|
||||
vArgInfo.push_back("--OUTPUT Output file path \n");
|
||||
vArgInfo.push_back("--num_threads <num_threads> Number of threads to allocate to this analysis [1]\n");
|
||||
vArgInfo.push_back("--max_mem <max_mem> Set maximum memory; suffix K/M/G recognized [2G]\n");
|
||||
// vArgInfo.push_back("--max_mem <max_mem> Set maximum memory; suffix K/M/G recognized [2G]\n");
|
||||
vArgInfo.push_back("--verbosity <log level> Control verbosity of logging. error/warning/info/debug [info]\n");
|
||||
vArgInfo.push_back("--asyncio Use async io [true]\n");
|
||||
vArgInfo.push_back("--version Output version information\n");
|
||||
|
|
@ -73,8 +73,8 @@ void GlobalArg::parseArgument(int argNum) {
|
|||
else if (*q == 'g' || *q == 'G')
|
||||
mem_arg <<= 30;
|
||||
//if (mem_arg >= max_mem)
|
||||
if (true)
|
||||
max_mem = mem_arg;
|
||||
//if (true)
|
||||
// max_mem = mem_arg;
|
||||
else {
|
||||
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||
}
|
||||
|
|
@ -104,10 +104,10 @@ void GlobalArg::parseArgument(int argNum) {
|
|||
}
|
||||
|
||||
string GlobalArg::GetArgValueString() {
|
||||
printf("--INPUT = %s\n", in_fn.c_str());
|
||||
printf("--INPUT = %s\n", in_fn.c_str());
|
||||
printf("--OUTPUT = %s\n", out_fn.c_str());
|
||||
printf("--num_threads = %d\n", num_threads);
|
||||
printf("--max_mem = %ld\n", max_mem);
|
||||
// printf("--max_mem = %ld\n", max_mem);
|
||||
printf("--verbosity = %d\n", verbosity);
|
||||
printf("--asyncio = %d\n", use_asyncio);
|
||||
|
||||
|
|
|
|||
|
|
@ -48,7 +48,8 @@ struct GlobalArg {
|
|||
string in_fn; // input bam filename
|
||||
string out_fn; // output bam filename
|
||||
int num_threads = 1; // 线程个数
|
||||
size_t max_mem = ((size_t)1) << 30; // 最小1G
|
||||
// size_t max_mem = ((size_t)3) << 29; // 最小1.5G
|
||||
size_t max_mem = ((size_t)1) << 31; // 最小2G
|
||||
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
|
||||
bool use_asyncio = true; // 是否使用异步io
|
||||
|
||||
|
|
|
|||
|
|
@ -64,6 +64,16 @@ void Timer::print_current_time() {
|
|||
puts(now);
|
||||
}
|
||||
|
||||
string Timer::get_current_time_str() {
|
||||
time_t time_val;
|
||||
struct tm *at;
|
||||
char now[80];
|
||||
time(&time_val);
|
||||
at = localtime(&time_val);
|
||||
strftime(now, 79, "%B %d, %Y at %I:%M:%S %p %Z", at);
|
||||
return string(now);
|
||||
}
|
||||
|
||||
double Timer::get_mseconds() {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@
|
|||
#ifndef TIMER_H_
|
||||
#define TIMER_H_
|
||||
|
||||
#include <string>
|
||||
using std::string;
|
||||
|
||||
/*
|
||||
* @brief: Record the run time of this program
|
||||
*/
|
||||
|
|
@ -27,6 +30,7 @@ public:
|
|||
|
||||
static void log_time(const char *desc);
|
||||
static void print_current_time();
|
||||
static string get_current_time_str();
|
||||
static double get_mseconds();
|
||||
static double get_seconds();
|
||||
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ struct DuplicationMetrics {
|
|||
uint64_t ESTIMATED_LIBRARY_SIZE = 0;
|
||||
|
||||
// 其他的统计数据
|
||||
vector<double> CoverageMult;
|
||||
|
||||
// 比如在该位置,有4个冗余,那么所有4个冗余的位置数量
|
||||
MDMap DuplicateCountHist;
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ Date : 2023/10/23
|
|||
#include <sam/utils/read_ends.h>
|
||||
#include <sam/utils/read_name_parser.h>
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
|
|
@ -39,7 +40,8 @@ using std::string;
|
|||
|
||||
#define SMA_TAG_PG "PG"
|
||||
|
||||
#define BAM_BLOCK_SIZE 32L * 1024 * 1024
|
||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024
|
||||
// #define BAM_BLOCK_SIZE 32L * 1024 * 1024
|
||||
#define NO_SUCH_INDEX INT64_MAX
|
||||
|
||||
Timer tm_arr[50]; // 用来测试性能
|
||||
|
|
@ -107,6 +109,34 @@ static int64_t estimateLibrarySize(int64_t readPairs, int64_t uniqueReadPairs) {
|
|||
return librarySize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimates the ROI (return on investment) that one would see if a library was sequenced to
|
||||
* x higher coverage than the observed coverage.
|
||||
*
|
||||
* @param estimatedLibrarySize the estimated number of molecules in the library
|
||||
* @param x the multiple of sequencing to be simulated (i.e. how many X sequencing)
|
||||
* @param pairs the number of pairs observed in the actual sequencing
|
||||
* @param uniquePairs the number of unique pairs observed in the actual sequencing
|
||||
* @return a number z <= x that estimates if you had pairs*x as your sequencing then you
|
||||
* would observe uniquePairs*z unique pairs.
|
||||
*/
|
||||
static double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) {
|
||||
return estimatedLibrarySize * (1 - exp(-(x * pairs) / estimatedLibrarySize)) / uniquePairs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates a histogram using the estimateRoi method to estimate the effective yield
|
||||
* doing x sequencing for x=1..10.
|
||||
*/
|
||||
static void calculateRoiHistogram(DuplicationMetrics &metrics) {
|
||||
int64_t uniquePairs = metrics.READ_PAIRS_EXAMINED - metrics.READ_PAIR_DUPLICATES;
|
||||
metrics.CoverageMult.resize(101);
|
||||
for (int x = 1; x <= 100; ++x) {
|
||||
metrics.CoverageMult[x] =
|
||||
estimateRoi(metrics.ESTIMATED_LIBRARY_SIZE, x, metrics.READ_PAIRS_EXAMINED, uniquePairs);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* 获取文件名后缀
|
||||
*/
|
||||
|
|
@ -127,7 +157,7 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
Timer time_all;
|
||||
|
||||
/* 读取命令行参数 */
|
||||
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
||||
g_mdArg.parseArgument(argc, argv, &g_gArg); // 解析命令行参数
|
||||
if (g_gArg.num_threads < 1)
|
||||
g_gArg.num_threads = 1; // 线程数不能小于1
|
||||
|
||||
|
|
@ -150,10 +180,10 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
/* 利用线程池对输入输出文件进行读写 */
|
||||
htsThreadPool htsPoolRead = {NULL, 0}; // 多线程读取,创建线程池
|
||||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||
// htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
// htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
htsPoolRead.pool = hts_tpool_init(32);
|
||||
htsPoolWrite.pool = hts_tpool_init(32);
|
||||
htsPoolRead.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
htsPoolWrite.pool = hts_tpool_init(g_gArg.num_threads);
|
||||
//htsPoolRead.pool = hts_tpool_init(16);
|
||||
//htsPoolWrite.pool = hts_tpool_init(32);
|
||||
if (!htsPoolRead.pool || !htsPoolWrite.pool) {
|
||||
Error("[%d] failed to set up thread pool", __LINE__);
|
||||
sam_close(g_inBamFp);
|
||||
|
|
@ -237,7 +267,7 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
uint32_t duplicateSetSize = 0;
|
||||
|
||||
int64_t realDupSize = 0;
|
||||
// exit(0);
|
||||
// exit(0);
|
||||
while (inBuf.ReadStat() >= 0) {
|
||||
Timer tw1;
|
||||
size_t readNum = inBuf.ReadBam();
|
||||
|
|
@ -335,7 +365,7 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
|
||||
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS && isInDuplicateSet) {
|
||||
if (!bw->IsSecondaryOrSupplementary() && !bw->GetReadUnmappedFlag()) {
|
||||
cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
|
||||
// cerr << bamIdx << " " << representativeReadIndexInFile << " " << duplicateSetSize << endl;
|
||||
uint8_t *oldTagVal = bam_aux_get(bw->b, g_mdArg.DUPLICATE_SET_INDEX_TAG.c_str());
|
||||
if (oldTagVal != NULL)
|
||||
bam_aux_del(bw->b, oldTagVal);
|
||||
|
|
@ -391,6 +421,7 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
} else {
|
||||
gMetrics.PERCENT_DUPLICATION = 0;
|
||||
}
|
||||
calculateRoiHistogram(gMetrics);
|
||||
|
||||
cout << "metrics: \n"
|
||||
<< "LIBRARY: " << gMetrics.LIBRARY << "\n"
|
||||
|
|
@ -404,12 +435,46 @@ int MarkDuplicates(int argc, char *argv[]) {
|
|||
<< "PERCENT_DUPLICATION: " << gMetrics.PERCENT_DUPLICATION << "\n"
|
||||
<< "ESTIMATED_LIBRARY_SIZE: " << gMetrics.ESTIMATED_LIBRARY_SIZE << endl;
|
||||
|
||||
// 将统计信息写到文件里
|
||||
if (!g_mdArg.METRICS_FILE.empty()) {
|
||||
ofstream ofsM(g_mdArg.METRICS_FILE);
|
||||
ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl;
|
||||
ofsM << "# " << g_mdArg.PROGRAM_RECORD_ID << " " << g_gArg.GetArgValueString() << " " << g_mdArg.GetArgValueString() << endl;
|
||||
ofsM << "## htsjdk.samtools.metrics.StringHeader" << endl;
|
||||
ofsM << "# Started on: " << Timer::get_current_time_str() << endl << endl;
|
||||
ofsM << "## METRICS CLASS\torg.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics" << endl;
|
||||
ofsM << "LIBRARY\tUNPAIRED_READS_EXAMINED\tREAD_PAIRS_EXAMINED\tSECONDARY_OR_SUPPLEMENTARY_RDS\tUNMAPPED_"
|
||||
"READS\tUNPAIRED_READ_DUPLICATES\tREAD_PAIR_DUPLICATES\tREAD_PAIR_OPTICAL_DUPLICATES\tPERCENT_"
|
||||
"DUPLICATION\tESTIMATED_LIBRARY_SIZE"
|
||||
<< endl;
|
||||
ofsM << gMetrics.LIBRARY << "\t" << gMetrics.UNPAIRED_READS_EXAMINED << "\t" << gMetrics.READ_PAIRS_EXAMINED
|
||||
<< "\t" << gMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS << "\t" << gMetrics.UNMAPPED_READS << "\t"
|
||||
<< gMetrics.UNPAIRED_READ_DUPLICATES << "\t" << gMetrics.READ_PAIR_DUPLICATES << "\t"
|
||||
<< gMetrics.READ_PAIR_OPTICAL_DUPLICATES << "\t" << gMetrics.PERCENT_DUPLICATION << "\t"
|
||||
<< gMetrics.ESTIMATED_LIBRARY_SIZE << endl << endl;
|
||||
ofsM << "## HISTOGRAM\tjava.lang.Double" << endl;
|
||||
ofsM << "BIN CoverageMult" << endl;
|
||||
for (int i = 1; i <= 100; ++i) {
|
||||
ofsM << i << "\t" << std::fixed << std::setprecision(6) << gMetrics.CoverageMult[i] << endl;
|
||||
}
|
||||
ofsM.close();
|
||||
}
|
||||
|
||||
#if 0
|
||||
cout << "CoverageMult:" << endl;
|
||||
for (int i = 1; i <= 100; ++i) {
|
||||
cout << i << "\t" << gMetrics.CoverageMult[i] << endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
cout << "singleton pos: " << gMetrics.singletonReads.size() << endl;
|
||||
for (auto &itr : gMetrics.DuplicateCountHist) cout << "dup counts: " << itr.first << ": " << itr.second << endl;
|
||||
for (auto &itr : gMetrics.NonOpticalDuplicateCountHist)
|
||||
cout << "non optical dup counts: " << itr.first << ": " << itr.second << endl;
|
||||
for (auto &itr : gMetrics.OpticalDuplicatesCountHist)
|
||||
cout << "optical dup counts: " << itr.first << ": " << itr.second << endl;
|
||||
#endif
|
||||
cout << "dupsize: " << dupIdxQue.Size() << "\t" << opticalIdxQue.Size() << "\t" << repIdxQue.Size() << endl;
|
||||
if (!indexFn.empty() && sam_idx_save(g_outBamFp) < 0) {
|
||||
Error("writing index failed");
|
||||
|
|
|
|||
|
|
@ -253,6 +253,7 @@ void MarkDupsArg::printArgValue()
|
|||
|
||||
string MarkDupsArg::GetArgValueString() {
|
||||
vector<string> argVals;
|
||||
if (!this->METRICS_FILE.empty()) { argVals.push_back("--METRICS_FILE");argVals.push_back(this->METRICS_FILE); }
|
||||
argVals.push_back("--READ_NAME_REGEX");
|
||||
if (this->READ_NAME_REGEX.empty()) argVals.push_back("null");
|
||||
else argVals.push_back(this->READ_NAME_REGEX.c_str());
|
||||
|
|
|
|||
|
|
@ -660,7 +660,6 @@ static void doIntersect(PipelineArg &pipeArg) {
|
|||
addToGlobal.insert(prevPosKey);
|
||||
}
|
||||
}
|
||||
// return;
|
||||
map<int64_t, TaskSeqDupInfo> taskChanged;
|
||||
MDSet<int64_t> posProcessed;
|
||||
for (auto &e : recalcPos) {
|
||||
|
|
@ -683,10 +682,11 @@ static void doIntersect(PipelineArg &pipeArg) {
|
|||
}
|
||||
// 最后再添加,以防开始赋值,后来这个位置要是又添加了新的数据
|
||||
// 放在这里,因为lp中的unpairedPosArr中的readends可能会被修改(比如optical duplicate)
|
||||
// for (auto posKey : addToGlobal) {
|
||||
// g.unpairedPosArr[posKey] = lp.unpairedPosArr[posKey];
|
||||
// }
|
||||
// return;
|
||||
#if 0
|
||||
for (auto posKey : addToGlobal) {
|
||||
g.unpairedPosArr[posKey] = lpSM.unpairedPosArr[posKey];
|
||||
}
|
||||
#endif
|
||||
/* 不需要把p中找不到lp的unpair,放到global中,否则最后找到pair后,还要再执行一次冗余检测,造成重复的冗余索引 */
|
||||
|
||||
// 更新结果
|
||||
|
|
|
|||
|
|
@ -30,10 +30,10 @@ struct PhysicalLocation {
|
|||
* overflow within a HiSeqX tile.
|
||||
*/
|
||||
int16_t tile = -1;
|
||||
int32_t x = -1;
|
||||
int32_t y = -1;
|
||||
//int16_t x = -1;
|
||||
//int16_t y = -1;
|
||||
//int32_t x = -1;
|
||||
//int32_t y = -1;
|
||||
int16_t x = -1;
|
||||
int16_t y = -1;
|
||||
};
|
||||
|
||||
/* 包含了所有read ends信息,如picard里边的 ReadEndsForMarkDuplicates*/
|
||||
|
|
|
|||
Loading…
Reference in New Issue