基本完成了参数的处理,帮助信息里有些参数需要删掉
This commit is contained in:
parent
95b0f685ff
commit
023836a047
|
|
@ -12,7 +12,14 @@
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
"program": "${workspaceRoot}/build/bin/picard_cpp",
|
||||||
"args": [
|
"args": [
|
||||||
"MarkDuplicates"
|
"MarkDuplicates",
|
||||||
|
"--INPUT", "test.bam",
|
||||||
|
"--OUTPUT", "out.bam",
|
||||||
|
"--METRICS_FILE", "metrics.txt",
|
||||||
|
"--num_threads", "12",
|
||||||
|
"--max_mem", "4G",
|
||||||
|
"--verbosity", "DEBUG",
|
||||||
|
"--asyncio", "true",
|
||||||
],
|
],
|
||||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
{
|
{
|
||||||
"files.associations": {
|
"files.associations": {
|
||||||
"cstring": "cpp"
|
"cstring": "cpp",
|
||||||
|
"vector": "cpp",
|
||||||
|
"random": "cpp"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
#!/bin/bash
|
||||||
|
dir="/home/zzh/work/GeneKit/picard_cpp/build"
|
||||||
|
#[ -d "$dir" ] && rm -rf "$dir"
|
||||||
|
#mkdir "$dir"
|
||||||
|
cd "$dir"
|
||||||
|
cmake .. -DCMAKE_BUILD_TYPE=Debug
|
||||||
|
#cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||||
|
make -j 8
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
/home/zzh/work/GeneKit/picard_cpp/build/bin/picard_cpp \
|
||||||
|
MarkDuplicates \
|
||||||
|
--INPUT test.bam \
|
||||||
|
--OUTPUT out.bam \
|
||||||
|
--num_threads 12 \
|
||||||
|
--max_mem 4G \
|
||||||
|
--verbosity DEBUG \
|
||||||
|
--asyncio true
|
||||||
|
|
@ -3,7 +3,7 @@ SET(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin")
|
||||||
|
|
||||||
# 源码目录
|
# 源码目录
|
||||||
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src MAIN_SRC)
|
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src MAIN_SRC)
|
||||||
# AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/common COMMON)
|
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/common COMMON)
|
||||||
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam SAM_SRC)
|
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam SAM_SRC)
|
||||||
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam/markdups SAM_MARKDUPS_SRC)
|
AUX_SOURCE_DIRECTORY(${PROJECT_SOURCE_DIR}/src/sam/markdups SAM_MARKDUPS_SRC)
|
||||||
|
|
||||||
|
|
@ -19,7 +19,7 @@ LINK_DIRECTORIES("${PROJECT_SOURCE_DIR}/lib/htslib")
|
||||||
set(PG_NAME "picard_cpp")
|
set(PG_NAME "picard_cpp")
|
||||||
|
|
||||||
# 为程序添加依赖关系
|
# 为程序添加依赖关系
|
||||||
ADD_EXECUTABLE(${PG_NAME} ${MAIN_SRC} ${SAM_SRC} ${SAM_MARKDUPS_SRC})
|
ADD_EXECUTABLE(${PG_NAME} ${MAIN_SRC} ${COMMON} ${SAM_SRC} ${SAM_MARKDUPS_SRC})
|
||||||
|
|
||||||
# 链接库
|
# 链接库
|
||||||
TARGET_LINK_LIBRARIES(${PG_NAME} libhts.a)
|
TARGET_LINK_LIBRARIES(${PG_NAME} libhts.a)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,109 @@
|
||||||
|
/*
|
||||||
|
Description: 全局参数,所有模块都可能用到的参数
|
||||||
|
|
||||||
|
Copyright : All right reserved by NCIC.ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2023/10/23
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "global_arg.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GlobalArg 类
|
||||||
|
*/
|
||||||
|
|
||||||
|
struct option *GlobalArg::GLOBAL_OPT = nullptr;
|
||||||
|
|
||||||
|
// 初始化参数
|
||||||
|
void GlobalArg::initGlobalOptions()
|
||||||
|
{
|
||||||
|
vector<struct option> v;
|
||||||
|
v.push_back({"INPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_INPUT}); // 输入文件
|
||||||
|
v.push_back({"OUTPUT", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_OUTPUT}); // 输出文件
|
||||||
|
v.push_back({"num_threads", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_NUM_THREADS});
|
||||||
|
v.push_back({"max_mem", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_MAX_MEM});
|
||||||
|
v.push_back({"verbosity", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_LOG_LEVEL});
|
||||||
|
v.push_back({"asyncio", required_argument, NULL, ns_ga::GlobalOptEnum::OPT_ASYNCIO});
|
||||||
|
v.push_back({"version", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_VERSION});
|
||||||
|
v.push_back({"help", no_argument, NULL, ns_ga::GlobalOptEnum::OPT_HELP});
|
||||||
|
v.push_back({0, 0, 0, 0});
|
||||||
|
|
||||||
|
GLOBAL_OPT = new struct option[GLOBAL_ARG_CNT];
|
||||||
|
memcpy(GLOBAL_OPT, v.data(), v.size() * sizeof(struct option));
|
||||||
|
|
||||||
|
/* 添加帮助信息, 按arg enum顺序进行添加信息 */
|
||||||
|
vArgInfo.push_back("--INPUT Input file path (bam, vcf ...)\n");
|
||||||
|
vArgInfo.push_back("--OUTPUT Output file path \n");
|
||||||
|
vArgInfo.push_back("--num_threads <num_threads> Number of threads to allocate to this analysis [1]\n");
|
||||||
|
vArgInfo.push_back("--max_mem <max_mem> Set maximum memory; suffix K/M/G recognized [2G]\n");
|
||||||
|
vArgInfo.push_back("--verbosity <log level> Control verbosity of logging. error/warning/info/debug [info]\n");
|
||||||
|
vArgInfo.push_back("--asyncio Use async io [true]\n");
|
||||||
|
vArgInfo.push_back("--version Output version information\n");
|
||||||
|
vArgInfo.push_back("--help Generate the help message\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// 解析参数
|
||||||
|
void GlobalArg::parseArgument(int argNum)
|
||||||
|
{
|
||||||
|
using namespace ns_ga;
|
||||||
|
switch (argNum)
|
||||||
|
{
|
||||||
|
case OPT_INPUT:
|
||||||
|
in_fn = optarg;
|
||||||
|
break;
|
||||||
|
case OPT_OUTPUT:
|
||||||
|
out_fn = optarg;
|
||||||
|
break;
|
||||||
|
case OPT_NUM_THREADS:
|
||||||
|
num_threads = std::stoi(optarg);
|
||||||
|
break;
|
||||||
|
case OPT_MAX_MEM:
|
||||||
|
{
|
||||||
|
char *q;
|
||||||
|
size_t mem_arg = strtol(optarg, &q, 0);
|
||||||
|
if (*q == 'k' || *q == 'K')
|
||||||
|
mem_arg <<= 10;
|
||||||
|
else if (*q == 'm' || *q == 'M')
|
||||||
|
mem_arg <<= 20;
|
||||||
|
else if (*q == 'g' || *q == 'G')
|
||||||
|
mem_arg <<= 30;
|
||||||
|
if (mem_arg >= max_mem)
|
||||||
|
max_mem = mem_arg;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cerr << "[Warn] Too small mem size, use default" << std::endl;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case OPT_LOG_LEVEL:
|
||||||
|
{
|
||||||
|
if (strcmp("ERROR", optarg) == 0)
|
||||||
|
verbosity = ns_ga::ERROR;
|
||||||
|
else if (strcmp("WARNING", optarg) == 0)
|
||||||
|
verbosity = ns_ga::WARNING;
|
||||||
|
else if (strcmp("INFO", optarg) == 0)
|
||||||
|
verbosity = ns_ga::INFO;
|
||||||
|
else if (strcmp("DEBUG", optarg) == 0)
|
||||||
|
verbosity = ns_ga::DEBUG;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case OPT_ASYNCIO:
|
||||||
|
{
|
||||||
|
if (strcmp("true", optarg) == 0)
|
||||||
|
use_asyncio = true;
|
||||||
|
else if (strcmp("false", optarg) == 0)
|
||||||
|
use_asyncio = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,105 @@
|
||||||
|
/*
|
||||||
|
Description: picard_cpp共享的一些参数
|
||||||
|
|
||||||
|
Copyright : All right reserved by NCIC.ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2023/10/23
|
||||||
|
*/
|
||||||
|
#ifndef GLOBAL_ARG_H_
|
||||||
|
#define GLOBAL_ARG_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
using std::map;
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
namespace ns_ga {
|
||||||
|
enum GlobalOptEnum
|
||||||
|
{
|
||||||
|
_START_NUM = 1,
|
||||||
|
OPT_INPUT,
|
||||||
|
OPT_OUTPUT,
|
||||||
|
OPT_NUM_THREADS,
|
||||||
|
OPT_MAX_MEM,
|
||||||
|
OPT_LOG_LEVEL,
|
||||||
|
OPT_ASYNCIO,
|
||||||
|
OPT_VERSION,
|
||||||
|
OPT_HELP,
|
||||||
|
_END_NUM
|
||||||
|
};
|
||||||
|
|
||||||
|
// log level
|
||||||
|
enum LogLevelEnum
|
||||||
|
{
|
||||||
|
ERROR,
|
||||||
|
WARNING,
|
||||||
|
INFO,
|
||||||
|
DEBUG
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 全局共享的一些参数 */
|
||||||
|
struct GlobalArg
|
||||||
|
{
|
||||||
|
const static int GLOBAL_ARG_CNT = ns_ga::GlobalOptEnum::_END_NUM - ns_ga::GlobalOptEnum::_START_NUM; // 这里不需要减1
|
||||||
|
static struct option *GLOBAL_OPT;
|
||||||
|
|
||||||
|
string in_fn; // input bam filename
|
||||||
|
string out_fn; // output bam filename
|
||||||
|
int num_threads = 1; // 线程个数
|
||||||
|
size_t max_mem = ((size_t)2) << 30; // 最小2G
|
||||||
|
ns_ga::LogLevelEnum verbosity = ns_ga::INFO; // 打印信息级别
|
||||||
|
bool use_asyncio = true; // 是否使用异步io
|
||||||
|
|
||||||
|
vector<string> vArgInfo; // 每个参数的帮助信息
|
||||||
|
|
||||||
|
// 单例模式
|
||||||
|
GlobalArg(const GlobalArg &) = delete;
|
||||||
|
GlobalArg &operator=(const GlobalArg &) = delete;
|
||||||
|
|
||||||
|
// 获取单例
|
||||||
|
static GlobalArg &Instance()
|
||||||
|
{
|
||||||
|
static GlobalArg instance;
|
||||||
|
return instance;
|
||||||
|
}
|
||||||
|
// 初始化参数
|
||||||
|
void initGlobalOptions();
|
||||||
|
|
||||||
|
// 解析参数
|
||||||
|
void parseArgument(int argNum);
|
||||||
|
|
||||||
|
// 获取对应参数在数组(option和help info)中的索引
|
||||||
|
int getArgIndx(ns_ga::GlobalOptEnum opt)
|
||||||
|
{
|
||||||
|
return opt - ns_ga::GlobalOptEnum::OPT_INPUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 打印某个参数的帮助信息
|
||||||
|
void printArgInfo(ns_ga::GlobalOptEnum arg) {
|
||||||
|
int idx = getArgIndx(arg);
|
||||||
|
fprintf(stdout, "%s\n", vArgInfo[idx].c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
void printArgValue() {
|
||||||
|
printf("--INPUT = %s\n", in_fn.c_str());
|
||||||
|
printf("--OUTPUT = %s\n", out_fn.c_str());
|
||||||
|
printf("--num_threads = %d\n",num_threads);
|
||||||
|
printf("--max_mem = %ld\n", max_mem);
|
||||||
|
printf("--verbosity = %d\n", verbosity);
|
||||||
|
printf("--asyncio = %d\n", use_asyncio);
|
||||||
|
}
|
||||||
|
private :
|
||||||
|
GlobalArg()
|
||||||
|
{
|
||||||
|
initGlobalOptions();
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -6,6 +6,8 @@ Copyright : All right reserved by ICT
|
||||||
Author : Zhang Zhonghai
|
Author : Zhang Zhonghai
|
||||||
Date : 2023/10/23
|
Date : 2023/10/23
|
||||||
*/
|
*/
|
||||||
|
#include "markdups_arg.h"
|
||||||
|
#include <common/global_arg.h>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
|
|
@ -16,11 +18,18 @@ using namespace std;
|
||||||
*/
|
*/
|
||||||
int MarkDuplicates(int argc, char *argv[])
|
int MarkDuplicates(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
cout << argc << endl;
|
// cout << argc << endl;
|
||||||
for (int i = 0; i < argc; ++i) {
|
// for (int i = 0; i < argc; ++i) {
|
||||||
cout << argv[i] << '\t';
|
// cout << argv[i] << '\t';
|
||||||
}
|
// }
|
||||||
cout << endl;
|
// cout << endl;
|
||||||
|
|
||||||
|
GlobalArg &gArg = GlobalArg::Instance();
|
||||||
|
MarkDupsArg mdArg;
|
||||||
|
vector<AuxVar> vAuxVar;
|
||||||
|
mdArg.parseArgument(argc, argv, &vAuxVar, &gArg);
|
||||||
|
|
||||||
|
// cout << ns_md::ValidationStringency::DEFAULT_STRINGENCY << '\t' << ns_md::ValidationStringency::SILENT << endl;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,488 @@
|
||||||
|
/*
|
||||||
|
Description: Markduplicate需要用到的一些参数,读取命令行给的参数,并做一些初始化
|
||||||
|
|
||||||
|
Copyright : All right reserved by ICT
|
||||||
|
|
||||||
|
Author : Zhang Zhonghai
|
||||||
|
Date : 2023/10/27
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "markdups_arg.h"
|
||||||
|
#include "common/global_arg.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
using std::cout, std::endl;
|
||||||
|
|
||||||
|
using std::ostringstream;
|
||||||
|
using std::stod;
|
||||||
|
using std::stoi;
|
||||||
|
using std::stol;
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
using namespace ns_md;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* mutect参数
|
||||||
|
*/
|
||||||
|
const static struct option kMdOpts[] = {
|
||||||
|
{"MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP", required_argument, NULL, MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP},
|
||||||
|
{"MAX_FILE_HANDLES_FOR_READ_ENDS_MAP", required_argument, NULL, MAX_FILE_HANDLES_FOR_READ_ENDS_MAP},
|
||||||
|
{"SORTING_COLLECTION_SIZE_RATIO", required_argument, NULL, SORTING_COLLECTION_SIZE_RATIO},
|
||||||
|
{"BARCODE_TAG", required_argument, NULL, BARCODE_TAG},
|
||||||
|
{"READ_ONE_BARCODE_TAG", required_argument, NULL, READ_ONE_BARCODE_TAG},
|
||||||
|
{"READ_TWO_BARCODE_TAG", required_argument, NULL, READ_TWO_BARCODE_TAG},
|
||||||
|
{"TAG_DUPLICATE_SET_MEMBERS", required_argument, NULL, TAG_DUPLICATE_SET_MEMBERS},
|
||||||
|
{"REMOVE_SEQUENCING_DUPLICATES", required_argument, NULL, REMOVE_SEQUENCING_DUPLICATES},
|
||||||
|
{"TAGGING_POLICY", required_argument, NULL, TAGGING_POLICY},
|
||||||
|
{"CLEAR_DT", required_argument, NULL, CLEAR_DT},
|
||||||
|
{"DUPLEX_UMI", required_argument, NULL, DUPLEX_UMI},
|
||||||
|
{"MOLECULAR_IDENTIFIER_TAG", required_argument, NULL, MOLECULAR_IDENTIFIER_TAG},
|
||||||
|
{"METRICS_FILE", required_argument, NULL, METRICS_FILE},
|
||||||
|
{"REMOVE_DUPLICATES", required_argument, NULL, REMOVE_DUPLICATES},
|
||||||
|
{"ASSUME_SORTED", required_argument, NULL, ASSUME_SORTED},
|
||||||
|
{"ASSUME_SORT_ORDER", required_argument, NULL, ASSUME_SORT_ORDER},
|
||||||
|
{"DUPLICATE_SCORING_STRATEGY", required_argument, NULL, DUPLICATE_SCORING_STRATEGY},
|
||||||
|
{"PROGRAM_RECORD_ID", required_argument, NULL, PROGRAM_RECORD_ID},
|
||||||
|
{"PROGRAM_GROUP_VERSION", required_argument, NULL, PROGRAM_GROUP_VERSION},
|
||||||
|
{"PROGRAM_GROUP_COMMAND_LINE", required_argument, NULL, PROGRAM_GROUP_COMMAND_LINE},
|
||||||
|
{"PROGRAM_GROUP_NAME", required_argument, NULL, PROGRAM_GROUP_NAME},
|
||||||
|
{"COMMENT", required_argument, NULL, COMMENT},
|
||||||
|
{"READ_NAME_REGEX", required_argument, NULL, READ_NAME_REGEX},
|
||||||
|
{"OPTICAL_DUPLICATE_PIXEL_DISTANCE", required_argument, NULL, OPTICAL_DUPLICATE_PIXEL_DISTANCE},
|
||||||
|
{"MAX_OPTICAL_DUPLICATE_SET_SIZE", required_argument, NULL, MAX_OPTICAL_DUPLICATE_SET_SIZE},
|
||||||
|
{"QUIET", required_argument, NULL, QUIET},
|
||||||
|
{"VALIDATION_STRINGENCY", required_argument, NULL, VALIDATION_STRINGENCY},
|
||||||
|
{"COMPRESSION_LEVEL", required_argument, NULL, COMPRESSION_LEVEL},
|
||||||
|
{"MAX_RECORDS_IN_RAM", required_argument, NULL, MAX_RECORDS_IN_RAM},
|
||||||
|
{"CREATE_INDEX", required_argument, NULL, CREATE_INDEX},
|
||||||
|
{"CREATE_MD5_FILE", required_argument, NULL, CREATE_MD5_FILE}};
|
||||||
|
|
||||||
|
// 判断bool类型的参数
|
||||||
|
void setBoolArg(bool *arg) {
|
||||||
|
if (strcmp("true", optarg) == 0)
|
||||||
|
*arg = true;
|
||||||
|
else if (strcmp("false", optarg) == 0)
|
||||||
|
*arg = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 解析参数
|
||||||
|
void MarkDupsArg::parseArgument(int argc,
|
||||||
|
char **argv,
|
||||||
|
vector<AuxVar> *pvAuxVar,
|
||||||
|
GlobalArg *pGArg)
|
||||||
|
{
|
||||||
|
auto &vAuxVar = *pvAuxVar;
|
||||||
|
auto &gArg = *pGArg;
|
||||||
|
|
||||||
|
struct option allOpt[MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT];
|
||||||
|
|
||||||
|
memcpy(allOpt, kMdOpts, MarkDupsArg::ARG_COUNT * sizeof(struct option));
|
||||||
|
memcpy(&allOpt[MarkDupsArg::ARG_COUNT], GlobalArg::GLOBAL_OPT, GlobalArg::GLOBAL_ARG_CNT * sizeof(struct option));
|
||||||
|
|
||||||
|
// int cnt = MarkDupsArg::ARG_COUNT + GlobalArg::GLOBAL_ARG_CNT;
|
||||||
|
// cout << cnt << endl;
|
||||||
|
// for (int i = 0; i < cnt; ++i)
|
||||||
|
// {
|
||||||
|
// cout << i << '\t' << allOpt[i].name << endl;
|
||||||
|
// }
|
||||||
|
|
||||||
|
int c;
|
||||||
|
while ((c = getopt_long_only(argc, argv, "", allOpt, NULL)) >= 0)
|
||||||
|
{
|
||||||
|
|
||||||
|
gArg.parseArgument(c);
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case ns_ga::OPT_VERSION:
|
||||||
|
PrintVersion();
|
||||||
|
exit(0);
|
||||||
|
case ns_ga::OPT_HELP:
|
||||||
|
PrintHelp();
|
||||||
|
exit(0);
|
||||||
|
case ns_md::MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP:
|
||||||
|
MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = stoi(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::MAX_FILE_HANDLES_FOR_READ_ENDS_MAP:
|
||||||
|
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = stoi(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::SORTING_COLLECTION_SIZE_RATIO:
|
||||||
|
SORTING_COLLECTION_SIZE_RATIO = stod(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::BARCODE_TAG:
|
||||||
|
BARCODE_TAG = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::READ_ONE_BARCODE_TAG:
|
||||||
|
READ_ONE_BARCODE_TAG = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::READ_TWO_BARCODE_TAG:
|
||||||
|
READ_TWO_BARCODE_TAG = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::TAG_DUPLICATE_SET_MEMBERS:
|
||||||
|
setBoolArg(&TAG_DUPLICATE_SET_MEMBERS);
|
||||||
|
break;
|
||||||
|
case ns_md::REMOVE_SEQUENCING_DUPLICATES:
|
||||||
|
setBoolArg(&REMOVE_SEQUENCING_DUPLICATES);
|
||||||
|
break;
|
||||||
|
case ns_md::TAGGING_POLICY:
|
||||||
|
if (strcmp("DontTag", optarg) == 0)
|
||||||
|
TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag;
|
||||||
|
else if (strcmp("OpticalOnly", optarg) == 0)
|
||||||
|
TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::OpticalOnly;
|
||||||
|
else if (strcmp("All", optarg) == 0)
|
||||||
|
TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::All;
|
||||||
|
break;
|
||||||
|
case ns_md::CLEAR_DT:
|
||||||
|
setBoolArg(&CLEAR_DT);
|
||||||
|
break;
|
||||||
|
case ns_md::DUPLEX_UMI:
|
||||||
|
setBoolArg(&DUPLEX_UMI);
|
||||||
|
break;
|
||||||
|
case ns_md::MOLECULAR_IDENTIFIER_TAG:
|
||||||
|
MOLECULAR_IDENTIFIER_TAG = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::METRICS_FILE:
|
||||||
|
METRICS_FILE = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::REMOVE_DUPLICATES:
|
||||||
|
setBoolArg(&REMOVE_DUPLICATES);
|
||||||
|
break;
|
||||||
|
case ns_md::ASSUME_SORTED:
|
||||||
|
setBoolArg(&ASSUME_SORTED);
|
||||||
|
break;
|
||||||
|
case ns_md::ASSUME_SORT_ORDER:
|
||||||
|
if (strcmp("unsorted", optarg) == 0)
|
||||||
|
ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;
|
||||||
|
else if (strcmp("queryname", optarg) == 0)
|
||||||
|
ASSUME_SORT_ORDER = ns_md::SortOrder::queryname;
|
||||||
|
else if (strcmp("coordinate", optarg) == 0)
|
||||||
|
ASSUME_SORT_ORDER = ns_md::SortOrder::coordinate;
|
||||||
|
else if (strcmp("duplicate", optarg) == 0)
|
||||||
|
ASSUME_SORT_ORDER = ns_md::SortOrder::duplicate;
|
||||||
|
else if (strcmp("unknown", optarg) == 0)
|
||||||
|
ASSUME_SORT_ORDER = ns_md::SortOrder::unknown;
|
||||||
|
break;
|
||||||
|
case ns_md::DUPLICATE_SCORING_STRATEGY:
|
||||||
|
if (strcmp("SUM_OF_BASE_QUALITIES", optarg) == 0)
|
||||||
|
DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::SUM_OF_BASE_QUALITIES;
|
||||||
|
else if (strcmp("TOTAL_MAPPED_REFERENCE_LENGTH", optarg) == 0)
|
||||||
|
DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;
|
||||||
|
else if (strcmp("RANDOM", optarg) == 0)
|
||||||
|
DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::RANDOM;
|
||||||
|
break;
|
||||||
|
case ns_md::PROGRAM_RECORD_ID:
|
||||||
|
PROGRAM_RECORD_ID = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::PROGRAM_GROUP_VERSION:
|
||||||
|
PROGRAM_GROUP_VERSION = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::PROGRAM_GROUP_COMMAND_LINE:
|
||||||
|
PROGRAM_GROUP_COMMAND_LINE = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::PROGRAM_GROUP_NAME:
|
||||||
|
PROGRAM_GROUP_NAME = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::COMMENT:
|
||||||
|
COMMENT.push_back(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::READ_NAME_REGEX:
|
||||||
|
READ_NAME_REGEX = optarg;
|
||||||
|
break;
|
||||||
|
case ns_md::OPTICAL_DUPLICATE_PIXEL_DISTANCE:
|
||||||
|
OPTICAL_DUPLICATE_PIXEL_DISTANCE = stoi(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::MAX_OPTICAL_DUPLICATE_SET_SIZE:
|
||||||
|
MAX_OPTICAL_DUPLICATE_SET_SIZE = stol(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::QUIET:
|
||||||
|
setBoolArg(&QUIET);
|
||||||
|
break;
|
||||||
|
case ns_md::VALIDATION_STRINGENCY:
|
||||||
|
if (strcmp("STRICT", optarg) == 0)
|
||||||
|
VALIDATION_STRINGENCY = ns_md::ValidationStringency::STRICT;
|
||||||
|
else if (strcmp("LENIENT", optarg) == 0)
|
||||||
|
VALIDATION_STRINGENCY = ns_md::ValidationStringency::LENIENT;
|
||||||
|
else if (strcmp("SILENT", optarg) == 0)
|
||||||
|
VALIDATION_STRINGENCY = ns_md::ValidationStringency::SILENT;
|
||||||
|
break;
|
||||||
|
case ns_md::COMPRESSION_LEVEL:
|
||||||
|
COMPRESSION_LEVEL = stoi(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::MAX_RECORDS_IN_RAM:
|
||||||
|
MAX_RECORDS_IN_RAM = stoi(optarg);
|
||||||
|
break;
|
||||||
|
case ns_md::CREATE_INDEX:
|
||||||
|
setBoolArg(&CREATE_INDEX);
|
||||||
|
break;
|
||||||
|
case ns_md::CREATE_MD5_FILE:
|
||||||
|
setBoolArg(&CREATE_MD5_FILE);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
gArg.printArgValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
// 打印版本信息
|
||||||
|
void MarkDupsArg::PrintVersion()
|
||||||
|
{
|
||||||
|
fprintf(stdout, "\n MarkDuplicate Version: %s\n", MARKDUPLICATE_VERSION);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 释放资源,关闭文件等
|
||||||
|
void MarkDupsArg::Finalize(MarkDupsArg *pMdArg,
|
||||||
|
vector<AuxVar> *pvAuxVar,
|
||||||
|
GlobalArg *pGArg)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
// 打印帮助信息
|
||||||
|
void MarkDupsArg::PrintHelp()
|
||||||
|
{
|
||||||
|
FILE *fp = stdout;
|
||||||
|
fprintf(fp,
|
||||||
|
"Usage: MarkDuplicates [arguments]\n"
|
||||||
|
"\n"
|
||||||
|
"Example:\n"
|
||||||
|
" ./picard_cpp MarkDuplicates --num_thread 4 --INPUT input.bam --OUTPUT marked_duplicates.bam --METRICS_FILE marked_dup_metrics.txt\n"
|
||||||
|
"\n"
|
||||||
|
"Required Arguments:\n"
|
||||||
|
"\n"
|
||||||
|
"--INPUT,-I <String> One or more input SAM, BAM or CRAM files to analyze. Must be coordinate sorted. This\n"
|
||||||
|
" argument must be specified at least once.Required.\n"
|
||||||
|
"\n"
|
||||||
|
"--METRICS_FILE,-M <File> File to write duplication metrics to Required.\n"
|
||||||
|
"\n"
|
||||||
|
"--OUTPUT,-O <File> The output file to write marked records to Required.\n"
|
||||||
|
"\n"
|
||||||
|
"\n"
|
||||||
|
"Optional Arguments:\n"
|
||||||
|
"\n"
|
||||||
|
"--ADD_PG_TAG_TO_READS <Boolean>\n"
|
||||||
|
" Add PG tag to each read in a SAM or BAM Default value: true. Possible values: {true,\n"
|
||||||
|
" false}\n"
|
||||||
|
"\n"
|
||||||
|
"--arguments_file <File> read one or more arguments files and add them to the command line This argument may be\n"
|
||||||
|
" specified 0 or more times. Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--ASSUME_SORT_ORDER,-ASO <SortOrder>\n"
|
||||||
|
" If not null, assume that the input file has this order even if the header says otherwise.\n"
|
||||||
|
" Default value: null. Possible values: {unsorted, queryname, coordinate, duplicate,\n"
|
||||||
|
" unknown} Cannot be used in conjunction with argument(s) ASSUME_SORTED (AS)\n"
|
||||||
|
"\n"
|
||||||
|
"\n"
|
||||||
|
"--ASSUME_SORTED,-AS <Boolean> If true, assume that the input file is coordinate sorted even if the header says\n"
|
||||||
|
" otherwise. Deprecated, used ASSUME_SORT_ORDER=coordinate instead. Default value: false.\n"
|
||||||
|
" Possible values: {true, false} Cannot be used in conjunction with argument(s)\n"
|
||||||
|
" ASSUME_SORT_ORDER (ASO)\n"
|
||||||
|
"\n"
|
||||||
|
"--BARCODE_TAG <String> Barcode SAM tag (ex. BC for 10X Genomics) Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--CLEAR_DT <Boolean> Clear DT tag from input SAM records. Should be set to false if input SAM doesn't have this\n"
|
||||||
|
" tag. Default true Default value: true. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--COMMENT,-CO <String> Comment(s) to include in the output file's header. This argument may be specified 0 or\n"
|
||||||
|
" more times. Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--COMPRESSION_LEVEL <Integer> Compression level for all compressed files created (e.g. BAM and VCF). Default value: 5.\n"
|
||||||
|
"\n"
|
||||||
|
"--CREATE_INDEX <Boolean> Whether to create an index when writing VCF or coordinate sorted BAM output. Default\n"
|
||||||
|
" value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--CREATE_MD5_FILE <Boolean> Whether to create an MD5 digest for any BAM or FASTQ files created. Default value:\n"
|
||||||
|
" false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--DUPLEX_UMI <Boolean> Treat UMIs as being duplex stranded. This option requires that the UMI consist of two\n"
|
||||||
|
" equal length strings that are separated by a hyphen (e.g. 'ATC-GTC'). Reads are considered\n"
|
||||||
|
" duplicates if, in addition to standard definition, have identical normalized UMIs. A UMI\n"
|
||||||
|
" from the 'bottom' strand is normalized by swapping its content around the hyphen (eg.\n"
|
||||||
|
" ATC-GTC becomes GTC-ATC). A UMI from the 'top' strand is already normalized as it is.\n"
|
||||||
|
" Both reads from a read pair considered top strand if the read 1 unclipped 5' coordinate is\n"
|
||||||
|
" less than the read 2 unclipped 5' coordinate. All chimeric reads and read fragments are\n"
|
||||||
|
" treated as having come from the top strand. With this option is it required that the\n"
|
||||||
|
" BARCODE_TAG hold non-normalized UMIs. Default false. Default value: false. Possible\n"
|
||||||
|
" values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--DUPLICATE_SCORING_STRATEGY,-DS <ScoringStrategy>\n"
|
||||||
|
" The scoring strategy for choosing the non-duplicate among candidates. Default value:\n"
|
||||||
|
" SUM_OF_BASE_QUALITIES. Possible values: {SUM_OF_BASE_QUALITIES,\n"
|
||||||
|
" TOTAL_MAPPED_REFERENCE_LENGTH, RANDOM}\n"
|
||||||
|
"\n"
|
||||||
|
"--FLOW_EFFECTIVE_QUALITY_THRESHOLD <Integer>\n"
|
||||||
|
" Threshold for considering a quality value high enough to be included when calculating\n"
|
||||||
|
" FLOW_QUALITY_SUM_STRATEGY calculation. Default value: 15.\n"
|
||||||
|
"\n"
|
||||||
|
"--FLOW_MODE <Boolean> enable parameters and behavior specific to flow based reads. Default value: false.\n"
|
||||||
|
" Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--FLOW_Q_IS_KNOWN_END <Boolean>\n"
|
||||||
|
" Treat position of read trimming based on quality as the known end (relevant for flow based\n"
|
||||||
|
" reads). Default false - if the read is trimmed on quality its end is not defined and the\n"
|
||||||
|
" read is duplicate of any read starting at the same place. Default value: false. Possible\n"
|
||||||
|
" values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--FLOW_QUALITY_SUM_STRATEGY <Boolean>\n"
|
||||||
|
" Use specific quality summing strategy for flow based reads. The strategy ensures that the\n"
|
||||||
|
" same (and correct) quality value is used for all bases of the same homopolymer. Default\n"
|
||||||
|
" value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--FLOW_SKIP_FIRST_N_FLOWS <Integer>\n"
|
||||||
|
" Skip first N flows, starting from the read's start, when considering duplicates. Useful\n"
|
||||||
|
" for flow based reads where sometimes there is noise in the first flows (for this argument,\n"
|
||||||
|
" \" read start \" means 5' end). Default value: 0.\n"
|
||||||
|
"\n"
|
||||||
|
"--help,-h <Boolean> display the help message Default value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,-MAX_FILE_HANDLES <Integer>\n"
|
||||||
|
" Maximum number of file handles to keep open when spilling read ends to disk. Set this\n"
|
||||||
|
" number a little lower than the per-process maximum number of file that may be open. This\n"
|
||||||
|
" number can be found by executing the 'ulimit -n' command on a Unix system. Default value:\n"
|
||||||
|
" 8000.\n"
|
||||||
|
"\n"
|
||||||
|
"--MAX_OPTICAL_DUPLICATE_SET_SIZE <Long>\n"
|
||||||
|
" This number is the maximum size of a set of duplicate reads for which we will attempt to\n"
|
||||||
|
" determine which are optical duplicates. Please be aware that if you raise this value too\n"
|
||||||
|
" high and do encounter a very large set of duplicate reads, it will severely affect the\n"
|
||||||
|
" runtime of this tool. To completely disable this check, set the value to -1. Default\n"
|
||||||
|
" value: 300000.\n"
|
||||||
|
"\n"
|
||||||
|
"--MAX_RECORDS_IN_RAM <Integer>When writing files that need to be sorted, this will specify the number of records stored\n"
|
||||||
|
" in RAM before spilling to disk. Increasing this number reduces the number of file handles\n"
|
||||||
|
" needed to sort the file, and increases the amount of RAM needed. Default value: 500000.\n"
|
||||||
|
"\n"
|
||||||
|
"--MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,-MAX_SEQS <Integer>\n"
|
||||||
|
" This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000.\n"
|
||||||
|
"\n"
|
||||||
|
"--MOLECULAR_IDENTIFIER_TAG <String>\n"
|
||||||
|
" SAM tag to uniquely identify the molecule from which a read was derived. Use of this\n"
|
||||||
|
" option requires that the BARCODE_TAG option be set to a non null value. Default null.\n"
|
||||||
|
" Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--OPTICAL_DUPLICATE_PIXEL_DISTANCE <Integer>\n"
|
||||||
|
" The maximum offset between two duplicate clusters in order to consider them optical\n"
|
||||||
|
" duplicates. The default is appropriate for unpatterned versions of the Illumina platform.\n"
|
||||||
|
" For the patterned flowcell models, 2500 is moreappropriate. For other platforms and\n"
|
||||||
|
" models, users should experiment to find what works best. Default value: 100.\n"
|
||||||
|
"\n"
|
||||||
|
"--PROGRAM_GROUP_COMMAND_LINE,-PG_COMMAND <String>\n"
|
||||||
|
" Value of CL tag of PG record to be created. If not supplied the command line will be\n"
|
||||||
|
" detected automatically. Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--PROGRAM_GROUP_NAME,-PG_NAME <String>\n"
|
||||||
|
" Value of PN tag of PG record to be created. Default value: MarkDuplicates.\n"
|
||||||
|
"\n"
|
||||||
|
"--PROGRAM_GROUP_VERSION,-PG_VERSION <String>\n"
|
||||||
|
" Value of VN tag of PG record to be created. If not specified, the version will be detected\n"
|
||||||
|
" automatically. Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--PROGRAM_RECORD_ID,-PG <String>\n"
|
||||||
|
" The program record ID for the @PG record(s) created by this program. Set to null to\n"
|
||||||
|
" disable PG record creation. This string may have a suffix appended to avoid collision\n"
|
||||||
|
" with other program record IDs. Default value: MarkDuplicates.\n"
|
||||||
|
"\n"
|
||||||
|
"--QUIET <Boolean> Whether to suppress job-summary info on System.err. Default value: false. Possible\n"
|
||||||
|
" values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--READ_NAME_REGEX <String> MarkDuplicates can use the tile and cluster positions to estimate the rate of optical\n"
|
||||||
|
" duplication in addition to the dominant source of duplication, PCR, to provide a more\n"
|
||||||
|
" accurate estimation of library size. By default (with no READ_NAME_REGEX specified),\n"
|
||||||
|
" MarkDuplicates will attempt to extract coordinates using a split on ':' (see Note below).\n"
|
||||||
|
" Set READ_NAME_REGEX to 'null' to disable optical duplicate detection. Note that without\n"
|
||||||
|
" optical duplicate counts, library size estimation will be less accurate. If the read name\n"
|
||||||
|
" does not follow a standard Illumina colon-separation convention, but does contain tile and\n"
|
||||||
|
" x,y coordinates, a regular expression can be specified to extract three variables:\n"
|
||||||
|
" tile/region, x coordinate and y coordinate from a read name. The regular expression must\n"
|
||||||
|
" contain three capture groups for the three variables, in order. It must match the entire\n"
|
||||||
|
" read name. e.g. if field names were separated by semi-colon (';') this example regex\n"
|
||||||
|
" could be specified (?:.*;)?([0-9]+)[^;]*;([0-9]+)[^;]*;([0-9]+)[^;]*$ Note that if no\n"
|
||||||
|
" READ_NAME_REGEX is specified, the read name is split on ':'. For 5 element names, the\n"
|
||||||
|
" 3rd, 4th and 5th elements are assumed to be tile, x and y values. For 7 element names\n"
|
||||||
|
" (CASAVA 1.8), the 5th, 6th, and 7th elements are assumed to be tile, x and y values.\n"
|
||||||
|
" Default value: <optimized capture of last three ':' separated fields as numeric values>.\n"
|
||||||
|
"\n"
|
||||||
|
"--READ_ONE_BARCODE_TAG <String>\n"
|
||||||
|
" Read one barcode SAM tag (ex. BX for 10X Genomics) Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--READ_TWO_BARCODE_TAG <String>\n"
|
||||||
|
" Read two barcode SAM tag (ex. BX for 10X Genomics) Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--REFERENCE_SEQUENCE,-R <File>Reference sequence file. Default value: null.\n"
|
||||||
|
"\n"
|
||||||
|
"--REMOVE_DUPLICATES <Boolean> If true do not write duplicates to the output file instead of writing them with\n"
|
||||||
|
" appropriate flags set. Default value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--REMOVE_SEQUENCING_DUPLICATES <Boolean>\n"
|
||||||
|
" If true remove 'optical' duplicates and other duplicates that appear to have arisen from\n"
|
||||||
|
" the sequencing process instead of the library preparation process, even if\n"
|
||||||
|
" REMOVE_DUPLICATES is false. If REMOVE_DUPLICATES is true, all duplicates are removed and\n"
|
||||||
|
" this option is ignored. Default value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--SORTING_COLLECTION_SIZE_RATIO <Double>\n"
|
||||||
|
" This number, plus the maximum RAM available to the JVM, determine the memory footprint\n"
|
||||||
|
" used by some of the sorting collections. If you are running out of memory, try reducing\n"
|
||||||
|
" this number. Default value: 0.25.\n"
|
||||||
|
"\n"
|
||||||
|
"--TAG_DUPLICATE_SET_MEMBERS <Boolean>\n"
|
||||||
|
" If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG\n"
|
||||||
|
" (DS), indicates the size of the duplicate set. The smallest possible DS value is 2 which\n"
|
||||||
|
" occurs when two reads map to the same portion of the reference only one of which is marked\n"
|
||||||
|
" as duplicate. The second tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier\n"
|
||||||
|
" for the duplicate set to which the record belongs. This identifier is the index-in-file of\n"
|
||||||
|
" the representative read that was selected out of the duplicate set. Default value: false.\n"
|
||||||
|
" Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--TAGGING_POLICY <DuplicateTaggingPolicy>\n"
|
||||||
|
" Determines how duplicate types are recorded in the DT optional attribute. Default value:\n"
|
||||||
|
" DontTag. Possible values: {DontTag, OpticalOnly, All}\n"
|
||||||
|
"\n"
|
||||||
|
"--TMP_DIR <File> One or more directories with space available to be used by this program for temporary\n"
|
||||||
|
" storage of working files This argument may be specified 0 or more times. Default value:\n"
|
||||||
|
" null.\n"
|
||||||
|
"\n"
|
||||||
|
"--UNPAIRED_END_UNCERTAINTY <Integer>\n"
|
||||||
|
" Maximal difference of the read end position that counted as equal. Useful for flow based\n"
|
||||||
|
" reads where the end position might vary due to sequencing errors. (for this argument,\n"
|
||||||
|
" \" read end \" means 3' end) Default value: 0.\n"
|
||||||
|
"\n"
|
||||||
|
"--USE_END_IN_UNPAIRED_READS <Boolean>\n"
|
||||||
|
" Make the end location of single end read be significant when considering duplicates, in\n"
|
||||||
|
" addition to the start location, which is always significant (i.e. require single-ended\n"
|
||||||
|
" reads to start andend on the same position to be considered duplicate) (for this argument,\n"
|
||||||
|
" \" read end \" means 3' end). Default value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--USE_JDK_DEFLATER,-use_jdk_deflater <Boolean>\n"
|
||||||
|
" Use the JDK Deflater instead of the Intel Deflater for writing compressed output Default\n"
|
||||||
|
" value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--USE_JDK_INFLATER,-use_jdk_inflater <Boolean>\n"
|
||||||
|
" Use the JDK Inflater instead of the Intel Inflater for reading compressed input Default\n"
|
||||||
|
" value: false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--USE_UNPAIRED_CLIPPED_END <Boolean>\n"
|
||||||
|
" Use position of the clipping as the end position, when considering duplicates (or use the\n"
|
||||||
|
" unclipped end position) (for this argument, \" read end \" means 3' end). Default value:\n"
|
||||||
|
" false. Possible values: {true, false}\n"
|
||||||
|
"\n"
|
||||||
|
"--VALIDATION_STRINGENCY <ValidationStringency>\n"
|
||||||
|
" Validation stringency for all SAM files read by this program. Setting stringency to\n"
|
||||||
|
" SILENT can improve performance when processing a BAM file in which variable-length data\n"
|
||||||
|
" (read, qualities, tags) do not otherwise need to be decoded. Default value: STRICT.\n"
|
||||||
|
" Possible values: {STRICT, LENIENT, SILENT}\n"
|
||||||
|
"\n"
|
||||||
|
"--VERBOSITY <LogLevel> Control verbosity of logging. Default value: INFO. Possible values: {ERROR, WARNING,\n"
|
||||||
|
" INFO, DEBUG}\n"
|
||||||
|
"\n"
|
||||||
|
"--version <Boolean> display the version number for this tool Default value: false. Possible values: {true,\n"
|
||||||
|
" false}\n"
|
||||||
|
"\n");
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,315 @@
|
||||||
|
/*
    Description: Parameters used by the MarkDuplicates module

    Copyright  : All rights reserved by ICT

    Author     : Zhang Zhonghai
    Date       : 2023/10/23
*/
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
#define MARKDUPLICATE_VERSION "v0.1"
|
||||||
|
|
||||||
|
class GlobalArg;
|
||||||
|
|
||||||
|
namespace ns_md {
/* Option identifiers for the MarkDuplicates module; this enum supplies the
 * long-option values used by getopt-style argument parsing. */
enum MarkDupsArgEnum
{
    _START_NUM = 100,  // sentinel: values start above 100 so they never clash with short-option chars
    MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP,
    MAX_FILE_HANDLES_FOR_READ_ENDS_MAP,
    SORTING_COLLECTION_SIZE_RATIO,
    BARCODE_TAG,
    READ_ONE_BARCODE_TAG,
    READ_TWO_BARCODE_TAG,
    TAG_DUPLICATE_SET_MEMBERS,
    REMOVE_SEQUENCING_DUPLICATES,
    TAGGING_POLICY,
    CLEAR_DT,
    DUPLEX_UMI,
    MOLECULAR_IDENTIFIER_TAG,
    METRICS_FILE,
    REMOVE_DUPLICATES,
    ASSUME_SORTED,
    ASSUME_SORT_ORDER,
    DUPLICATE_SCORING_STRATEGY,
    PROGRAM_RECORD_ID,
    PROGRAM_GROUP_VERSION,
    PROGRAM_GROUP_COMMAND_LINE,
    PROGRAM_GROUP_NAME,
    COMMENT,
    READ_NAME_REGEX,
    OPTICAL_DUPLICATE_PIXEL_DISTANCE,
    MAX_OPTICAL_DUPLICATE_SET_SIZE,
    QUIET,
    VALIDATION_STRINGENCY,
    COMPRESSION_LEVEL,
    MAX_RECORDS_IN_RAM,
    CREATE_INDEX,
    CREATE_MD5_FILE,
    _END_NUM  // sentinel: one past the last option value (used to derive ARG_COUNT)
};

/* How strict to be when reading a SAM or BAM, beyond bare minimum validation. */
enum ValidationStringency
{
    /**
     * Do the right thing, throw an exception if something looks wrong.
     */
    STRICT,
    /**
     * Emit warnings but keep going if possible.
     */
    LENIENT,
    /**
     * Like LENIENT, only don't emit warning messages.
     */
    SILENT,

    DEFAULT_STRINGENCY = SILENT
};

/**
 * Enum used to control how duplicates are flagged in the DT optional tag on each read.
 */
enum DuplicateTaggingPolicy
{
    DontTag,
    OpticalOnly,
    All
};

/* Sort order of a SAM/BAM file. */
enum SortOrder
{
    unsorted,
    queryname,
    coordinate,
    duplicate, // NB: this is not in the SAM spec!
    unknown
};

/* Strategy used to score reads, i.e. to decide which read of a duplicate
 * set scores higher and is kept as the representative. */
enum ScoringStrategy
{
    SUM_OF_BASE_QUALITIES,
    TOTAL_MAPPED_REFERENCE_LENGTH,
    RANDOM
};
}
|
||||||
|
|
||||||
|
// Per-thread working variables.
struct AuxVar {
    // NOTE(review): the meanings below are inferred from the names — confirm against usage.
    // Minimum base-quality score counted when summing qualities; presumably used by
    // SUM_OF_BASE_QUALITIES scoring.
    static constexpr int MIN_QSUM_QSCORE = 13;
    // Padding (in bases) added around a reference context window.
    static constexpr int REF_CONTEXT_PAD = 3;
    // Half-width (in bases) of the reference window.
    static constexpr int REFERENCE_HALF_WINDOW_LENGTH = 150;

    // Fraction of alternate-allele observations attributed to contamination.
    // Brace-initialized so a default-constructed AuxVar holds a defined value
    // instead of an indeterminate one (the original left it uninitialized).
    double contaminantAlternateFraction{0.0};
};
|
||||||
|
|
||||||
|
/* Parameters required by MarkDuplicates.
 * Field names and defaults mirror the Picard MarkDuplicates command-line options;
 * empty strings stand in for Java's "null" (unset). */
struct MarkDupsArg
{
    /** The optional attribute in SAM/BAM/CRAM files used to store the duplicate type. */
    string DUPLICATE_TYPE_TAG = "DT";
    /** The duplicate type tag value for duplicate type: library. */
    string DUPLICATE_TYPE_LIBRARY = "LB";
    /** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */
    string DUPLICATE_TYPE_SEQUENCING = "SQ";
    /** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */
    string DUPLICATE_SET_INDEX_TAG = "DI";
    /** The attribute in the SAM/BAM file used to store the size of a duplicate set */
    string DUPLICATE_SET_SIZE_TAG = "DS";

    /* Defaults used by the optical-duplicate finder. */
    int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
    int DEFAULT_BIG_DUPLICATE_SET_SIZE = 1000;
    int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000; // larger than this number will generate over 100 billion comparisons in the n^2 algorithm below

    /**
     * If more than this many sequences in SAM file, don't spill to disk because there will not
     * be enough file handles.
     */

    /* This option is obsolete. ReadEnds will always be spilled to disk. */
    int MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = 50000;

    /* Maximum number of file handles to keep open when spilling read ends to disk.
     * Set this a little lower than the per-process open-file limit
     * (found with 'ulimit -n' on a Unix system). */
    int MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = 8000;

    /* This number, plus the maximum RAM available, determines the memory footprint used by
     * some of the sorting collections. If you are running out of memory, try reducing it. */
    double SORTING_COLLECTION_SIZE_RATIO = 0.25;

    /* Barcode SAM tag (ex. BC for 10X Genomics). Optional; empty means unset. */
    string BARCODE_TAG = "";

    /* Read one barcode SAM tag (ex. BX for 10X Genomics). Optional; empty means unset. */
    string READ_ONE_BARCODE_TAG = "";

    /* Read two barcode SAM tag (ex. BX for 10X Genomics). Optional; empty means unset. */
    string READ_TWO_BARCODE_TAG = "";

    /* If true, add two tags to every read that appears in a duplicate set:
     * DUPLICATE_SET_SIZE_TAG (DS), the size of the duplicate set (smallest possible value is 2,
     * when two reads map to the same portion of the reference and only one is marked duplicate),
     * and DUPLICATE_SET_INDEX_TAG (DI), a unique identifier for the duplicate set — the
     * index-in-file of the representative read selected out of the set. Optional. */
    bool TAG_DUPLICATE_SET_MEMBERS = false;

    /* If true remove 'optical' duplicates and other duplicates that appear to have arisen from
     * the sequencing process instead of the library preparation process, even if
     * REMOVE_DUPLICATES is false. If REMOVE_DUPLICATES is true, all duplicates are removed and
     * this option is ignored. */
    bool REMOVE_SEQUENCING_DUPLICATES = false;

    /* Determines how duplicate types are recorded in the DT optional attribute. */
    ns_md::DuplicateTaggingPolicy TAGGING_POLICY = ns_md::DuplicateTaggingPolicy::DontTag;

    /* Clear the DT tag from input SAM records. Should be set to false if the input SAM
     * doesn't have this tag. Default true. */
    bool CLEAR_DT = true;

    /* Treat UMIs as duplex stranded. Requires the UMI to consist of two equal-length strings
     * separated by a hyphen (e.g. 'ATC-GTC'). Reads are considered duplicates if, in addition
     * to the standard definition, they have identical normalized UMIs. A 'bottom'-strand UMI is
     * normalized by swapping its content around the hyphen (ATC-GTC becomes GTC-ATC); a
     * 'top'-strand UMI is already normalized. Both reads of a pair are considered top strand if
     * the read 1 unclipped 5' coordinate is less than the read 2 unclipped 5' coordinate; all
     * chimeric reads and fragments are treated as top strand. Requires BARCODE_TAG to hold
     * non-normalized UMIs. Default false. */
    bool DUPLEX_UMI = false;

    /* SAM tag that uniquely identifies the molecule from which a read was derived.
     * Requires BARCODE_TAG to be set. Optional; empty means unset. */
    string MOLECULAR_IDENTIFIER_TAG = "";

    /* Parameters inherited from AbstractMarkDuplicatesCommandLineProgram. */
    /* File to write duplication metrics to. */
    string METRICS_FILE;

    /* If true do not write duplicates to the output file instead of writing them with
     * appropriate flags set. */
    bool REMOVE_DUPLICATES = false;

    /* If true, assume that the input file is coordinate sorted even if the header says
     * otherwise. Deprecated: use ASSUME_SORT_ORDER=coordinate instead.
     * Mutually exclusive with ASSUME_SORT_ORDER. */
    bool ASSUME_SORTED = false;

    /* If set, assume that the input file has this order even if the header says otherwise.
     * Optional; mutually exclusive with ASSUME_SORTED. */
    ns_md::SortOrder ASSUME_SORT_ORDER = ns_md::SortOrder::unsorted;

    /* The scoring strategy for choosing the non-duplicate among candidates. */
    ns_md::ScoringStrategy DUPLICATE_SCORING_STRATEGY = ns_md::ScoringStrategy::TOTAL_MAPPED_REFERENCE_LENGTH;

    /* The program record ID for the @PG record(s) created by this program. Leave unset to
     * disable PG record creation. This string may have a suffix appended to avoid collision
     * with other program record IDs. Optional. */
    string PROGRAM_RECORD_ID = "MarkDuplicates";

    /* Value of the VN tag of the PG record to be created. If not specified, the version is
     * detected automatically. Optional. */
    string PROGRAM_GROUP_VERSION;

    /* Value of the CL tag of the PG record to be created. If not supplied, the command line
     * is detected automatically. Optional. */
    string PROGRAM_GROUP_COMMAND_LINE;

    /* Value of the PN tag of the PG record to be created. */
    string PROGRAM_GROUP_NAME = "MarkDuplicates";

    /* Comment(s) to include in the output file's header. Optional, repeatable. */
    vector<string> COMMENT;

    /* Parameters inherited from AbstractOpticalDuplicateFinderCommandLineProgram. */

    /* Regular expression used to extract three variables from a read name: tile/region,
     * x coordinate and y coordinate (three capture groups, in that order; it must match the
     * entire read name). MarkDuplicates uses tile and cluster positions to estimate the rate
     * of optical duplication, in addition to the dominant source of duplication (PCR), for a
     * more accurate library size estimate. Set to 'null' to disable optical duplicate
     * detection (library size estimation then becomes less accurate). If no READ_NAME_REGEX
     * is specified the read name is split on ':' — for 5-element names, elements 3-5 are
     * tile, x and y; for 7-element names (CASAVA 1.8), elements 5-7 are tile, x and y.
     * Optional. */
    string READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";

    /* The maximum offset between two duplicate clusters in order to consider them optical
     * duplicates. The default is appropriate for unpatterned versions of the Illumina
     * platform; for the patterned flowcell models, 2500 is more appropriate. For other
     * platforms and models, users should experiment to find what works best. */
    int OPTICAL_DUPLICATE_PIXEL_DISTANCE = DEFAULT_OPTICAL_DUPLICATE_DISTANCE;

    /* Maximum size of a set of duplicate reads for which we attempt to determine which are
     * optical duplicates. Raising this too high can severely affect runtime when a very
     * large duplicate set is encountered; set to -1 to disable the check entirely. */
    long MAX_OPTICAL_DUPLICATE_SET_SIZE = DEFAULT_MAX_DUPLICATE_SET_SIZE;

    /* Parameters inherited from CommandLineProgram. */

    /* Whether to suppress job-summary info on stderr. */
    bool QUIET = false;

    /* Validation stringency for all SAM files read by this program. Setting stringency to
     * SILENT can improve performance when processing a BAM file in which variable-length
     * data (read, qualities, tags) do not otherwise need to be decoded. */
    ns_md::ValidationStringency VALIDATION_STRINGENCY = ns_md::ValidationStringency::DEFAULT_STRINGENCY;

    /* Compression level for all compressed files created (e.g. BAM). */
    int COMPRESSION_LEVEL = 5;

    /* When writing files that need to be sorted, the number of records stored in RAM before
     * spilling to disk. Increasing this reduces the number of file handles needed to sort
     * the file and increases the amount of RAM needed. Optional. */
    int MAX_RECORDS_IN_RAM = 500000;

    /* Whether to create an index when writing coordinate-sorted BAM output. */
    bool CREATE_INDEX = false;

    /* Whether to create an MD5 digest for any BAM or FASTQ files created. */
    bool CREATE_MD5_FILE = false;

    // Number of MarkDuplicates options, derived from the enum sentinels.
    const static int ARG_COUNT = ns_md::MarkDupsArgEnum::_END_NUM - ns_md::MarkDupsArgEnum::_START_NUM - 1;

    // Parse the command line into this struct and the auxiliary/global structures.
    void parseArgument(int argc,
                       char **argv,
                       vector<AuxVar> *pvAuxVar,
                       GlobalArg *pGArg);

    // Print the usage/help text.
    static void PrintHelp();

    // Print the tool version.
    static void PrintVersion();

    // Release resources, close files, etc.
    static void Finalize(MarkDupsArg *pMdArg,
                         vector<AuxVar> *pvAuxVar,
                         GlobalArg *pGArg);
};
|
||||||
Loading…
Reference in New Issue