// FastBQSR/src/bqsr/bqsr_args.h
//
// Note: the repository viewer flagged this file as containing ambiguous
// Unicode characters that may be confused with others in some locales.

/*
Description: BQSR
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2025/10/10
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
using std::string;
using std::vector;
namespace nsbqsr {

/**
 * Index format choices; used as the type of BQSRArg::INDEX_FORMAT.
 * Kept as an unscoped enum, so the enumerators are reachable both as
 * nsbqsr::BAI and as nsbqsr::IndexFormat::BAI.
 */
enum IndexFormat {
    BAI = 0,
    CSI = 1
};

}  // namespace nsbqsr
/* bqsr parameters */
/**
 * Plain aggregate holding the BQSR parameters.
 *
 * Fields with an initializer carry their default value; fields without one are
 * default-constructed (empty string / empty vector). No validation is performed
 * here: range constraints mentioned in the comments below are not enforced by
 * this struct.
 *
 * Many option descriptions are carried over from the GATK documentation.
 */
struct BQSRArg {
    // ------------------------------------------------------------------
    // common parameters
    // ------------------------------------------------------------------
    string INPUT_FILE;   // input bam filename
    string OUTPUT_FILE;  // output bam filename
    int NUM_THREADS = 1;
    std::size_t MAX_MEM = std::size_t{1} << 30;  // 1G (2^30); presumably bytes - TODO confirm
    bool DUPLEX_IO = true;  // NOTE(review): purpose is not documented in this header
    /* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
    bool CREATE_INDEX = true;
    nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI;
    /* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection) */
    bool ADD_PG_TAG_TO_READS = true;
    // NOTE(review): presumably the command-line string of this run - confirm against the caller
    string CLI_STR;
    // NOTE(review): presumably the start time of this run, as text - confirm against the caller
    string START_TIME;
    string PROGRAM_RECORD_ID = "FastBQSR";
    // reference file
    string REFERENCE_FILE;
    // known sites vcf files
    vector<string> KNOWN_SITES_VCFS;
    // end of common parameters

    // We always use the same covariates. The field is retained for compatibility with GATK3 reports.
    bool DO_NOT_USE_STANDARD_COVARIATES = false;
    // Running BQSR without known sites makes no sense, so this is not offered as an option.
    // The field itself is still present (default false); presumably kept for report
    // compatibility like its neighbours - TODO confirm.
    bool RUN_WITHOUT_DBSNP = false;
    // We don't support SOLID. The fields are retained for compatibility with GATK3 reports.
    string SOLID_RECAL_MODE = "SET_Q_ZERO";
    string SOLID_NOCALL_STRATEGY = "THROW_EXCEPTION";
    // GATK: @Hidden @Argument(fullName = "default-platform", optional = true)
    // "If a read has no platform then default to the provided String.
    //  Valid options are illumina, 454, and solid."
    string DEFAULT_PLATFORM = "";
    // GATK: @Hidden @Argument(fullName = "force-platform", optional = true)
    // "If provided, the platform of EVERY read will be forced to be the provided String.
    //  Valid options are illumina, 454, and solid."
    string FORCE_PLATFORM = "";
    // NOTE(review): undocumented here; by its name, presumably the path of an existing
    // recalibration report - confirm against the caller.
    string existingRecalibrationReport = "";

    /**
     * The context covariate will use a context of this size to calculate its covariate value for base mismatches.
     * Must be between 1 and 13 (inclusive). The GATK documentation notes that higher values increase runtime and
     * required (Java) heap size.
     */
    int MISMATCHES_CONTEXT_SIZE = 2;
    /**
     * The context covariate will use a context of this size to calculate its covariate value for base insertions
     * and deletions. Must be between 1 and 13 (inclusive). The GATK documentation notes that higher values increase
     * runtime and required (Java) heap size.
     */
    int INDELS_CONTEXT_SIZE = 3;
    /**
     * The cycle covariate will generate an error if it encounters a cycle greater than this value.
     * This argument is ignored if the Cycle covariate is not used.
     */
    int MAXIMUM_CYCLE_VALUE = 500;
    /**
     * A default base quality to use as a prior (reported quality) in the mismatch covariate model. This value
     * replaces all base qualities in the read. A negative value turns it off. [default is off]
     */
    std::int8_t MISMATCHES_DEFAULT_QUALITY = -1;
    /**
     * A default base quality to use as a prior (reported quality) in the insertion covariate model. This parameter
     * is used for all reads without insertion quality scores for each base. [default is on]
     */
    std::int8_t INSERTIONS_DEFAULT_QUALITY = 45;
    /**
     * Default base quality for deletions. [default is on]
     * NOTE(review): the description previously attached to this field was a copy of the mismatch option's text.
     * By analogy with INSERTIONS_DEFAULT_QUALITY this is presumably the prior (reported quality) used by the
     * deletion covariate model for reads without deletion quality scores - confirm against the GATK documentation.
     */
    std::int8_t DELETIONS_DEFAULT_QUALITY = 45;
    /**
     * Reads with low quality bases on either tail (beginning or end) will not be considered in the context.
     * This parameter defines the quality below which (inclusive) a tail is considered low quality.
     */
    std::int8_t LOW_QUAL_TAIL = 2;
    /**
     * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize
     * the base qualities; that is done by the engine with the -qq or -bqsr options. This parameter tells BQSR the
     * number of levels of quantization to use to build the quantization table.
     */
    int QUANTIZING_LEVELS = 16;
    /**
     * The tag name for the binary tag covariate (if using it).
     */
    string BINARY_TAG_NAME = "";
    /**
     * bqsr-baq-gap-open-penalty: BQSR BAQ gap open penalty (Phred scaled). Default value is 40.
     * 30 is perhaps better for whole genome call sets.
     */
    double BAQGOP = 40;
    /**
     * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out
     * unmodified in the recalibrated BAM file. In general it's unsafe to change quality scores below 6, since base
     * callers use these values to indicate random or bad bases. For example, Illumina writes Q2 bases when the
     * machine has really gone wrong. This would be fine in and of itself, but when you select a subset of these
     * reads based on their ability to align to the reference and their dinucleotide effect, your Q2 bin can be
     * elevated to Q8 or Q10, leading to issues downstream.
     */
    int PRESERVE_QSCORES_LESS_THAN = 6;
    /**
     * enable-baq: do BAQ (base alignment quality) correction.
     * This option is hidden in GATK and is no longer used.
     */
    bool enableBAQ = false;
    /**
     * compute-indel-bqsr-tables: compute indel BQSR tables.
     */
    bool computeIndelBQSRTables = false;

    // ------------------------------------------------------------------
    // quality encoding checking arguments
    // ------------------------------------------------------------------
    /**
     * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration),
     * which are stored in the OQ tag, if they are present, rather than the post-recalibration quality scores.
     * If no OQ tag is present for a read, the standard qual score will be used.
     */
    bool useOriginalBaseQualities = false;
    /**
     * If reads are missing some or all base quality scores, this value will be used for all base quality scores.
     * By default this is set to -1 to disable default base quality assignment.
     */
    std::int8_t defaultBaseQualities = -1;
};