实现了apply bqsr的单线程版本,结果还有点错误,继续调试
This commit is contained in:
parent
985875ebac
commit
94e06338cd
|
|
@ -6,5 +6,5 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|||
# set(CMAKE_BUILD_TYPE Debug)
|
||||
# set(CMAKE_BUILD_TYPE Release)
|
||||
add_definitions(-DSHOW_PERF)
|
||||
#add_definitions(-DSHOW_PERF=1)
|
||||
# add_definitions(-DSHOW_PERF=1)
|
||||
add_subdirectory(src)
|
||||
|
|
|
|||
|
|
@ -2,4 +2,6 @@
|
|||
|
||||
对GATK bqsr部分进行并行优化
|
||||
|
||||
对应的GATK版本:The Genome Analysis Toolkit (GATK) v4.6.2.0-17-g2a1f41b-SNAPSHOT
|
||||
|
||||
Implement the same functions as GATK for BaseRecalibrator and ApplyBQSR
|
||||
|
|
@ -0,0 +1,336 @@
|
|||
// 一. 读取并解析recalibrate tables
|
||||
|
||||
// 二. 循环处理每一个read
|
||||
|
||||
// 1. 计算read的协变量covs
|
||||
|
||||
// 2. 根据read的协变量信息从recal tables获取对应的数据
|
||||
|
||||
// 3. 根据recal tables数据对read的每个碱基分别计算新的质量分数
|
||||
|
||||
// 4. 将计算后的质量分数赋值给read
|
||||
|
||||
#include <header.h> // in htslib
|
||||
#include <htslib/sam.h>
|
||||
#include <htslib/thread_pool.h>
|
||||
|
||||
#include "aux_arg.h"
|
||||
#include "common_data.h"
|
||||
#include "util/debug.h"
|
||||
#include "util/profiling.h"
|
||||
#include "recal_utils.h"
|
||||
#include "recal_funcs.h"
|
||||
#include "read_utils.h"
|
||||
|
||||
using std::vector;
|
||||
|
||||
namespace nsgv {
|
||||
// 全局变量 for apply bqsr
|
||||
samFile* gOutBamFp; // 输出文件, sam或者bam格式
|
||||
sam_hdr_t* gOutBamHeader; // 输出文件的header
|
||||
RecalTables gRecalTables; // 保留一个全局的recalibrate tables就行了
|
||||
vector<uint8_t> gQuantizedQuals; // 读取quantized info table信息得到的,第三列数据
|
||||
StableArray<bam1_t*> gRecalBams[2]; // 保存已经校正过质量分数的bam数据,双buffer
|
||||
}; // namespace nsgv
|
||||
|
||||
// 串行apply bqsr
|
||||
int SerialApplyBQSR(AuxVar &aux) {
|
||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::applyBqsrReadFilterOut);
|
||||
int64_t readNumSum = 0;
|
||||
int round = 0;
|
||||
|
||||
PerReadCovariateMatrix& readCovariates = aux.readCovariates;
|
||||
// 一. 读取并解析recalibrate tables
|
||||
auto& recalTables = *aux.bqsrTable;
|
||||
|
||||
// 全局的校正后的bam数组
|
||||
auto& recalBams = nsgv::gRecalBams[0];
|
||||
|
||||
auto& sd = aux.sd;
|
||||
|
||||
while (true) {
|
||||
++round;
|
||||
// 一. 读取bam数据
|
||||
size_t readNum = 0;
|
||||
PROF_START(GP_read);
|
||||
if (inBamBuf.ReadStat() >= 0)
|
||||
readNum = inBamBuf.ReadBam();
|
||||
PROF_GP_END(GP_read);
|
||||
if (readNum < 1) {
|
||||
break;
|
||||
}
|
||||
auto bams = inBamBuf.GetBamArr();
|
||||
spdlog::info("{} reads processed in {} round", readNum, round);
|
||||
|
||||
if (recalBams.size() < bams.size()) {
|
||||
int start = recalBams.size();
|
||||
recalBams.resize(bams.size());
|
||||
for (int i = start; i < recalBams.size(); ++i) {
|
||||
recalBams[i] = bam_init1();
|
||||
}
|
||||
}
|
||||
|
||||
// 二. 遍历每个bam(read)记录,进行处理
|
||||
for (int i = 0; i < bams.size(); ++i) {
|
||||
if (bam_copy1(recalBams[i], bams[i]->b) == NULL) {
|
||||
spdlog::error("Copy bam error");
|
||||
exit(1);
|
||||
}
|
||||
BamWrap* bw = bams[i];
|
||||
bw->b = recalBams[i]; // 注意这里的赋值,然后就可以对b进行更改了
|
||||
sd.init();
|
||||
sd.parseForApplyBQSR(bw);
|
||||
sd.rid = i + aux.processedReads;
|
||||
|
||||
// 1. 是否使用original quality来代替当前的quality
|
||||
if (nsgv::gBqsrArg.useOriginalBaseQualities)
|
||||
ReadUtils::resetOriginalBaseQualities(bw->b);
|
||||
|
||||
// 2. 是否将当前的quality保存在tag OQ中
|
||||
if (nsgv::gBqsrArg.emitOriginalQuals)
|
||||
ReadUtils::setOriginalBaseQualsIfNoOQ(bw->b);
|
||||
|
||||
// 3. 计算read的协变量covs
|
||||
PROF_START(GP_covariate);
|
||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
|
||||
PROF_GP_END(GP_covariate);
|
||||
|
||||
// clear indel qualities
|
||||
ReadUtils::removeAttribute(bw->b, "BI");
|
||||
ReadUtils::removeAttribute(bw->b, "BD");
|
||||
|
||||
// 4. 检查read的readGroup tag是否包含在bqsr table里
|
||||
const char* readGroupId = ReadUtils::getReadGroupId(bw->b);
|
||||
auto& covaritesForRead = readCovariates[EventType::BASE_SUBSTITUTION.index];
|
||||
uint8_t* recalibratedQuals = bam_get_qual(bw->b);
|
||||
auto& preUpdateQuals = sd.base_quals;
|
||||
int rgKey = -1;
|
||||
if (ReadGroupCovariate::RgToId.find(std::string(readGroupId)) != ReadGroupCovariate::RgToId.end())
|
||||
rgKey = ReadGroupCovariate::RgToId[std::string(readGroupId)];
|
||||
if (rgKey == -1) {
|
||||
if (nsgv::gBqsrArg.allowMissingReadGroups) {
|
||||
// Given the way the recalibration code is implemented below, we cannot recalibrate a read with a
|
||||
// read group that's not in the recal table.
|
||||
for (int i = 0; i < sd.read_len; i++) {
|
||||
//recalibratedQuals[i] = staticQuantizedMapping != null ? staticQuantizedMapping[preUpdateQuals[i]] : quantizedQuals.get(preUpdateQuals[i]);
|
||||
recalibratedQuals[i] = nsgv::gQuantizedQuals[preUpdateQuals[i]];
|
||||
}
|
||||
} else {
|
||||
spdlog::error(
|
||||
"Read group {} not found in the recalibration table. Use \"--allow-missing-read-group\" command line argument to ignore this "
|
||||
"error.",
|
||||
readGroupId);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// 5. 根据recal tables数据对read的每个碱基分别计算新的质量分数
|
||||
auto& readGroupDatum = recalTables.readGroupTable(EventType::BASE_SUBSTITUTION.index, rgKey);
|
||||
// Note: this loop is under very heavy use in applyBQSR. Keep it slim.
|
||||
for (int offset = 0; offset < sd.read_len; offset++) { // recalibrate all bases in the read
|
||||
// only recalibrate usable qualities (default: >= 6) (the original quality will come from the instrument -- reported quality)
|
||||
if (recalibratedQuals[offset] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN) {
|
||||
continue;
|
||||
}
|
||||
auto& covs = readCovariates[EventType::BASE_SUBSTITUTION.index][offset];
|
||||
// 根据read的协变量数据获取对应的bqsr数据
|
||||
auto& qualityScoreDatum = recalTables.qualityScoreTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality);
|
||||
auto& contextDatum = recalTables.contextTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.context);
|
||||
auto& cycleDatum = recalTables.cycleTable(EventType::BASE_SUBSTITUTION.index, rgKey, covs.baseQuality, covs.cycle);
|
||||
// 计算校正后的质量分数
|
||||
double priorQualityScore =
|
||||
nsgv::gBqsrArg.globalQScorePrior > 0.0 ? nsgv::gBqsrArg.globalQScorePrior : readGroupDatum.getReportedQuality();
|
||||
double rawRecalibratedQualityScore =
|
||||
RecalFuncs::hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, contextDatum, cycleDatum);
|
||||
uint8_t qualIdx = RecalFuncs::getBoundedIntegerQual(rawRecalibratedQualityScore);
|
||||
uint8_t quantizedQualityScore = qualIdx; // nsgv::gQuantizedQuals.at(qualIdx);
|
||||
// TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be
|
||||
// quantized once.
|
||||
// recalibratedQuals[offset] = staticQuantizedMapping == null ? quantizedQualityScore : staticQuantizedMapping[quantizedQualityScore];
|
||||
recalibratedQuals[offset] = quantizedQualityScore;
|
||||
}
|
||||
|
||||
if (sam_write1(nsgv::gOutBamFp, nsgv::gOutBamHeader, bw->b) < 0) {
|
||||
spdlog::error("failed writing sam record to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE.c_str());
|
||||
sam_close(nsgv::gInBamFp);
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
exit(1);
|
||||
}
|
||||
//break;
|
||||
}
|
||||
// break;
|
||||
|
||||
readNumSum += readNum;
|
||||
AuxVar::processedReads += readNum;
|
||||
inBamBuf.ClearAll(); // 清空bam buf
|
||||
}
|
||||
|
||||
spdlog::info("read count: {}", readNumSum);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 并行 apply bqsr
|
||||
int ParallelApplyBQSR(vector<AuxVar> &auxArr) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 在进行数据处理之前,初始化一些全局数据
|
||||
static void globalInit() {
|
||||
open_debug_files();
|
||||
/* bam */
|
||||
nsgv::gInBamFp = sam_open_format(nsgv::gBqsrArg.INPUT_FILE.c_str(), "r", nullptr);
|
||||
if (!nsgv::gInBamFp) {
|
||||
spdlog::error("[{}] load sam/bam file failed.\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
nsgv::gInBamHeader = sam_hdr_read(nsgv::gInBamFp); // header
|
||||
|
||||
/* 并行读取bam数据 */
|
||||
htsThreadPool htsPoolRead = {NULL, 0}; //
|
||||
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
|
||||
htsPoolRead.pool = hts_tpool_init(readThreadNum);
|
||||
if (!htsPoolRead.pool) {
|
||||
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
exit(1);
|
||||
}
|
||||
hts_set_opt(nsgv::gInBamFp, HTS_OPT_THREAD_POOL, &htsPoolRead);
|
||||
|
||||
if (!nsgv::gInBamHeader->hrecs) {
|
||||
if (sam_hdr_fill_hrecs(nsgv::gInBamHeader) != 0) {
|
||||
spdlog::error("[{}] failed to read sam header", __LINE__);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
/* 并行写入bam文件 */
|
||||
char modeout[12] = "wb";
|
||||
sam_open_mode(modeout + 1, nsgv::gBqsrArg.OUTPUT_FILE.c_str(), NULL);
|
||||
nsgv::gOutBamFp = sam_open(nsgv::gBqsrArg.OUTPUT_FILE.c_str(), modeout);
|
||||
if (!nsgv::gOutBamFp) {
|
||||
spdlog::error("[{}] create output sam/bam file failed.\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
nsgv::gOutBamHeader = sam_hdr_dup(nsgv::gInBamHeader);
|
||||
// 修改输出文件的header
|
||||
sam_hdr_add_line(nsgv::gOutBamHeader, "PG", "ID", PROGRAM_NAME, "VN", FASTBQSR_VERSION, "CL", nsgv::gBqsrArg.CLI_STR.c_str(), NULL);
|
||||
// 用不同的线程数量处理输出文件
|
||||
htsThreadPool htsPoolWrite = {NULL, 0}; // 读写用不同的线程池
|
||||
int writeThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_WRITE_MAX_THREAD);
|
||||
htsPoolWrite.pool = hts_tpool_init(writeThreadNum);
|
||||
if (!htsPoolWrite.pool) {
|
||||
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
exit(1);
|
||||
}
|
||||
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
|
||||
hts_set_opt(nsgv::gOutBamFp, HTS_OPT_THREAD_POOL, &htsPoolWrite);
|
||||
|
||||
if (sam_hdr_write(nsgv::gOutBamFp, nsgv::gOutBamHeader) != 0) {
|
||||
spdlog::error("failed writing header to \"{}\"", nsgv::gBqsrArg.OUTPUT_FILE);
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// 输出index文件
|
||||
nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".bai"; // min_shift = 0 是bai格式
|
||||
if ("sam" == Utils::getFileExtension(nsgv::gBqsrArg.OUTPUT_FILE) || !nsgv::gBqsrArg.CREATE_INDEX) {
|
||||
nsgv::gBqsrArg.INDEX_FILE = "";
|
||||
}
|
||||
if (!nsgv::gBqsrArg.INDEX_FILE.empty()) {
|
||||
int index_min_shift = 0;
|
||||
if (nsgv::gBqsrArg.INDEX_FORMAT == nsbqsr::IndexFormat::CSI) {
|
||||
nsgv::gBqsrArg.INDEX_FILE = nsgv::gBqsrArg.OUTPUT_FILE + ".csi";
|
||||
index_min_shift = 14;
|
||||
}
|
||||
if (sam_idx_init(nsgv::gOutBamFp, nsgv::gOutBamHeader, 0 /*csi 14*/, nsgv::gBqsrArg.INDEX_FILE.c_str()) < 0) {
|
||||
spdlog::error("failed to open index \"{}\" for writing", nsgv::gBqsrArg.INDEX_FILE);
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// 1. 协变量数据相关初始化
|
||||
ContextCovariate::InitContextCovariate(nsgv::gBqsrArg);
|
||||
CycleCovariate::InitCycleCovariate(nsgv::gBqsrArg);
|
||||
|
||||
// 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue
|
||||
// nsgv::gRecalTables.init(nsgv::gInBamHeader->hrecs->nrg);
|
||||
|
||||
// 初始化AuxVar
|
||||
nsgv::gAuxVars.resize(nsgv::gBqsrArg.NUM_THREADS);
|
||||
for (int i = 0; i < nsgv::gBqsrArg.NUM_THREADS; ++i) {
|
||||
nsgv::gAuxVars[i].header = nsgv::gInBamHeader;
|
||||
nsgv::gAuxVars[i].faidx = fai_load(nsgv::gBqsrArg.REFERENCE_FILE.c_str());
|
||||
if (nsgv::gAuxVars[i].faidx == 0)
|
||||
error("[%s] fail to load the fasta index.\n", __func__);
|
||||
// 注意初始化顺序,这个必须在协变量初始化之后,因为需要用到MaximumKeyValue
|
||||
nsgv::gAuxVars[i].bqsrTable = &nsgv::gRecalTables;
|
||||
CovariateUtils::InitPerReadCovMat(nsgv::gAuxVars[i].readCovariates);
|
||||
}
|
||||
|
||||
// 0. 初始化一些全局数据
|
||||
RecalDatum::StaticInit();
|
||||
QualityUtils::StaticInit();
|
||||
MathUtils::StaticInit();
|
||||
BaseUtils::StaticInit();
|
||||
|
||||
// 初始化需要计算的event types
|
||||
RecalUtils::initEventTypes(nsgv::gBqsrArg.computeIndelBQSRTables);
|
||||
|
||||
// 2. 读取bam的read group
|
||||
// if (nsgv::gInBamHeader->hrecs->nrg == 0) {
|
||||
// spdlog::error("No RG tag found in the header!");
|
||||
// exit(1);
|
||||
// }
|
||||
|
||||
/*
|
||||
// 应该是从bqsr table里读取read group信息
|
||||
for (int i = 0; i < nsgv::gInBamHeader->hrecs->nrg; ++i) {
|
||||
// spdlog::info("rg: {}", nsgv::gInBamHeader->hrecs->rg[i].name);
|
||||
ReadGroupCovariate::RgToId[nsgv::gInBamHeader->hrecs->rg[i].name] = i;
|
||||
ReadGroupCovariate::IdToRg[i] = nsgv::gInBamHeader->hrecs->rg[i].name;
|
||||
}
|
||||
*/
|
||||
|
||||
// 读取并解析recalibrate tables
|
||||
RecalUtils::readRecalTables(nsgv::gBqsrArg.BQSR_FILE, nsgv::gBqsrArg, nsgv::gQuantizedQuals, nsgv::gRecalTables);
|
||||
}
|
||||
|
||||
// 全局资源释放
|
||||
static void globalDestroy() {
|
||||
close_debug_files();
|
||||
|
||||
if (!nsgv::gBqsrArg.INDEX_FILE.empty() && sam_idx_save(nsgv::gOutBamFp) < 0) {
|
||||
spdlog::error("writing index failed");
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* 关闭文件,收尾清理 */
|
||||
sam_close(nsgv::gOutBamFp);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
}
|
||||
|
||||
// entrance of BQSR phase-1
|
||||
int ApplyBQSR() {
|
||||
int ret = 0;
|
||||
|
||||
PROF_START(GP_whole_process);
|
||||
globalInit();
|
||||
// if (nsgv::gBqsrArg.NUM_THREADS == 1)
|
||||
ret = SerialApplyBQSR(nsgv::gAuxVars[0]); // 串行处理数据,生成recal bams
|
||||
// else
|
||||
// ret = ParallelApplyBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal bams
|
||||
globalDestroy();
|
||||
PROF_GP_END(GP_whole_process);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -43,11 +43,14 @@ struct AuxVar {
|
|||
vector<VCFParser> vcfArr; // 从vcf中获取已知位点
|
||||
|
||||
PerReadCovariateMatrix readCovariates; // 每个read对应的协变量矩阵
|
||||
RecalTables recalTables; // 每个线程对应一个recal tables
|
||||
RecalTables recalTables; // 每个线程对应一个recal tables,计算table的时候需要每个线程都有一个,输出bam的时候不需要,因为只读取,不写入
|
||||
|
||||
SamData sd;
|
||||
StableArray<int> isSNP, isIns, isDel; // 该位置是否是SNP, indel位置,0不是,1是
|
||||
StableArray<uint8_t> baqArray;
|
||||
StableArray<double> snpErrors, insErrors, delErrors;
|
||||
StableArray<uint8_t> skips; // 该位置是否是已知位点
|
||||
|
||||
// only for apply bqsr
|
||||
RecalTables* bqsrTable; // 保留一份就够
|
||||
};
|
||||
|
|
@ -11,6 +11,8 @@ Date : 2025/10/10
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "qual_utils.h"
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
|
|
@ -36,19 +38,19 @@ struct BQSRArg {
|
|||
/* "Whether to create an index when writing VCF or coordinate sorted BAM output.", common = true */
|
||||
bool CREATE_INDEX = true;
|
||||
|
||||
string INDEX_FILE;
|
||||
|
||||
nsbqsr::IndexFormat INDEX_FORMAT = nsbqsr::IndexFormat::BAI;
|
||||
|
||||
/* Add PG tag to each read in a SAM or BAM (PGTagArgumentCollection)*/
|
||||
bool ADD_PG_TAG_TO_READS = true;
|
||||
|
||||
//
|
||||
// 命令行字符串
|
||||
string CLI_STR;
|
||||
|
||||
//
|
||||
// 开始运行时间
|
||||
string START_TIME;
|
||||
|
||||
string PROGRAM_RECORD_ID = "FastBQSR";
|
||||
|
||||
// reference file
|
||||
string REFERENCE_FILE;
|
||||
|
||||
|
|
@ -137,14 +139,6 @@ struct BQSRArg {
|
|||
*/
|
||||
double BAQGOP = 40;
|
||||
|
||||
/**
|
||||
* This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in
|
||||
* the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since base callers use these
|
||||
* values to indicate random or bad bases. For example, Illumina writes Q2 bases when the machine has really gone wrong.
|
||||
* This would be fine in and of itself, but when you select a subset of these reads based on their ability to align to the
|
||||
* reference and their dinucleotide effect, your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream.
|
||||
*/
|
||||
int PRESERVE_QSCORES_LESS_THAN = 6;
|
||||
|
||||
/**
|
||||
* enable-baq, do BAQ correction" (base alignment quality), 在GATK里hidden了,用不到了?
|
||||
|
|
@ -152,7 +146,7 @@ struct BQSRArg {
|
|||
bool enableBAQ = false;
|
||||
|
||||
/**
|
||||
* compute-indel-bqsr-tables, compute indel BQSR tables"
|
||||
* compute-indel-bqsr-tables, compute indel BQSR tables" hidden
|
||||
*/
|
||||
bool computeIndelBQSRTables = false;
|
||||
|
||||
|
|
@ -162,16 +156,93 @@ struct BQSRArg {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
|
||||
* are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
|
||||
* tag is present for a read, the standard qual score will be used.
|
||||
*/
|
||||
bool useOriginalBaseQualities = false;
|
||||
|
||||
/**
|
||||
* If reads are missing some or all base quality scores, this value will be used for all base quality scores.
|
||||
* By default this is set to -1 to disable default base quality assignment.
|
||||
*/
|
||||
int8_t defaultBaseQualities = -1;
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// args for apply bqsr
|
||||
string BQSR_FILE;
|
||||
|
||||
/**
|
||||
* Turns on the base quantization module. It requires a recalibration report.
|
||||
*
|
||||
* A value of 0 here means "do not quantize".
|
||||
* Any value greater than zero will be used to recalculate the quantization using that many levels.
|
||||
* Negative values mean that we should quantize using the recalibration report's quantization level.
|
||||
*/
|
||||
// @Argument(fullName = "quantize-quals", doc = "Quantize quality scores to a given number of levels", optional = true)
|
||||
int quantizationLevels = 0;
|
||||
|
||||
/**
|
||||
* Static quantized quals are entirely separate from the quantize_qual option which uses dynamic binning.
|
||||
* The two types of binning should not be used together.
|
||||
*
|
||||
* For example, the Warp germline pipeline uses the static bins { 10, 20, 30, 40 }
|
||||
*/
|
||||
//@Advanced @Argument(fullName = static-quantized-quals,
|
||||
// doc = "Use static quantized quality scores to a given number of levels (with -" +
|
||||
// StandardArgumentDefinitions.BQSR_TABLE_SHORT_NAME + ")", optional = true, mutex = "quantize-quals")
|
||||
vector<int> staticQuantizationQuals;
|
||||
|
||||
/**
|
||||
* Round down quantized only works with the static_quantized_quals option, and should not be used with
|
||||
* the dynamic binning option provided by quantize_quals. When roundDown = false, rounding is done in
|
||||
* probability space to the nearest bin. When roundDown = true, the value is rounded to the nearest bin
|
||||
* that is smaller than the current bin.
|
||||
*/
|
||||
// @Advanced @Argument(fullName = "round-down-quantized", doc = "Round quals down to nearest quantized qual", optional = true,
|
||||
// mutex = "quantize-quals") public
|
||||
bool roundDown = false;
|
||||
|
||||
/**
|
||||
* The tool is capable of writing out the original quality scores of each read in the recalibrated output file
|
||||
* under the "OQ" tag. By default, this behavior is disabled because emitting original qualities results in a
|
||||
* significant increase of the file size. Use this flag to turn on emission of original qualities.
|
||||
*/
|
||||
//@Argument(fullName = "emit-original-quals", doc = "Emit original base qualities under the OQ tag",
|
||||
// optional = true)
|
||||
bool emitOriginalQuals = false;
|
||||
|
||||
/**
|
||||
* If specified, the value of this argument will be used as a flat prior for all mismatching quality scores instead
|
||||
* of the reported quality score (assigned by the sequencer).
|
||||
*/
|
||||
// @Argument(fullName = "global-qscore-prior", doc = "Global Qscore Bayesian prior to use for BQSR",
|
||||
// optional = true)
|
||||
double globalQScorePrior = -1.0;
|
||||
|
||||
/**
|
||||
* If set to true, do not throw an error upon encountering a read with a read group that's not in the recalibration table.
|
||||
* Instead, simply set the quantized original base qualities as the recalibrated base qualities.
|
||||
*/
|
||||
// @Argument(fullName = allow-missing-read-group, doc = "Do not throw an error when encountering a read group not in the recal table",
|
||||
// optional = true)
|
||||
bool allowMissingReadGroups = false;
|
||||
|
||||
/**
|
||||
* This flag tells GATK not to modify quality scores less than this value. Instead they will be written out
|
||||
* unmodified in the recalibrated BAM file. In general it's unsafe to change qualities scores below < 6, since
|
||||
* base callers use these values to indicate random or bad bases. For example, Illumina writes Q2 bases when the
|
||||
* machine has really gone wrong. This would be fine in and of itself, but when you select a subset of these reads
|
||||
* based on their ability to align to the reference and their dinucleotide effect, your Q2 bin can be elevated to
|
||||
* Q8 or Q10, leading to issues downstream.
|
||||
*/
|
||||
// @Argument(fullName = "preserve-qscores-less-than", doc = "Don't recalibrate bases with quality scores less than this threshold", optional = true,
|
||||
// minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE)
|
||||
int PRESERVE_QSCORES_LESS_THAN = QualityUtils::MIN_USABLE_Q_SCORE;
|
||||
|
||||
/**
|
||||
* This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which
|
||||
* are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ
|
||||
* tag is present for a read, the standard quality score will be used.
|
||||
*/
|
||||
// @Argument(fullName = use-original-qualities, shortName = "OQ", doc = "Use the base quality scores from the OQ tag",
|
||||
// optional = true)
|
||||
bool useOriginalBaseQualities = false;
|
||||
};
|
||||
|
|
@ -1,13 +1,12 @@
|
|||
/*
|
||||
Description:
|
||||
bam,bam,bam
|
||||
Description: bqsr的第一阶段,生成协变量统计表的程序入口
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2023/10/23
|
||||
Date : 2025/11/23
|
||||
*/
|
||||
#include <header.h>
|
||||
#include <header.h> // in htslib
|
||||
#include <htslib/faidx.h>
|
||||
#include <htslib/kstring.h>
|
||||
#include <htslib/sam.h>
|
||||
|
|
@ -44,11 +43,10 @@ Date : 2023/10/23
|
|||
#include "util/stable_array.h"
|
||||
#include "util/utils.h"
|
||||
#include "util/vcf_parser.h"
|
||||
#include "common_data.h"
|
||||
|
||||
using std::deque;
|
||||
|
||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024 // 16M
|
||||
|
||||
namespace nsgv {
|
||||
|
||||
// 全局变量 for bqsr
|
||||
|
|
@ -58,29 +56,6 @@ sam_hdr_t* gInBamHeader; // input BAM header
|
|||
vector<AuxVar> gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个
|
||||
}; // namespace nsgv
|
||||
|
||||
|
||||
// 过滤掉bqsr过程不符合要求的bam数据
|
||||
bool bqsrReadFilterOut(const bam1_t *b) {
|
||||
// 过滤掉unmapped的read
|
||||
if (b->core.qual == 0) // mapping quality 0
|
||||
return true;
|
||||
if (b->core.qual == 255) // mapping quality not available
|
||||
return true;
|
||||
if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
|
||||
return true;
|
||||
}
|
||||
if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
|
||||
return true;
|
||||
}
|
||||
if (b->core.flag & BAM_FDUP) { // secondary alignment
|
||||
return true;
|
||||
}
|
||||
if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// 数据总结
|
||||
void collapseQualityScoreTableToReadGroupTable(Array2D<RecalDatum> &byReadGroupTable, Array3D<RecalDatum> &byQualTable) {
|
||||
// 遍历quality table
|
||||
|
|
@ -137,7 +112,7 @@ static void printRecalTables(const RecalTables& rt) {
|
|||
// 串行bqsr
|
||||
int SerialBQSR(AuxVar &aux) {
|
||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
|
||||
int64_t readNumSum = 0;
|
||||
int round = 0;
|
||||
|
||||
|
|
@ -227,7 +202,7 @@ int SerialBQSR(AuxVar &aux) {
|
|||
|
||||
// 8. 计算这条read对应的协变量
|
||||
PROF_START(GP_covariate);
|
||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, 0);
|
||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, 0);
|
||||
PROF_GP_END(GP_covariate);
|
||||
|
||||
// fprintf(gf[4], "%ld %d\n", sd.rid, sd.read_len);
|
||||
|
|
@ -251,7 +226,7 @@ int SerialBQSR(AuxVar &aux) {
|
|||
|
||||
// 9. 计算这条read需要跳过的位置
|
||||
PROF_START(GP_read_vcf);
|
||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, 0);
|
||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, RecalFuncs::MAX_SITES_INTERVAL, skips, 0);
|
||||
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||
skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) ||
|
||||
sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||
|
|
@ -336,6 +311,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
|
|||
int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size());
|
||||
#endif
|
||||
// spdlog::info("tid {}, index {}, steal {}", tid, idx, steal);
|
||||
// spdlog::info("interval span: {}", bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos());
|
||||
int sitesStride = bams[stopIdx-1]->end_pos() + 1 - bams[startIdx]->start_pos();
|
||||
// sitesStride = sitesStride >= RecalFuncs::MAX_SITES_INTERVAL ? sitesStride : RecalFuncs::MAX_SITES_INTERVAL;
|
||||
sitesStride = RecalFuncs::MAX_SITES_INTERVAL;
|
||||
aux.threadProcessedReads += stopIdx - startIdx;
|
||||
for (int i = startIdx; i < stopIdx; ++i) {
|
||||
// spdlog::info("Thread {} processing read idx: {}", tid, i);
|
||||
|
|
@ -364,10 +343,10 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
|
|||
if (!baqCalculated) continue;
|
||||
|
||||
PROF_START(TP_covariate);
|
||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, true, thid);
|
||||
CovariateUtils::ComputeCovariates(sd, aux.header, readCovariates, nsgv::gBqsrArg.computeIndelBQSRTables, thid);
|
||||
PROF_TP_END(TP_covariate);
|
||||
|
||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, thid);
|
||||
RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, sitesStride, skips, thid);
|
||||
for (int ii = 0; ii < sd.read_len; ++ii) {
|
||||
skips[ii] =
|
||||
skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
|
||||
|
|
@ -390,7 +369,7 @@ static void thread_worker(void* data, long idx, int thid, int steal) {
|
|||
// 并行bqsr
|
||||
int ParallelBQSR(vector<AuxVar>& auxArr) {
|
||||
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
|
||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, bqsrReadFilterOut);
|
||||
inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM, RecalFuncs::bqsrReadFilterOut);
|
||||
int64_t readNumSum = 0;
|
||||
int round = 0;
|
||||
|
||||
|
|
@ -484,8 +463,8 @@ static void globalInit() {
|
|||
|
||||
/* 并行读取bam数据 */
|
||||
htsThreadPool htsPoolRead = {NULL, 0}; //
|
||||
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, 8);
|
||||
htsPoolRead.pool = hts_tpool_init(nsgv::gBqsrArg.NUM_THREADS);
|
||||
int readThreadNum = min(nsgv::gBqsrArg.NUM_THREADS, BAM_READ_MAX_THREAD);
|
||||
htsPoolRead.pool = hts_tpool_init(readThreadNum);
|
||||
if (!htsPoolRead.pool ) {
|
||||
spdlog::error("[{}] failed to set up thread pool", __LINE__);
|
||||
sam_close(nsgv::gInBamFp);
|
||||
|
|
@ -549,9 +528,10 @@ static void globalInit() {
|
|||
// 全局资源释放
|
||||
static void globalDestroy() {
|
||||
close_debug_files();
|
||||
sam_close(nsgv::gInBamFp);
|
||||
}
|
||||
|
||||
// entrance of mark BQSR
|
||||
// entrance of BQSR phase-1
|
||||
int BaseRecalibrator() {
|
||||
int ret = 0;
|
||||
|
||||
|
|
@ -562,7 +542,6 @@ int BaseRecalibrator() {
|
|||
else
|
||||
ret = ParallelBQSR(nsgv::gAuxVars); // 并行处理数据,生成recal table
|
||||
globalDestroy();
|
||||
sam_close(nsgv::gInBamFp);
|
||||
PROF_GP_END(GP_whole_process);
|
||||
|
||||
return ret;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
Description: 共用的一些宏定义,全局变量等
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2026/01/01
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "bqsr_args.h"
|
||||
#include <htslib/sam.h>
|
||||
#include "aux_arg.h"
|
||||
#include "fastbqsr_version.h"
|
||||
|
||||
#define BAM_BLOCK_SIZE 16L * 1024 * 1024 // 16M
|
||||
#define BAM_READ_MAX_THREAD 8
|
||||
#define BAM_WRITE_MAX_THREAD 16
|
||||
|
||||
#define PROGRAM_NAME "FastBQSR"
|
||||
|
||||
namespace nsgv {
|
||||
|
||||
// 全局变量 for bqsr
|
||||
extern BQSRArg gBqsrArg; // bqsr arguments
|
||||
extern samFile* gInBamFp; // input BAM file pointer
|
||||
extern sam_hdr_t* gInBamHeader; // input BAM header
|
||||
extern vector<AuxVar> gAuxVars; // auxiliary variables,保存一些文件,数据等,每个线程对应一个
|
||||
}; // namespace nsgv
|
||||
|
|
@ -250,19 +250,20 @@ void ContextCovariate::GetStrandedClippedBytes(SamData& sd, StableArray<char>& c
|
|||
* @param end the end position in the array (exclusive)
|
||||
* @return the key representing the dna sequence
|
||||
*/
|
||||
int ContextCovariate::KeyFromContext(const StableArray<char>& dna, const int start, const int end) {
|
||||
int key = end - start;
|
||||
int bitOffset = LENGTH_BITS;
|
||||
for (int i = start; i < end; i++) {
|
||||
const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||||
if (baseIndex == -1) { // ignore non-ACGT bases
|
||||
return -1;
|
||||
}
|
||||
key |= (baseIndex << bitOffset);
|
||||
bitOffset += 2;
|
||||
}
|
||||
return key;
|
||||
}
|
||||
// template<typename Arr> // 好像是类的模板函数必须在头文件里实现
|
||||
// int ContextCovariate::KeyFromContext(const Arr& dna, const int start, const int end) {
|
||||
// int key = end - start;
|
||||
// int bitOffset = LENGTH_BITS;
|
||||
// for (int i = start; i < end; i++) {
|
||||
// const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||||
// if (baseIndex == -1) { // ignore non-ACGT bases
|
||||
// return -1;
|
||||
// }
|
||||
// key |= (baseIndex << bitOffset);
|
||||
// bitOffset += 2;
|
||||
// }
|
||||
// return key;
|
||||
// }
|
||||
|
||||
/**
|
||||
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||
|
|
@ -334,6 +335,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const StableArray<char>& bas
|
|||
void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||||
const int originalReadLength = sd.read_len;
|
||||
|
||||
const char* qname = bam_get_qname(sd.bw->b);
|
||||
// store the original bases and then write Ns over low quality ones
|
||||
auto &strandedClippedBases = sd.strandedClippedBases;
|
||||
strandedClippedBases.copy(sd.bases);
|
||||
|
|
@ -348,13 +350,13 @@ void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovar
|
|||
// since the context covariate may not span the entire set of values in read covariates
|
||||
// due to the clipping of the low quality bases
|
||||
// 这段代码应该不会执行,因为clip with N不会改变read长度
|
||||
if (readLengthAfterClipping != originalReadLength) {
|
||||
// don't bother zeroing out if we are going to overwrite the whole array
|
||||
for (int i = 0; i < originalReadLength; i++) {
|
||||
// this base has been clipped off, so zero out the covariate values here
|
||||
CovariateUtils::SetContext(0, 0, 0, i, values);
|
||||
}
|
||||
}
|
||||
// if (readLengthAfterClipping != originalReadLength) {
|
||||
// // don't bother zeroing out if we are going to overwrite the whole array
|
||||
// for (int i = 0; i < originalReadLength; i++) {
|
||||
// // this base has been clipped off, so zero out the covariate values here
|
||||
// CovariateUtils::SetContext(0, 0, 0, i, values);
|
||||
// }
|
||||
// }
|
||||
|
||||
const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag();
|
||||
// Note: duplicated the loop to avoid checking recordIndelValues on each iteration
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ using std::vector;
|
|||
|
||||
// 协变量的值, 4个协变量
|
||||
struct CovariateValues {
|
||||
int readGroup = 0;
|
||||
int readGroup = 0; // 默认是read group 0
|
||||
int baseQuality = 0;
|
||||
int context = -1;
|
||||
int cycle = -1;
|
||||
|
|
@ -61,6 +61,16 @@ struct EventType {
|
|||
static EventTypeValue BASE_INSERTION;
|
||||
static EventTypeValue BASE_DELETION;
|
||||
static vector<EventTypeValue> EVENTS;
|
||||
static int GetIndexForEventRep(char eventType) {
|
||||
if (eventType == BASE_SUBSTITUTION.representation) {
|
||||
return BASE_SUBSTITUTION.index;
|
||||
} else if (eventType == BASE_INSERTION.representation) {
|
||||
return BASE_DELETION.index;
|
||||
} else if (eventType == BASE_DELETION.representation) {
|
||||
return BASE_DELETION.index;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Read group协变量
|
||||
|
|
@ -201,7 +211,22 @@ struct ContextCovariate {
|
|||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||||
static void GetStrandedClippedBytes(SamData& ad, StableArray<char>& clippedBases, uint8_t lowQTail);
|
||||
// Creates a int representation of a given dna string.
|
||||
static int KeyFromContext(const StableArray<char>& dna, const int start, const int end);
|
||||
template <typename Arr>
|
||||
static int KeyFromContext(const Arr& dna, const int start, const int end) {
|
||||
int key = end - start;
|
||||
int bitOffset = LENGTH_BITS;
|
||||
for (int i = start; i < end; i++) {
|
||||
const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||||
if (baseIndex == -1) { // ignore non-ACGT bases
|
||||
return -1;
|
||||
}
|
||||
key |= (baseIndex << bitOffset);
|
||||
bitOffset += 2;
|
||||
}
|
||||
return key;
|
||||
}
|
||||
|
||||
static int KeyFromContext(const string& dna) { return KeyFromContext(dna, 0, dna.size()); }
|
||||
// For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||||
static void GetReadContextAtEachPosition(const StableArray<char>& bases, const int contextSize, const int mask, StableArray<int>& keys);
|
||||
|
||||
|
|
@ -219,6 +244,7 @@ struct CycleCovariate {
|
|||
|
||||
static int MaximumKeyValue() { return (MAXIMUM_CYCLE_VALUE << 1) + 1; }
|
||||
|
||||
static int KeyFromCycle(const int cycle) { return KeyFromCycle(cycle, MAXIMUM_CYCLE_VALUE); }
|
||||
/**
|
||||
* Encodes the cycle number as a key.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
Description: apply bqsr过程中对sam的一些修改函数
|
||||
|
||||
Copyright : All right reserved by ICT
|
||||
|
||||
Author : Zhang Zhonghai
|
||||
Date : 2026/01/02
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <htslib/sam.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
struct ReadUtils {
|
||||
// sam/bam文件中的quality score要在原来的quality上加33,应该是映射到字符区,方便用字母表示
|
||||
// htslib解析之后已经减掉了,就是真实的质量分数
|
||||
static constexpr int QUALITY_SCORE_ADD_IN_FILE = 33;
|
||||
/**
|
||||
* Resets the quality scores of the reads to the orginal (pre-BQSR) ones.
|
||||
*/
|
||||
static void resetOriginalBaseQualities(bam1_t *b) {
|
||||
uint8_t* oq = bam_aux_get(b, "OQ");
|
||||
char* oqVal = nullptr;
|
||||
if (oq)
|
||||
oqVal = bam_aux2Z(oq);
|
||||
int crg = 0;
|
||||
if (oqVal == nullptr) {
|
||||
} else {
|
||||
uint8_t* quals = bam_get_qual(b);
|
||||
for (int i = 0; i < b->core.l_qseq; ++i) {
|
||||
quals[i] = oqVal[i] - QUALITY_SCORE_ADD_IN_FILE; // 要减掉33
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// get read group id
|
||||
static const char* getReadGroupId(bam1_t *b) {
|
||||
uint8_t* rgStr = bam_aux_get(b, "RG");
|
||||
char* rgVal = nullptr;
|
||||
if (rgStr)
|
||||
rgVal = bam_aux2Z(rgStr);
|
||||
return rgVal;
|
||||
}
|
||||
|
||||
// 将当前quals保存在OQ tag中 (如果当前read不存在OQ tag的情况下)
|
||||
static void setOriginalBaseQualsIfNoOQ(bam1_t *b) {
|
||||
uint8_t* tagData = bam_aux_get(b, "OQ");
|
||||
if (tagData)
|
||||
return;
|
||||
char* tagVal = nullptr;
|
||||
const char* oldQual = (char*)bam_get_qual(b);
|
||||
string qual(b->core.l_qseq + 1, '\0');
|
||||
for (int i = 0; i < qual.size() - 1; ++i) {
|
||||
qual[i] = oldQual[i] + QUALITY_SCORE_ADD_IN_FILE;
|
||||
}
|
||||
// bam_aux_append 最后一个字符必须是'\0'
|
||||
if (bam_aux_append(b, "OQ", 'Z', qual.size(), (const uint8_t*)qual.c_str()) != 0) {
|
||||
spdlog::error("Add OQ (original quality score) tag failed. ");
|
||||
}
|
||||
}
|
||||
|
||||
// 移除给定的tag
|
||||
static void removeAttribute(bam1_t *b, const string &tag) {
|
||||
uint8_t* tagData = bam_aux_get(b, "OQ");
|
||||
if (!tagData)
|
||||
return;
|
||||
if (bam_aux_remove(b, tagData) == nullptr) {
|
||||
spdlog::error("Remove tag {} failed. ", tag);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
@ -147,14 +147,18 @@ struct RecalDatum {
|
|||
*/
|
||||
inline double calcExpectedErrors() const { return numObservations * QualityUtils::qualToErrorProb(reportedQuality); }
|
||||
inline double getNumMismatches() const { return numMismatches / MULTIPLIER; }
|
||||
inline void setNumMismatches(double val) { numMismatches = val * MULTIPLIER; }
|
||||
inline uint64_t getNumObservations() const { return numObservations; }
|
||||
inline void setNumObservations(uint64_t val) { numObservations = val; }
|
||||
inline double getReportedQuality() const { return reportedQuality; }
|
||||
inline void setReportedQuality(double val) { reportedQuality = val; }
|
||||
|
||||
/**
|
||||
* Computes the empirical quality of the datum, using the reported quality as the prior.
|
||||
* @see #getEmpiricalQuality(double) below.
|
||||
*/
|
||||
double getEmpiricalQuality() { return getEmpiricalQuality(getReportedQuality()); }
|
||||
inline void setEmpiricalQuality(double val) { empiricalQuality = val; }
|
||||
|
||||
/**
|
||||
* Computes the empirical base quality (roughly (num errors)/(num observations)) from the counts stored in this datum.
|
||||
|
|
|
|||
|
|
@ -8,19 +8,55 @@
|
|||
*/
|
||||
#pragma once
|
||||
|
||||
#include "util/stable_array.h"
|
||||
#include "util/sam_data.h"
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <numeric>
|
||||
|
||||
#include "bqsr/aux_arg.h"
|
||||
#include "util/bam_wrap.h"
|
||||
#include "util/interval.h"
|
||||
#include "util/vcf_parser.h"
|
||||
#include "util/profiling.h"
|
||||
#include "util/debug.h"
|
||||
#include "util/interval.h"
|
||||
#include "util/profiling.h"
|
||||
#include "util/sam_data.h"
|
||||
#include "util/stable_array.h"
|
||||
#include "util/vcf_parser.h"
|
||||
|
||||
struct RecalFuncs {
|
||||
static constexpr int MAX_SITES_INTERVAL = 100000;
|
||||
//static constexpr int MAX_SITES_INTERVAL = 100000;
|
||||
static constexpr int MAX_SITES_INTERVAL = 21500; // 经验值,这个数读取vcf和计算的性能最好
|
||||
static constexpr uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@';
|
||||
|
||||
// 过滤掉bqsr过程不符合要求的bam数据
|
||||
static bool bqsrReadFilterOut(const bam1_t* b) {
|
||||
// 过滤掉unmapped的read
|
||||
if (b->core.qual == 0) // mapping quality 0
|
||||
return true;
|
||||
if (b->core.qual == 255) // mapping quality not available
|
||||
return true;
|
||||
if (b->core.flag & BAM_FUNMAP || b->core.tid == -1 || b->core.pos == -1) { // unmapped
|
||||
return true;
|
||||
}
|
||||
if (b->core.flag & BAM_FSECONDARY) { // secondary alignment
|
||||
return true;
|
||||
}
|
||||
if (b->core.flag & BAM_FDUP) { // secondary alignment
|
||||
return true;
|
||||
}
|
||||
if (b->core.flag & BAM_FQCFAIL) { // Not passing quality controls
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// 过滤掉apply bqsr过程不符合要求的bam数据
|
||||
static bool applyBqsrReadFilterOut(const bam1_t* b) {
|
||||
// 好像4.6.2的GATK版本的welformed filter啥也没过滤
|
||||
// b->core.flag & BAM_FUNMAP ||
|
||||
if ((b->core.tid == -1 || b->core.pos == -1) && !(b->core.flag & BAM_FMUNMAP)) { // unmapped
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// 设置某个位置是indel
|
||||
static inline void updateIndel(StableArray<int>& isIndel, int index) {
|
||||
if (index >= 0 && index < isIndel.size()) {
|
||||
|
|
@ -94,18 +130,16 @@ struct RecalFuncs {
|
|||
}
|
||||
|
||||
// 获取一行字符串
|
||||
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* line) {
|
||||
line->clear();
|
||||
if (*cur >= total)
|
||||
static void get_line_from_buf(char* buf, int64_t total, int64_t* cur) {
|
||||
if (*cur >= total) {
|
||||
(*cur)++;
|
||||
return;
|
||||
char b;
|
||||
while (*cur < total && (b = buf[(*cur)++]) != '\n') {
|
||||
line->push_back(b);
|
||||
}
|
||||
while (*cur < total && buf[(*cur)++] != '\n');
|
||||
}
|
||||
|
||||
// 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取
|
||||
static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, StableArray<uint8_t>& knownSites, int thid) {
|
||||
static void calculateKnownSites(SamData& sd, vector<VCFParser>& vcfs, sam_hdr_t* samHdr, int sitesStride, StableArray<uint8_t>& knownSites, int thid) {
|
||||
BamWrap* bw = sd.bw;
|
||||
int tid = bw->contig_id();
|
||||
int64_t startPos = bw->start_pos(); // 闭区间,使用clip之前的read匹配的范围
|
||||
|
|
@ -113,7 +147,6 @@ struct RecalFuncs {
|
|||
knownSites.resize_fill(sd.read_len, 0);
|
||||
|
||||
// update vcfs
|
||||
// int idx = 0;
|
||||
PROF_START(TP_read_vcf);
|
||||
for (auto& vcf : vcfs) {
|
||||
if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam,不需要读取
|
||||
|
|
@ -130,7 +163,7 @@ struct RecalFuncs {
|
|||
vcf.knownSites.clear(); // 清空,因为后面会读入覆盖bam的所有vcf位点
|
||||
// 读取新的interval
|
||||
int64_t fpos, flen;
|
||||
endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos);
|
||||
endPos = std::max(startPos + sitesStride, endPos);
|
||||
Interval readIntv(startPos, endPos);
|
||||
vcf.index.SearchInterval(startPos, endPos, &fpos, &flen);
|
||||
// fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen);
|
||||
|
|
@ -143,10 +176,12 @@ struct RecalFuncs {
|
|||
}
|
||||
char* buf = vcf.buf;
|
||||
vcf.inStm.read(buf, flen);
|
||||
string line;
|
||||
int64_t cur = 0;
|
||||
get_line_from_buf(buf, flen, &cur, &line);
|
||||
while (line.size() > 0) {
|
||||
int64_t start = 0;
|
||||
get_line_from_buf(buf, flen, &cur);
|
||||
while (cur > start + 1) {
|
||||
const string line(buf + start, buf + cur - 1);
|
||||
// spdlog::info("s: {}, e: {}, line: {}", start, cur, line);
|
||||
stringstream ss_line(line);
|
||||
string stid;
|
||||
int tid, pos;
|
||||
|
|
@ -159,7 +194,8 @@ struct RecalFuncs {
|
|||
if (varIntv.overlaps(readIntv)) {
|
||||
vcf.knownSites.push_back(varIntv); // 闭区间
|
||||
}
|
||||
get_line_from_buf(buf, flen, &cur, &line);
|
||||
start = cur;
|
||||
get_line_from_buf(buf, flen, &cur);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -195,7 +231,6 @@ struct RecalFuncs {
|
|||
knownSites[i] = true;
|
||||
}
|
||||
}
|
||||
//idx += 1;
|
||||
}
|
||||
PROF_TP_END(TP_calc_skips);
|
||||
//fprintf(gf[0], "\n");
|
||||
|
|
@ -241,4 +276,53 @@ struct RecalFuncs {
|
|||
calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// for apply bqsr
|
||||
/**
|
||||
* Quality score recalibration algorithm works as follows:
|
||||
* - Start with the (approximate, or "estimated") reported quality score. (Approximation occurs when marginalizing/collapsing
|
||||
* over the reported qualities for each read group).
|
||||
* - Compute (retrieve?) the empirical quality score using the per-read group datum (i.e. counts). Call it y_1.
|
||||
* - Use y_1 just computed as the prior for the empirical quality score for the datum for the 2-tuple ( read group, quality score). Call it y_2.
|
||||
* - Use y_2 as the prior to compute the empirical quality for the 3-tuple ( read-group, quality-score, special covariate ). Call it y_3 for the
|
||||
* context covariate. Similarly define y_4 for the cycle covariate. Let d_3 = y_3 - y_2, d_4 = y_4 - y_2.
|
||||
* - (final recalibrated score) = y_2 + d_3 + d_4 = y_3 + y_4 - y_2.
|
||||
*
|
||||
* @param priorQualityScore the prior quality score (in log space). It is either the "estimated" or collapsed reported quality score
|
||||
* for the read group, or the constant prior if given. This value has type double because of the "combine" (or collapse)
|
||||
* operation that collapses the quality scores represented within the same read group.
|
||||
* @param readGroupDatum the RecalDatum object for a particular read group at hand. May be null.
|
||||
* @param qualityScoreDatum the RecalDatum object for a particular (read group, reported quality) tuple at hand. May be null.
|
||||
* @param specialCovariateDatums the array of RecalDatum objects for the non-required covariates (cycle and context covariates by default).
|
||||
* @return
|
||||
*/
|
||||
static double hierarchicalBayesianQualityEstimate(double priorQualityScore, RecalDatum& readGroupDatum, RecalDatum& qualityScoreDatum,
|
||||
RecalDatum& contextDatum, RecalDatum& cycleDatum) {
|
||||
double empiricalQualityForReadGroup =
|
||||
readGroupDatum.getNumObservations() == 0 ? priorQualityScore : readGroupDatum.getEmpiricalQuality(priorQualityScore);
|
||||
double posteriorEmpiricalQualityForReportedQuality = qualityScoreDatum.getNumObservations() == 0
|
||||
? empiricalQualityForReadGroup
|
||||
: qualityScoreDatum.getEmpiricalQuality(empiricalQualityForReadGroup);
|
||||
|
||||
double deltaSpecialCovariates = 0.0;
|
||||
// At this point we stop being iterative; the special covariates (context and cycle by default) are treated differently.
|
||||
if (contextDatum.getNumObservations() > 0) {
|
||||
// TODO: the prior is ignored if the empirical quality for the datum is already cached.
|
||||
deltaSpecialCovariates +=
|
||||
contextDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
|
||||
}
|
||||
if (cycleDatum.getNumObservations() > 0) {
|
||||
// TODO: the prior is ignored if the empirical quality for the datum is already cached.
|
||||
deltaSpecialCovariates +=
|
||||
cycleDatum.getEmpiricalQuality(posteriorEmpiricalQualityForReportedQuality) - posteriorEmpiricalQualityForReportedQuality;
|
||||
}
|
||||
|
||||
return posteriorEmpiricalQualityForReportedQuality + deltaSpecialCovariates;
|
||||
}
|
||||
|
||||
// recalibrated quality is bound between 1 and MAX_QUAL
|
||||
static uint8_t getBoundedIntegerQual(double recalibratedQualDouble) {
|
||||
return QualityUtils::boundQual(MathUtils::fastRound(recalibratedQualDouble), RecalDatum::MAX_RECALIBRATED_Q_SCORE);
|
||||
}
|
||||
};
|
||||
|
|
@ -11,9 +11,10 @@
|
|||
#include "covariate.h"
|
||||
#include "nested_array.h"
|
||||
#include "recal_datum.h"
|
||||
#include "qual_utils.h"
|
||||
|
||||
struct RecalTables {
|
||||
int qualDimension = 94; // MAX_SAM_QUAL_SCORE(93) + 1
|
||||
int qualDimension = QualityUtils::MAX_SAM_QUAL_SCORE + 1; // MAX_SAM_QUAL_SCORE(93) + 1
|
||||
int eventDimension = EventType::EVENT_SIZE;
|
||||
int numReadGroups;
|
||||
|
||||
|
|
@ -41,4 +42,22 @@ struct RecalTables {
|
|||
contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
|
||||
cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
|
||||
}
|
||||
|
||||
void initReadGroupTable(int _numReadGroups) {
|
||||
numReadGroups = _numReadGroups;
|
||||
readGroupTable.init(eventDimension, numReadGroups);
|
||||
}
|
||||
|
||||
// 必须在调用initReadGroupTable之后,或者设置了numReadGroups之后
|
||||
void initQualityScoreTable() {
|
||||
qualityScoreTable.init(eventDimension, numReadGroups, qualDimension);
|
||||
}
|
||||
|
||||
void initContextTable() {
|
||||
contextTable.init(eventDimension, numReadGroups, qualDimension, ContextCovariate::MaximumKeyValue() + 1);
|
||||
}
|
||||
|
||||
void initCycleTable() {
|
||||
cycleTable.init(eventDimension, numReadGroups, qualDimension, CycleCovariate::MaximumKeyValue() + 1);
|
||||
}
|
||||
};
|
||||
|
|
@ -11,6 +11,8 @@
|
|||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include "bqsr_args.h"
|
||||
#include "nested_array.h"
|
||||
|
|
@ -79,25 +81,6 @@ struct RecalUtils {
|
|||
}
|
||||
}
|
||||
}
|
||||
// fprintf(gf[4], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
||||
// _Foreach3D(qualityScoreTable, val, {
|
||||
// if (val.numObservations > 0)
|
||||
// fprintf(gf[4], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||
// });
|
||||
// fprintf(gf[4], "\n");
|
||||
|
||||
// fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1);
|
||||
// for (auto& arr1 : contextTable.data) {
|
||||
// for (size_t si = 0; si < arr1.size(); ++si) {
|
||||
// for (auto &arr2 : arr1[si]) {
|
||||
// for (auto& val : arr2) {
|
||||
// if (val.numObservations > 0)
|
||||
// fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// fprintf(gf[3], "\n");
|
||||
}
|
||||
|
||||
// 输出bqsr报告
|
||||
|
|
@ -110,7 +93,7 @@ struct RecalUtils {
|
|||
exit(1);
|
||||
}
|
||||
// 输出version信息
|
||||
fprintf(fpout, "%s\n", REPORT_HEADER_VERSION);
|
||||
fprintf(fpout, "%s:%s\n", REPORT_HEADER_VERSION, REPORT_HEADER_MINOR_VERSION);
|
||||
// 输出参数信息
|
||||
outputArgsTable(RAC, fpout);
|
||||
// 输出量化质量分数信息
|
||||
|
|
@ -268,4 +251,220 @@ struct RecalUtils {
|
|||
});
|
||||
table.write(fpout);
|
||||
}
|
||||
|
||||
// 解析bqsr table文件
|
||||
static void readRecalTables(const string& recalTableFn, BQSRArg& RAC, vector<uint8_t>& quantizedScore, RecalTables& recalTables) {
|
||||
std::ifstream infs(recalTableFn, ifstream::in);
|
||||
string line;
|
||||
// 读取文件头
|
||||
std::getline(infs, line);
|
||||
spdlog::info("header: {}", line);
|
||||
if (line.find(REPORT_HEADER_VERSION) != 0) {
|
||||
spdlog::error("BQSR table version not supported! {} {}", line, __LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// 读取参数表
|
||||
readArgsTable(infs, RAC);
|
||||
|
||||
readQuantTable(infs, quantizedScore);
|
||||
|
||||
readReadGroupTable(infs, recalTables);
|
||||
|
||||
readQualityScoreTable(infs, recalTables);
|
||||
|
||||
readContextCycleTable(infs, recalTables);
|
||||
|
||||
infs.close();
|
||||
}
|
||||
|
||||
template<typename Arr>
|
||||
static void split(const string& str, Arr& dat) {
|
||||
dat.clear();
|
||||
std::istringstream iss(str);
|
||||
string item;
|
||||
while (iss >> item) {
|
||||
dat.push_back(item);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Arr>
|
||||
static void split(const string& str, const char delimiter, Arr& dat) {
|
||||
dat.clear();
|
||||
std::istringstream iss(str);
|
||||
string token;
|
||||
while (std::getline(iss, token, delimiter)) {
|
||||
dat.push_back(token);
|
||||
}
|
||||
}
|
||||
|
||||
// 读取参数表
|
||||
static void readArgsTable(std::ifstream& infs, BQSRArg& p) {
|
||||
string line;
|
||||
StableArray<string> dat;
|
||||
std::getline(infs, line);
|
||||
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||
split(line, ':', dat);
|
||||
int argRowCnt = std::stoi(dat[3]);
|
||||
std::getline(infs, line); // 忽略表格说明的一行
|
||||
std::getline(infs, line); // 忽略表头column说明
|
||||
for (int i = 0; i < argRowCnt; ++i) {
|
||||
std::getline(infs, line);
|
||||
split(line, dat);
|
||||
if ("covariate" == dat[0]) {
|
||||
} else if ("no_standard_covs" == dat[0]) {
|
||||
p.DO_NOT_USE_STANDARD_COVARIATES = ReportUtil::ToBool(dat[1]);
|
||||
} else if ("run_without_dbsnp" == dat[0]) {
|
||||
p.RUN_WITHOUT_DBSNP = ReportUtil::ToBool(dat[1]);
|
||||
} else if ("solid_recal_mode" == dat[0]) {
|
||||
p.SOLID_RECAL_MODE = ReportUtil::ParseString(dat[1]);
|
||||
} else if ("solid_nocall_strategy" == dat[0]) {
|
||||
p.SOLID_NOCALL_STRATEGY = ReportUtil::ParseString(dat[1]);
|
||||
} else if ("mismatches_context_size" == dat[0]) {
|
||||
p.MISMATCHES_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("indels_context_size" == dat[0]) {
|
||||
p.INDELS_CONTEXT_SIZE = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("mismatches_default_quality" == dat[0]) {
|
||||
p.MISMATCHES_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
|
||||
// spdlog::info("int8_t : {}", (int)p.MISMATCHES_DEFAULT_QUALITY);
|
||||
} else if ("deletions_default_quality" == dat[0]) {
|
||||
p.DELETIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("insertions_default_quality" == dat[0]) {
|
||||
p.INSERTIONS_DEFAULT_QUALITY = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("maximum_cycle_value" == dat[0]) {
|
||||
p.MAXIMUM_CYCLE_VALUE = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("low_quality_tail" == dat[0]) {
|
||||
p.LOW_QUAL_TAIL = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("default_platform" == dat[0]) {
|
||||
p.DEFAULT_PLATFORM = ReportUtil::ParseString(dat[1]);
|
||||
} else if ("force_platform" == dat[0]) {
|
||||
p.FORCE_PLATFORM = ReportUtil::ParseString(dat[1]);
|
||||
} else if ("quantizing_levels" == dat[0]) {
|
||||
p.QUANTIZING_LEVELS = ReportUtil::ToInt(dat[1]);
|
||||
} else if ("recalibration_report" == dat[0]) {
|
||||
p.existingRecalibrationReport = ReportUtil::ParseString(dat[1]);
|
||||
} else if ("binary_tag_name" == dat[0]) {
|
||||
p.BINARY_TAG_NAME = ReportUtil::ParseString(dat[1]);
|
||||
// spdlog::info("BINARY_TAG_NAME : {}", p.BINARY_TAG_NAME);
|
||||
} else {
|
||||
spdlog::error("unknown argument : {} {}", dat[0], dat[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 读取量化质量分数信息表
|
||||
static void readQuantTable(std::ifstream& infs, vector<uint8_t>& quantizedScore) {
|
||||
string line;
|
||||
StableArray<string> dat;
|
||||
std::getline(infs, line);
|
||||
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||
split(line, ':', dat);
|
||||
int argRowCnt = std::stoi(dat[3]);
|
||||
std::getline(infs, line); // 忽略表格说明的一行
|
||||
std::getline(infs, line); // 忽略表头column说明
|
||||
quantizedScore.resize(argRowCnt);
|
||||
for (int i = 0; i < argRowCnt; ++i) {
|
||||
std::getline(infs, line);
|
||||
// spdlog::info("{}", line);
|
||||
split(line, dat);
|
||||
quantizedScore[ReportUtil::ToInt(dat[0])] = ReportUtil::ToInt(dat[2]);
|
||||
// spdlog::info("{}, {}", ReportUtil::ToInt(dat[0]), quantizedScore[ReportUtil::ToInt(dat[0])]);
|
||||
}
|
||||
}
|
||||
|
||||
// 读取read group表
|
||||
static void readReadGroupTable(std::ifstream& infs, RecalTables& recalTables) {
|
||||
string line;
|
||||
StableArray<string> dat;
|
||||
std::getline(infs, line);
|
||||
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||
split(line, ':', dat);
|
||||
int argRowCnt = std::stoi(dat[3]);
|
||||
std::getline(infs, line); // 忽略表格说明的一行
|
||||
std::getline(infs, line); // 忽略表头column说明
|
||||
vector<vector<string>> dats(argRowCnt);
|
||||
int rgId = 0;
|
||||
for (int i = 0; i < argRowCnt; ++i) {
|
||||
std::getline(infs, line);
|
||||
split(line, dats[i]);
|
||||
if (ReadGroupCovariate::RgToId.find(dats[i][0]) == ReadGroupCovariate::RgToId.end()) {
|
||||
ReadGroupCovariate::RgToId[dats[i][0]] = rgId;
|
||||
ReadGroupCovariate::IdToRg[rgId] = dats[i][0];
|
||||
++rgId;
|
||||
}
|
||||
}
|
||||
recalTables.initReadGroupTable(rgId);
|
||||
// spdlog::info("read group num: {}, {}", rgId, ReadGroupCovariate::IdToRg[0]);
|
||||
for (int i = 0; i < dats.size(); ++i) {
|
||||
int k2 = ReadGroupCovariate::RgToId[dats[i][0]];
|
||||
int k1 = EventType::GetIndexForEventRep(dats[i][1][0]);
|
||||
auto& datum = recalTables.readGroupTable(k1, k2);
|
||||
datum.setEmpiricalQuality(ReportUtil::ToDouble(dats[i][2]));
|
||||
datum.setReportedQuality(ReportUtil::ToDouble(dats[i][3]));
|
||||
datum.setNumObservations(ReportUtil::ToUint64(dats[i][4]));
|
||||
datum.setNumMismatches(ReportUtil::ToDouble(dats[i][5]));
|
||||
}
|
||||
}
|
||||
|
||||
// 读取质量分数表
|
||||
static void readQualityScoreTable(std::ifstream& infs, RecalTables& recalTables) {
|
||||
string line;
|
||||
StableArray<string> dat;
|
||||
std::getline(infs, line);
|
||||
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||
split(line, ':', dat);
|
||||
int argRowCnt = std::stoi(dat[3]);
|
||||
std::getline(infs, line); // 忽略表格说明的一行
|
||||
std::getline(infs, line); // 忽略表头column说明
|
||||
recalTables.initQualityScoreTable();
|
||||
int rgId = 0;
|
||||
for (int i = 0; i < argRowCnt; ++i) {
|
||||
std::getline(infs, line);
|
||||
split(line, dat);
|
||||
int k2 = ReadGroupCovariate::RgToId[dat[0]];
|
||||
int k3 = ReportUtil::ToInt(dat[1]);
|
||||
int k1 = EventType::GetIndexForEventRep(dat[2][0]);
|
||||
auto& datum = recalTables.qualityScoreTable(k1, k2, k3);
|
||||
datum.setEmpiricalQuality(ReportUtil::ToDouble(dat[3]));
|
||||
datum.setReportedQuality((double)k3);
|
||||
datum.setNumObservations(ReportUtil::ToUint64(dat[4]));
|
||||
datum.setNumMismatches(ReportUtil::ToDouble(dat[5]));
|
||||
}
|
||||
}
|
||||
|
||||
// 读取context和cycle表
|
||||
static void readContextCycleTable(std::ifstream& infs, RecalTables& recalTables) {
|
||||
string line;
|
||||
StableArray<string> dat;
|
||||
std::getline(infs, line);
|
||||
while (line.empty() && std::getline(infs, line)); // 跳过空行
|
||||
split(line, ':', dat);
|
||||
int argRowCnt = std::stoi(dat[3]);
|
||||
std::getline(infs, line); // 忽略表格说明的一行
|
||||
std::getline(infs, line); // 忽略表头column说明
|
||||
recalTables.initContextTable();
|
||||
recalTables.initCycleTable();
|
||||
for (int i = 0; i < argRowCnt; ++i) {
|
||||
std::getline(infs, line);
|
||||
split(line, dat);
|
||||
int k2 = ReadGroupCovariate::RgToId[dat[0]];
|
||||
int k3 = ReportUtil::ToInt(dat[1]);
|
||||
int k4 = 0;
|
||||
int k1 = EventType::GetIndexForEventRep(dat[4][0]);
|
||||
RecalDatum* dp;
|
||||
if (dat[3] == "Cycle") {
|
||||
k4 = CycleCovariate::KeyFromCycle(ReportUtil::ToInt(dat[2]));
|
||||
dp = &recalTables.cycleTable(k1, k2, k3, k4);
|
||||
} else if (dat[3] == "Context") {
|
||||
k4 = ContextCovariate::KeyFromContext(dat[2]);
|
||||
dp = &recalTables.contextTable(k1, k2, k3, k4);
|
||||
} else {
|
||||
spdlog::error("Unknown CovariateName {}", dat[3]);
|
||||
}
|
||||
dp->setEmpiricalQuality(ReportUtil::ToDouble(dat[5]));
|
||||
dp->setReportedQuality((double)k3);
|
||||
dp->setNumObservations(ReportUtil::ToUint64(dat[6]));
|
||||
dp->setNumMismatches(ReportUtil::ToDouble(dat[7]));
|
||||
}
|
||||
}
|
||||
};
|
||||
206
src/main.cpp
206
src/main.cpp
|
|
@ -12,10 +12,7 @@
|
|||
#include "fastbqsr_version.h"
|
||||
#include "bqsr/bqsr_args.h"
|
||||
#include "util/profiling.h"
|
||||
|
||||
namespace nsgv {
|
||||
extern BQSRArg gBqsrArg;
|
||||
};
|
||||
#include "bqsr/common_data.h"
|
||||
|
||||
int BaseRecalibrator();
|
||||
int ApplyBQSR();
|
||||
|
|
@ -30,28 +27,18 @@ string get_current_time_str() {
|
|||
return string(now);
|
||||
}
|
||||
|
||||
int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||
// init arg parser
|
||||
argparse::ArgumentParser program(nsgv::gBqsrArg.PROGRAM_RECORD_ID, FASTBQSR_VERSION, argparse::default_arguments::none);
|
||||
program.add_description(
|
||||
"First pass of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
|
||||
"user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
|
||||
|
||||
// 添加bqsr和apply bqsr的共同使用的参数
|
||||
static void addCommonArgs(argparse::ArgumentParser &program, const char *outputDesc) {
|
||||
program.add_argument("--input")
|
||||
.help("BAM/SAM/CRAM file containing reads This argument must be specified at least once.")
|
||||
.metavar("<INPUT>")
|
||||
.required();
|
||||
|
||||
program.add_argument("--output")
|
||||
.help("The output recalibration table file to create.")
|
||||
.help(outputDesc)
|
||||
.metavar("<OUTPUT>")
|
||||
.required();
|
||||
|
||||
program.add_argument("--reference")
|
||||
.help("Reference sequence file.")
|
||||
.metavar("<Reference>")
|
||||
.required();
|
||||
|
||||
program.add_argument("--num-threads")
|
||||
.help("Number of threads to use.")
|
||||
.scan<'i', int>()
|
||||
|
|
@ -59,6 +46,66 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
|||
.nargs(1)
|
||||
.metavar("<NUM_THREADS>");
|
||||
|
||||
program.add_argument("--reference")
|
||||
.help("Reference sequence file.")
|
||||
.metavar("<Reference>")
|
||||
.required();
|
||||
}
|
||||
|
||||
// 添加帮助和版本信息
|
||||
static void addHelpVersion(argparse::ArgumentParser& program) {
|
||||
program.add_argument("-h", "--help")
|
||||
.action([&](const auto& /*unused*/) {
|
||||
std::cout << program.help().str();
|
||||
std::exit(0);
|
||||
})
|
||||
.default_value(false)
|
||||
.help("shows help message and exits")
|
||||
.implicit_value(true)
|
||||
.nargs(0);
|
||||
|
||||
program.add_argument("-v", "--version")
|
||||
.action([&](const auto& /*unused*/) {
|
||||
std::cout << FASTBQSR_VERSION << std::endl;
|
||||
std::exit(0);
|
||||
})
|
||||
.default_value(false)
|
||||
.help("prints version information and exits")
|
||||
.implicit_value(true)
|
||||
.nargs(0);
|
||||
}
|
||||
|
||||
// 添加运行时间信息和命令行参数列表
|
||||
static void addTimeCLI(int argc, char* argv[]) {
|
||||
nsgv::gBqsrArg.START_TIME = get_current_time_str();
|
||||
nsgv::gBqsrArg.CLI_STR = argv[0];
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
|
||||
}
|
||||
// spdlog::info("cmd: {}", nsgv::gBqsrArg.CLI_STR);
|
||||
}
|
||||
|
||||
// 解析公共参数
|
||||
static void parseCommonArgs(argparse::ArgumentParser& program) {
|
||||
nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
|
||||
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
|
||||
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
||||
if (nsgv::gBqsrArg.NUM_THREADS < 1) {
|
||||
spdlog::error("num-threads must be positive.");
|
||||
exit(1);
|
||||
}
|
||||
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
|
||||
}
|
||||
|
||||
int main_BaseRecalibrator(int argc, char *argv[]) {
|
||||
// init arg parser
|
||||
argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
|
||||
program.add_description(
|
||||
"First phase of the Base Quality Score Recalibration (BQSR) -- Generates recalibration table based on various\n"
|
||||
"user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context.)");
|
||||
|
||||
addCommonArgs(program, "The output recalibration table file to create.");
|
||||
|
||||
program.add_argument("--known-sites")
|
||||
.help(
|
||||
"One or more databases of known polymorphic sites used to exclude regions around known polymorphisms from "
|
||||
|
|
@ -66,6 +113,59 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
|||
.metavar("<KnownSites>")
|
||||
.nargs(argparse::nargs_pattern::any);
|
||||
|
||||
program.add_argument("--enable-baq")
|
||||
.help("Whether to do BAQ correction.")
|
||||
.default_value(false)
|
||||
.implicit_value(true)
|
||||
.hidden();
|
||||
|
||||
program.add_argument("--compute-indel-bqsr-tables")
|
||||
.help("Whether to compute indel BQSR tables.")
|
||||
.default_value(false)
|
||||
.implicit_value(true)
|
||||
.hidden();
|
||||
|
||||
// add help and version args
|
||||
addHelpVersion(program);
|
||||
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
parseCommonArgs(program);
|
||||
|
||||
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
||||
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
|
||||
nsgv::gBqsrArg.computeIndelBQSRTables = program.get<bool>("--compute-indel-bqsr-tables");
|
||||
// spdlog::info("known sites vcf files:");
|
||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||
// spdlog::info(" {}", ks);
|
||||
// }
|
||||
} catch (const std::exception &err) {
|
||||
spdlog::error(err.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
spdlog::info("fast base recalibration phase-1 start");
|
||||
BaseRecalibrator();
|
||||
spdlog::info("fast base recalibration phase-1 end");
|
||||
|
||||
DisplayProfilingBQSR(nsgv::gBqsrArg.NUM_THREADS);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_ApplyBQSR(int argc, char* argv[]) {
|
||||
// init arg parser
|
||||
argparse::ArgumentParser program(PROGRAM_NAME, FASTBQSR_VERSION, argparse::default_arguments::none);
|
||||
program.add_description(
|
||||
"Second phase of the Base Quality Score Recalibration (BQSR) -- Apply the bqsr table to generate recalibrated bam file.)");
|
||||
|
||||
addCommonArgs(program, "The output bam/sam file to create.");
|
||||
|
||||
program.add_argument("--bqsr")
|
||||
.help("Input recalibration table for BQSR.")
|
||||
.metavar("<BQSR>")
|
||||
.required();
|
||||
|
||||
program.add_argument("--create-index")
|
||||
.help("Whether to create an index when writing coordinate sorted BAM output.")
|
||||
.default_value(false)
|
||||
|
|
@ -78,83 +178,37 @@ int main_BaseRecalibrator(int argc, char *argv[]) {
|
|||
.nargs(1)
|
||||
.metavar("<IndexFormat>");
|
||||
|
||||
program.add_argument("--enable-baq")
|
||||
.help("Whether to do BAQ correction.")
|
||||
.default_value(false)
|
||||
.implicit_value(true)
|
||||
.hidden();
|
||||
|
||||
// add help and version args
|
||||
program.add_argument("-h", "--help")
|
||||
.action([&](const auto & /*unused*/) {
|
||||
std::cout << program.help().str();
|
||||
std::exit(0);
|
||||
})
|
||||
.default_value(false)
|
||||
.help("shows help message and exits")
|
||||
.implicit_value(true)
|
||||
.nargs(0);
|
||||
|
||||
program.add_argument("-v", "--version")
|
||||
.action([&](const auto & /*unused*/) {
|
||||
std::cout << FASTBQSR_VERSION << std::endl;
|
||||
std::exit(0);
|
||||
})
|
||||
.default_value(false)
|
||||
.help("prints version information and exits")
|
||||
.implicit_value(true)
|
||||
.nargs(0);
|
||||
|
||||
// std::cout << program << std::endl;
|
||||
|
||||
nsgv::gBqsrArg.START_TIME = get_current_time_str();
|
||||
nsgv::gBqsrArg.CLI_STR = argv[0];
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
nsgv::gBqsrArg.CLI_STR += " " + std::string(argv[i]);
|
||||
}
|
||||
|
||||
addHelpVersion(program);
|
||||
try {
|
||||
program.parse_args(argc, argv);
|
||||
nsgv::gBqsrArg.INPUT_FILE = program.get("--input");
|
||||
nsgv::gBqsrArg.OUTPUT_FILE = program.get("--output");
|
||||
nsgv::gBqsrArg.NUM_THREADS = program.get<int>("--num-threads");
|
||||
if (nsgv::gBqsrArg.NUM_THREADS < 1) {
|
||||
spdlog::error("num-threads must be positive.");
|
||||
exit(1);
|
||||
}
|
||||
parseCommonArgs(program);
|
||||
nsgv::gBqsrArg.BQSR_FILE = program.get("--bqsr");
|
||||
nsgv::gBqsrArg.CREATE_INDEX = program.get<bool>("--create-index");
|
||||
nsgv::gBqsrArg.REFERENCE_FILE = program.get<string>("--reference");
|
||||
nsgv::gBqsrArg.KNOWN_SITES_VCFS = program.get<std::vector<string>>("--known-sites");
|
||||
nsgv::gBqsrArg.enableBAQ = program.get<bool>("--enable-baq");
|
||||
// spdlog::info("known sites vcf files:");
|
||||
// for (const auto& ks : nsgv::gBqsrArg.KNOWN_SITES_VCFS) {
|
||||
// spdlog::info(" {}", ks);
|
||||
// }
|
||||
string idxFormat = program.get("--index-format");
|
||||
std::transform(idxFormat.begin(), idxFormat.end(), idxFormat.begin(), ::toupper);
|
||||
nsgv::gBqsrArg.INDEX_FORMAT = idxFormat == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
|
||||
|
||||
nsgv::gBqsrArg.INDEX_FORMAT =
|
||||
program.get("--index-format") == "BAI" ? nsbqsr::IndexFormat::BAI : nsbqsr::IndexFormat::CSI;
|
||||
|
||||
} catch (const std::exception &err) {
|
||||
} catch (const std::exception& err) {
|
||||
spdlog::error(err.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
spdlog::info("fast base recalibration phase-1 start");
|
||||
BaseRecalibrator();
|
||||
spdlog::info("fast base recalibration phase-1 end");
|
||||
spdlog::info("fast base recalibration phase-2 start");
|
||||
ApplyBQSR();
|
||||
spdlog::info("fast base recalibration phase-2 end");
|
||||
|
||||
DisplayProfiling(nsgv::gBqsrArg.NUM_THREADS);
|
||||
// DisplayProfilingApplyBQSR(nsgv::gBqsrArg.NUM_THREADS);
|
||||
DisplayProfilingApplyBQSR(1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_ApplyBQSR(int argc, char* argv[]) { return 0; }
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// init log
|
||||
spdlog::set_default_logger(spdlog::stderr_color_st("fastbqsr"));
|
||||
spdlog::cfg::load_env_levels();
|
||||
|
||||
addTimeCLI(argc, argv);
|
||||
string bqsr_prog = argv[1];
|
||||
if (bqsr_prog == "BaseRecalibrator") {
|
||||
return main_BaseRecalibrator(argc - 1, argv + 1);
|
||||
|
|
|
|||
|
|
@ -45,8 +45,7 @@ static int CalcThreadTime(uint64_t *a, int len, double *max, double *min, double
|
|||
fprintf(stderr, "time %-15s: avg %0.2lfs min %0.2lfs max %0.2lfs\n", #tpname, avgTime, minTime, maxTime); \
|
||||
}
|
||||
|
||||
int DisplayProfiling(int nthread) {
|
||||
|
||||
int DisplayProfilingBQSR(int nthread) {
|
||||
#ifdef SHOW_PERF
|
||||
fprintf(stderr, "\n");
|
||||
PRINT_GP(GP_read);
|
||||
|
|
@ -80,6 +79,27 @@ int DisplayProfiling(int nthread) {
|
|||
}
|
||||
PRINT_GP(GP_whole_process);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int DisplayProfilingApplyBQSR(int nthread) {
|
||||
#ifdef SHOW_PERF
|
||||
fprintf(stderr, "\n");
|
||||
PRINT_GP(GP_read);
|
||||
if (nthread == 1) {
|
||||
PRINT_GP(GP_covariate);
|
||||
} else {
|
||||
PRINT_TP(TP_covariate);
|
||||
PRINT_TP(TP_readgroup);
|
||||
PRINT_TP(TP_qualityscore);
|
||||
PRINT_TP(TP_context);
|
||||
PRINT_TP(TP_cycle);
|
||||
}
|
||||
PRINT_GP(GP_whole_process);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -79,7 +79,8 @@ enum {
|
|||
|
||||
uint64_t RealtimeMsec(void);
|
||||
|
||||
int DisplayProfiling(int);
|
||||
int DisplayProfilingBQSR(int);
|
||||
int DisplayProfilingApplyBQSR(int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,24 +22,46 @@ using std::string;
|
|||
using std::stringstream;
|
||||
using std::vector;
|
||||
|
||||
#define REPORT_HEADER_VERSION "#:GATKReport.v1.1:5"
|
||||
#define REPORT_HEADER_VERSION "#:GATKReport.v1.1"
|
||||
#define REPORT_HEADER_MINOR_VERSION "5"
|
||||
|
||||
struct ReportUtil {
|
||||
static string ToString(const bool val) { return val ? "true" : "false"; }
|
||||
static string ToString(const char val) {
|
||||
static inline string ToString(const bool val) { return val ? "true" : "false"; }
|
||||
static inline string ToString(const char val) {
|
||||
string s = "";
|
||||
s += val;
|
||||
// spdlog::info("char: {}, str: {}", val, s);
|
||||
return s;
|
||||
}
|
||||
static string ToString(const string& val) { return val == "" ? "null" : val; }
|
||||
static string ToString(const double val, int precise) {
|
||||
static inline string ToString(const string& val) { return val == "" ? "null" : val; }
|
||||
static inline string ToString(const double val, int precise) {
|
||||
stringstream ss;
|
||||
ss << std::fixed << std::setprecision(precise) << val;
|
||||
return ss.str();
|
||||
}
|
||||
template<typename T>
|
||||
static string ToString(const T val) { return std::to_string(val); }
|
||||
static inline string ToString(const T val) { return std::to_string(val); }
|
||||
|
||||
// 转换成bool
|
||||
static inline bool ToBool(const string &val) {
|
||||
if (val == "false" || val == "False" || val == "FALSE" || val.empty())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline string ParseString(const string &val) {
|
||||
if (val == "null")
|
||||
return "";
|
||||
return val;
|
||||
}
|
||||
|
||||
static inline int ToInt(const string& val) { return std::stoi(val); }
|
||||
|
||||
static inline double ToDouble(const string& val) { return std::stod(val); }
|
||||
|
||||
static inline int64_t ToInt64(const string& val) { return std::stoll(val); }
|
||||
|
||||
static inline uint64_t ToUint64(const string& val) { return std::stoull(val); }
|
||||
};
|
||||
|
||||
struct ReportTable {
|
||||
|
|
|
|||
|
|
@ -54,7 +54,8 @@ struct SamData {
|
|||
StableArray<uint8_t> base_quals; // 对应的质量分数
|
||||
StableArray<uint8_t> ins_quals; // insert质量分数, BI (大部分应该都没有)
|
||||
StableArray<uint8_t> del_quals; // delete质量分数, BD (大部分应该都没有)
|
||||
StableArray<Cigar> cigars;
|
||||
StableArray<Cigar> cigars; // clip 之后的cigar
|
||||
StableArray<uint8_t> recaled_quals; // 保存校正之后的质量分数
|
||||
|
||||
// 用作临时buffer
|
||||
StableArray<char> strandedClippedBases; // for context covariate
|
||||
|
|
@ -87,6 +88,20 @@ struct SamData {
|
|||
end_pos = 0;
|
||||
}
|
||||
|
||||
// 解析apply bqsr里用到的信息
|
||||
void parseForApplyBQSR(BamWrap *_bw) {
|
||||
bw = _bw;
|
||||
read_len = bw->b->core.l_qseq;
|
||||
bases.resize(read_len);
|
||||
base_quals.resize(read_len);
|
||||
uint8_t* seq = bam_get_seq(bw->b);
|
||||
uint8_t* quals = bam_get_qual(bw->b);
|
||||
for (int i = 0; i < read_len; ++i) {
|
||||
bases[i] = BaseUtils::cBaseToChar[bam_seqi(seq, i)];
|
||||
base_quals[i] = quals[i];
|
||||
}
|
||||
}
|
||||
|
||||
// 初步解析bam
|
||||
void parseBasic(BamWrap *_bw) {
|
||||
bw = _bw;
|
||||
|
|
|
|||
Loading…
Reference in New Issue