Compare commits

...

2 Commits

8 changed files with 152 additions and 1124 deletions

View File

@ -38,6 +38,7 @@ Date : 2023/10/23
#include "util/utils.h"
#include "util/math/math_utils.h"
#include "quant_info.h"
#include "util/debug.h"
using std::deque;
@ -270,7 +271,7 @@ void calculateReadBases(BamWrap* bw, SamData& ad) {
uint8_t* quals = bam_get_qual(bw->b);
for (int i = 0; i < ad.read_len; ++i) {
ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)];
ad.base_quals[i] = quals[i];
ad.base_quals[i] = quals[i + ad.left_clip];
}
}
@ -470,7 +471,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li
}
// 计算与read有交叉的已知位点信息 应该要判断一下是按照read的范围去读取vcf还是按照一个batch read的范围去读取
void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> knownSites) {
void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vector<bool> &knownSites) {
int tid = bw->contig_id();
uint64_t startPos = bw->start_pos(); // 闭区间
uint64_t endPos = bw->end_pos(); // 闭区间
@ -519,7 +520,7 @@ void calculateKnownSites(BamWrap* bw, SamData& ad, vector<VCFParser> &vcfs, vect
int64_t varStart = BamWrap::bam_global_pos(tid, pos);
Interval varIntv(varStart, varStart + ref.size() - 1);
if (readIntv.overlaps(varIntv)) {
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size()));
vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间
//spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size());
}
get_line_from_buf(buf, flen, &cur, &line);
@ -619,7 +620,8 @@ void updateRecalTablesForRead(ReadRecalInfo &info) {
int readLength = read.read_len;
for (int offset = 0; offset < readLength; ++offset) {
if (!info.skips[offset]) { // 不跳过当前位置
if (!info.skips[offset]) {
//if (true){ // 不跳过当前位置
for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) {
// 获取四个值readgroup / qualityscore / context / cycle
EventTypeValue& event = nsgv::gEventTypes[idx];
@ -693,6 +695,7 @@ void roundTableValues(RecalTables& rt) {
// 串行bqsr
int SerialBQSR() {
open_debug_files();
int round = 0;
BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO);
// inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM);
@ -754,6 +757,7 @@ int SerialBQSR() {
// spdlog::info("bam idx: {}", i);
BamWrap* bw = bams[i];
SamData ad;
ad.rid = i;
ad.bw = bw;
ad.read_len = BamWrap::BamEffectiveLength(bw->b);
ad.cigar_end = bw->b->core.n_cigar;
@ -788,6 +792,18 @@ int SerialBQSR() {
vector<int> isIns(ad.read_len, 0); // 该位置是否是插入位置0不是1是
vector<int> isDel(ad.read_len, 0); // 该位置是否是删除位置0不是1是
const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel);
/*fprintf(gf[0], "%d\t", ad.read_len);
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]);
fprintf(gf[0], "\n");
fprintf(gf[1], "%d\t", ad.read_len);
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]);
fprintf(gf[1], "\n");
fprintf(gf[2], "%d\t", ad.read_len);
for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]);
fprintf(gf[2], "\n");
*/
// spdlog::info("nErrors: {}", nErrors);
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); }
@ -811,8 +827,19 @@ int SerialBQSR() {
PROF_START(covariate);
CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true);
PROF_END(gprof[GP_covariate], covariate);
test = readCovariates[1][0][0] + readCovariates[2][1][3];
int end_pos = bw->contig_end_pos();
// fprintf(gf[3], "%ld %ld %d %ld\n", ad.rid, readCovariates.size(), ad.read_len, readCovariates[0][0].size());
// for (auto &arr1 : readCovariates) {
// for (size_t si = 0; si < ad.read_len; ++si) {
// for (auto &val : arr1[si]) {
// fprintf(gf[3], "%d ", val);
// }
// }
// }
// fprintf(gf[3], "\n");
//test = readCovariates[1][0][0] + readCovariates[2][1][3];
//int end_pos = bw->contig_end_pos();
//spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos,
// bw->GetReadNegativeStrandFlag() ? "reverse" : "forward");
// for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); }
@ -825,6 +852,9 @@ int SerialBQSR() {
ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN;
}
PROF_END(gprof[GP_read_vcf], known_sites);
// fprintf(gf[0], "%d\t", ad.read_len);
// for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0);
// fprintf(gf[0], "\n");
// 10. 根据BAQ进一步处理snpindel得到处理后的数据
vector<double> snpErrors, insErrors, delErrors;
@ -911,6 +941,8 @@ int SerialBQSR() {
}
spdlog::info("read count: {}", readNumSum);
close_debug_files();
return 0;
}

View File

@ -98,6 +98,51 @@ static char SimpleComplement(const char base) {
}
}
static void ClipLowQualEndsWithN(string& bases, const FastArray<uint8_t> &quals, uint8_t lowQTail, bool isNegativeStrand) {
// 处理左边
int left = 0;
int readLen = bases.size();
if (isNegativeStrand) {
for (; left < readLen; ++left) {
if (quals[readLen - 1 - left] <= lowQTail)
bases[left] = 'N';
else
break;
}
} else {
for (; left < readLen; ++left) {
if (quals[left] <= lowQTail)
bases[left] = 'N';
else
break;
}
}
if (left == readLen) {
bases.clear();
return;
}
// 处理右边
int right = (int)bases.size() - 1;
if (isNegativeStrand) {
for (; right >= 0; --right) {
if (quals[readLen - 1 - right] <= lowQTail)
bases[right] = 'N';
else
break;
}
} else {
for (; right >= 0; --right) {
if (quals[right] <= lowQTail)
bases[right] = 'N';
else
break;
}
}
if (right < left)
bases.clear();
}
// 获取去除低质量分数碱基之后的read碱基序列将低质量分数的碱基变成N
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
@ -106,28 +151,7 @@ void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string&
for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
}
// 处理左边
int left = 0;
for (; left < ad.read_len; ++left) {
if (quals[left] <= lowQTail)
clippedBases[left] = 'N';
else
break;
}
if (left == ad.read_len) {
clippedBases.clear();
return;
}
// 处理右边
int right = ad.read_len - 1;
for (; right >= 0; --right) {
if (quals[right] <= lowQTail)
clippedBases[right] = 'N';
else
break;
}
if (right < left)
clippedBases.clear();
ClipLowQualEndsWithN(clippedBases, ad.base_quals, lowQTail, bw->GetReadNegativeStrandFlag());
}
/**
@ -224,7 +248,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header,
// store the original bases and then write Ns over low quality ones
string strandedClippedBases(ad.bases);
GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail);
GetStrandedClippedBytes(bw, ad, strandedClippedBases, 30); // 注意这里的lowQualTail数值
// spdlog::info("bases: {}", strandedClippedBases);
vector<int> nBasePairContextAtEachCycle;
GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle);

View File

@ -225,6 +225,8 @@ struct QualityUtils {
static uint8_t errorProbToQual(const double errorRate, const uint8_t maxQual) {
// Utils.validateArg(MathUtils.isValidProbability(errorRate), ()->"errorRate must be good probability but got " + errorRate);
const double d = std::round(-10.0 * std::log10(errorRate));
if (d > maxQual) // 注意当d超过int表示的最大值时(如inf)转变成int会变成负数
return maxQual;
return boundQual((int)d, maxQual);
}

View File

@ -113,13 +113,13 @@ struct RecalUtils {
table.addColumn({"Observations", "%d"});
table.addColumn({"Errors", "%.2f"});
spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches,
r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality);
//spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches,
// r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality);
_Foreach2DK(r.readGroupTable, datum, {
RecalDatum &dat = const_cast<RecalDatum&>(datum);
spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches());
spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations());
// spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches());
// spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations());
if (dat.getNumObservations() > 0) {
table.addRowData({
ReadGroupCovariate::IdToRg[k1],

View File

@ -31,7 +31,7 @@ struct Cigar {
};
struct ReadIdxCigar {
int readIdx = 0; // 在read序列中的位置
int readIdx = -1; // 在read序列中的位置
char cigarOp = '0'; // 当前位置对应的cigar
};
@ -57,6 +57,7 @@ struct FastArray {
}
}
inline T& operator[](size_t pos) { return arr[pos]; }
inline const T& operator[](size_t pos) const { return arr[pos]; }
struct iterator {
typename std::vector<T>::iterator it;
iterator(typename std::vector<T>::iterator _it) : it(_it) {}
@ -74,6 +75,7 @@ struct FastArray {
// 对原始bam数据的补充比如对两端进行hardclip等
class BamWrap;
struct SamData {
int64_t rid = 0; // for debug
int read_len = 0; // read长度各种clip之后的长度
int cigar_start = 0; // cigar起始位置闭区间
int cigar_end = 0; // cigar结束位置开区间

31
src/util/debug.cpp 100644
View File

@ -0,0 +1,31 @@
/*********************************************************************************************
Description: data and files for debugging
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/09
***********************************************************************************************/
#include "debug.h"
#include <stdio.h>
FILE* gf[4] = {0};
int open_debug_files() {
char fn[1024] = {0};
int i = 0;
for (; i < 4; ++i) {
sprintf(fn, "./test/fp%d.txt", i);
gf[i] = fopen(fn, "w");
}
return 0;
}
int close_debug_files() {
int i = 0;
for (; i < 4; ++i) {
if (gf[i] != 0)
fclose(gf[i]);
}
return 0;
}

27
src/util/debug.h 100644
View File

@ -0,0 +1,27 @@
/*********************************************************************************************
Description: data and files for debugging
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/09
***********************************************************************************************/
#include <stdio.h>
#include <stdint.h>
////////////////// for debug and test //////////////////////////
#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件并记录debug信息
// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度以及最终扩展的长度
// #define GET_FULL_MATCH_READ // 获取完全匹配的reads
// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的querytarget和其他用于计算的数据
// #define GET_KSW_ALIGN_SEQ
// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
////////////////////////////////////////////////////////////////
extern FILE *gf[4];
int open_debug_files();
int close_debug_files();

File diff suppressed because it is too large Load Diff