diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index 90e9338..eabf1f8 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -38,6 +38,7 @@ Date : 2023/10/23 #include "util/utils.h" #include "util/math/math_utils.h" #include "quant_info.h" +#include "util/debug.h" using std::deque; @@ -270,7 +271,7 @@ void calculateReadBases(BamWrap* bw, SamData& ad) { uint8_t* quals = bam_get_qual(bw->b); for (int i = 0; i < ad.read_len; ++i) { ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)]; - ad.base_quals[i] = quals[i]; + ad.base_quals[i] = quals[i + ad.left_clip]; } } @@ -470,7 +471,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li } // 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 -void calculateKnownSites(BamWrap* bw, SamData& ad, vector &vcfs, vector knownSites) { +void calculateKnownSites(BamWrap* bw, SamData& ad, vector &vcfs, vector &knownSites) { int tid = bw->contig_id(); uint64_t startPos = bw->start_pos(); // 闭区间 uint64_t endPos = bw->end_pos(); // 闭区间 @@ -519,7 +520,7 @@ void calculateKnownSites(BamWrap* bw, SamData& ad, vector &vcfs, vect int64_t varStart = BamWrap::bam_global_pos(tid, pos); Interval varIntv(varStart, varStart + ref.size() - 1); if (readIntv.overlaps(varIntv)) { - vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size())); + vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间 //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size()); } get_line_from_buf(buf, flen, &cur, &line); @@ -619,7 +620,8 @@ void updateRecalTablesForRead(ReadRecalInfo &info) { int readLength = read.read_len; for (int offset = 0; offset < readLength; ++offset) { - if (!info.skips[offset]) { // 不跳过当前位置 + if (!info.skips[offset]) { + //if (true){ // 不跳过当前位置 for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) { // 获取四个值,readgroup / qualityscore / context / cycle EventTypeValue& event = nsgv::gEventTypes[idx]; @@ -693,6 +695,7 @@ void roundTableValues(RecalTables& rt) { // 串行bqsr int SerialBQSR() { + open_debug_files(); int round = 0; BamBufType inBamBuf(nsgv::gBqsrArg.DUPLEX_IO); // inBamBuf.Init(nsgv::gInBamFp, nsgv::gInBamHeader, nsgv::gBqsrArg.MAX_MEM); @@ -754,6 +757,7 @@ int SerialBQSR() { // spdlog::info("bam idx: {}", i); BamWrap* bw = bams[i]; SamData ad; + ad.rid = i; ad.bw = bw; ad.read_len = BamWrap::BamEffectiveLength(bw->b); ad.cigar_end = bw->b->core.n_cigar; @@ -788,6 +792,18 @@ int SerialBQSR() { vector isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是 vector isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是 const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel); + + /*fprintf(gf[0], "%d\t", ad.read_len); + for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]); + fprintf(gf[0], "\n"); + fprintf(gf[1], "%d\t", ad.read_len); + for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]); + fprintf(gf[1], "\n"); + fprintf(gf[2], "%d\t", ad.read_len); + for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]); + fprintf(gf[2], "\n"); + */ + // spdlog::info("nErrors: {}", nErrors); // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); } @@ -811,8 +827,19 @@ int SerialBQSR() { PROF_START(covariate); CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true); PROF_END(gprof[GP_covariate], covariate); - test = readCovariates[1][0][0] + readCovariates[2][1][3]; - int end_pos = bw->contig_end_pos(); + + // fprintf(gf[3], "%ld %ld %d %ld\n", ad.rid, readCovariates.size(), ad.read_len, readCovariates[0][0].size()); + // for (auto &arr1 : readCovariates) { + // for (size_t si = 0; si < ad.read_len; ++si) { + // for (auto &val : arr1[si]) { + // fprintf(gf[3], "%d ", val); + // } + // } + // } + // fprintf(gf[3], "\n"); + + //test = readCovariates[1][0][0] + readCovariates[2][1][3]; + //int end_pos = bw->contig_end_pos(); //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, // bw->GetReadNegativeStrandFlag() ? "reverse" : "forward"); // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); } @@ -825,6 +852,9 @@ int SerialBQSR() { ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; } PROF_END(gprof[GP_read_vcf], known_sites); + // fprintf(gf[0], "%d\t", ad.read_len); + // for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0); + // fprintf(gf[0], "\n"); // 10. 根据BAQ进一步处理snp,indel,得到处理后的数据 vector snpErrors, insErrors, delErrors; @@ -911,6 +941,8 @@ int SerialBQSR() { } spdlog::info("read count: {}", readNumSum); + close_debug_files(); + return 0; } diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index 2d994b3..429973d 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -98,6 +98,51 @@ static char SimpleComplement(const char base) { } } +static void ClipLowQualEndsWithN(string& bases, const FastArray &quals, uint8_t lowQTail, bool isNegativeStrand) { + // 处理左边 + int left = 0; + int readLen = bases.size(); + if (isNegativeStrand) { + for (; left < readLen; ++left) { + if (quals[readLen - 1 - left] <= lowQTail) + bases[left] = 'N'; + else + break; + } + } else { + for (; left < readLen; ++left) { + if (quals[left] <= lowQTail) + bases[left] = 'N'; + else + break; + } + } + if (left == readLen) { + bases.clear(); + return; + } + // 处理右边 + int right = (int)bases.size() - 1; + if (isNegativeStrand) { + for (; right >= 0; --right) { + if (quals[readLen - 1 - right] <= lowQTail) + bases[right] = 'N'; + else + break; + } + } else { + for (; right >= 0; --right) { + if (quals[right] <= lowQTail) + bases[right] = 'N'; + else + break; + } + } + if (right < left) + bases.clear(); + +} + // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) { uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip; @@ -106,28 +151,7 @@ void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]); } - // 处理左边 - int left = 0; - for (; left < ad.read_len; ++left) { - if (quals[left] <= lowQTail) - clippedBases[left] = 'N'; - else - break; - } - if (left == ad.read_len) { - clippedBases.clear(); - return; - } - // 处理右边 - int right = ad.read_len - 1; - for (; right >= 0; --right) { - if (quals[right] <= lowQTail) - clippedBases[right] = 'N'; - else - break; - } - if (right < left) - clippedBases.clear(); + ClipLowQualEndsWithN(clippedBases, ad.base_quals, lowQTail, bw->GetReadNegativeStrandFlag()); } /** @@ -224,7 +248,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, // store the original bases and then write Ns over low quality ones string strandedClippedBases(ad.bases); - GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail); + GetStrandedClippedBytes(bw, ad, strandedClippedBases, 30); // 注意这里的lowQualTail数值 // spdlog::info("bases: {}", strandedClippedBases); vector nBasePairContextAtEachCycle; GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle); diff --git a/src/bqsr/qual_utils.h b/src/bqsr/qual_utils.h index 35d4c05..5272cc7 100644 --- a/src/bqsr/qual_utils.h +++ b/src/bqsr/qual_utils.h @@ -225,6 +225,8 @@ struct QualityUtils { static uint8_t errorProbToQual(const double errorRate, const uint8_t maxQual) { // Utils.validateArg(MathUtils.isValidProbability(errorRate), ()->"errorRate must be good probability but got " + errorRate); const double d = std::round(-10.0 * std::log10(errorRate)); + if (d > maxQual) // 注意当d超过int表示的最大值时(如inf),转变成int会变成负数 + return maxQual; return boundQual((int)d, maxQual); } diff --git a/src/bqsr/recal_utils.h b/src/bqsr/recal_utils.h index 072eeff..9c8d82a 100644 --- a/src/bqsr/recal_utils.h +++ b/src/bqsr/recal_utils.h @@ -113,13 +113,13 @@ struct RecalUtils { table.addColumn({"Observations", "%d"}); table.addColumn({"Errors", "%.2f"}); - spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches, - r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality); + //spdlog::info("rg0: {}, {}, {}, {}", r.readGroupTable[0][0].numObservations, r.readGroupTable[0][0].numMismatches, + // r.readGroupTable[0][0].reportedQuality, r.readGroupTable[0][0].empiricalQuality); _Foreach2DK(r.readGroupTable, datum, { RecalDatum &dat = const_cast(datum); - spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches()); - spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations()); + // spdlog::info("errors: {}, {}", datum.numMismatches, dat.getNumMismatches()); + // spdlog::info("obs: {}, {}", datum.numObservations, dat.getNumObservations()); if (dat.getNumObservations() > 0) { table.addRowData({ ReadGroupCovariate::IdToRg[k1], diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index 611b67d..8424bea 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -31,7 +31,7 @@ struct Cigar { }; struct ReadIdxCigar { - int readIdx = 0; // 在read序列中的位置 + int readIdx = -1; // 在read序列中的位置 char cigarOp = '0'; // 当前位置对应的cigar }; @@ -57,6 +57,7 @@ struct FastArray { } } inline T& operator[](size_t pos) { return arr[pos]; } + inline const T& operator[](size_t pos) const { return arr[pos]; } struct iterator { typename std::vector::iterator it; iterator(typename std::vector::iterator _it) : it(_it) {} @@ -74,6 +75,7 @@ struct FastArray { // 对原始bam数据的补充,比如对两端进行hardclip等 class BamWrap; struct SamData { + int64_t rid = 0; // for debug int read_len = 0; // read长度,各种clip之后的长度 int cigar_start = 0; // cigar起始位置,闭区间 int cigar_end = 0; // cigar结束位置,开区间 diff --git a/src/util/debug.cpp b/src/util/debug.cpp new file mode 100644 index 0000000..c7b6e8b --- /dev/null +++ b/src/util/debug.cpp @@ -0,0 +1,31 @@ +/********************************************************************************************* + Description: data and files for debugging + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2024/04/09 +***********************************************************************************************/ +#include "debug.h" + +#include + +FILE* gf[4] = {0}; + +int open_debug_files() { + char fn[1024] = {0}; + int i = 0; + for (; i < 4; ++i) { + sprintf(fn, "./test/fp%d.txt", i); + gf[i] = fopen(fn, "w"); + } + return 0; +} + +int close_debug_files() { + int i = 0; + for (; i < 4; ++i) { + if (gf[i] != 0) + fclose(gf[i]); + } + return 0; +} \ No newline at end of file diff --git a/src/util/debug.h b/src/util/debug.h new file mode 100644 index 0000000..106be74 --- /dev/null +++ b/src/util/debug.h @@ -0,0 +1,27 @@ +/********************************************************************************************* + Description: data and files for debugging + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2024/04/09 +***********************************************************************************************/ +#include +#include + +////////////////// for debug and test ////////////////////////// + +#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件,并记录debug信息 +// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度 +// #define GET_FULL_MATCH_READ // 获取完全匹配的reads +// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量 +// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据 +// #define GET_KSW_ALIGN_SEQ +// #define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里 + +//////////////////////////////////////////////////////////////// + +extern FILE *gf[4]; + +int open_debug_files(); + +int close_debug_files(); \ No newline at end of file