diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index 66ff367..f7e765e 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -249,7 +249,7 @@ int SerialBQSR(AuxVar &aux) { // 9. 计算这条read需要跳过的位置 PROF_START(read_vcf); - RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips); + RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, 0); for (int ii = 0; ii < sd.read_len; ++ii) { skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; @@ -317,7 +317,9 @@ static void thread_worker(void* data, long idx, int tid, int steal) { StableArray&snpErrors = aux.snpErrors, &insErrors = aux.insErrors, &delErrors = aux.delErrors; StableArray& skips = aux.skips; // 该位置是否是已知位点 auto &bams = *aux.bamArr; - if (steal) for (auto& vcf : aux.vcfArr) vcf.knownSites.clear(); + if (steal) + for (auto& vcf : aux.vcfArr) vcf.knownSites.clear(); + int f = tid * 4; #if 1 int startIdx = idx * aux.BAM_BLOCK_NUM; int stopIdx = std::min((size_t)(idx + 1) * aux.BAM_BLOCK_NUM, bams.size()); @@ -326,6 +328,7 @@ static void thread_worker(void* data, long idx, int tid, int steal) { int startIdx = idx * blockReadNums; int stopIdx = std::min((size_t)(idx + 1) * blockReadNums, bams.size()); #endif + // spdlog::info("tid {}, index {}, steal {}", tid, idx, steal); aux.threadProcessedReads += stopIdx - startIdx; for (int i = startIdx; i < stopIdx; ++i) { // spdlog::info("Thread {} processing read idx: {}", tid, i); @@ -344,7 +347,17 @@ static void thread_worker(void* data, long idx, int tid, int steal) { // PROF_END(gprof[GP_clip_read], clip_read); const int nErrors = RecalFuncs::calculateIsSNPOrIndel(aux, sd, isSNP, isIns, isDel); - +#if 0 + fprintf(gf[f + 0], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[f + 0], "%d ", isSNP[ii]); + fprintf(gf[f + 0], "\n"); + fprintf(gf[f + 1], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[f + 1], "%d ", isIns[ii]); + fprintf(gf[f + 1], "\n"); + fprintf(gf[f + 2], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[f + 2], "%d ", isDel[ii]); + fprintf(gf[f + 2], "\n"); +#endif bool baqCalculated = false; if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) { baqCalculated = BAQ::flatBAQArray(sd, baqArray); @@ -358,12 +371,19 @@ static void thread_worker(void* data, long idx, int tid, int steal) { // PROF_END(gprof[GP_covariate], covariate); // PROF_START(read_vcf); - RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips); + RecalFuncs::calculateKnownSites(sd, aux.vcfArr, aux.header, skips, tid); for (int ii = 0; ii < sd.read_len; ++ii) { skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; } // PROF_GP_END(read_vcf); + +#if 0 + fprintf(gf[f + 3], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[f + 3], "%d ", skips[ii] ? 1 : 0); + fprintf(gf[f + 3], "\n"); +#endif + #if 0 int fidx = 0 + 2 * tid; //if (sd.rid % 2 == 0) fidx = 0 + 2 * tid; @@ -446,7 +466,7 @@ int ParallelBQSR(vector& auxArr) { collapseQualityScoreTableToReadGroupTable(recalTables.readGroupTable, recalTables.qualityScoreTable); roundTableValues(recalTables); - printRecalTables(recalTables); + // printRecalTables(recalTables); // 量化质量分数 QuantizationInfo quantInfo(recalTables, nsgv::gBqsrArg.QUANTIZING_LEVELS); diff --git a/src/bqsr/recal_funcs.h b/src/bqsr/recal_funcs.h index 41a53f1..ae0c640 100644 --- a/src/bqsr/recal_funcs.h +++ b/src/bqsr/recal_funcs.h @@ -15,6 +15,7 @@ #include "util/interval.h" #include "util/vcf_parser.h" #include "util/profiling.h" +#include "util/debug.h" struct RecalFuncs { static constexpr int MAX_SITES_INTERVAL = 100000; @@ -104,32 +105,39 @@ struct RecalFuncs { } // 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 - static void calculateKnownSites(SamData& sd, vector& vcfs, sam_hdr_t* samHdr, StableArray& knownSites) { + static void calculateKnownSites(SamData& sd, vector& vcfs, sam_hdr_t* samHdr, StableArray& knownSites, int thid) { BamWrap* bw = sd.bw; int tid = bw->contig_id(); - int64_t startPos = bw->start_pos(); // 闭区间 + int64_t startPos = bw->start_pos(); // 闭区间,使用clip之前的read匹配的范围 int64_t endPos = bw->end_pos(); // 闭区间 knownSites.resize_fill(sd.read_len, 0); - + // update vcfs + // int idx = 0; for (auto& vcf : vcfs) { - if (!vcf.knownSites.empty() && vcf.knownSites.back().right < startPos) - vcf.knownSites.clear(); - // 清理旧的interval - while (!vcf.knownSites.empty()) { - auto& intv = vcf.knownSites.front(); - if (intv.right < startPos) - vcf.knownSites.pop_front(); - else - break; - } - if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) // 此时vcf的区域包含bam,不需要读取 + // 为啥多线程环境会出现,deque的front和[0]不一样?好像是调试的时候的问题,实际运行时没再出现 + // if (vcf.knownSites.front().left != vcf.knownSites[0].left || vcf.knownSites.front().right != vcf.knownSites[0].right) + // spdlog::error("front is different with [0]: {} - {}", vcf.knownSites.front().left, vcf.knownSites[0].left); + + if (!vcf.knownSites.empty() && vcf.knownSites.back().left > endPos) {// 此时vcf的区域包含bam,不需要读取 + // 清理旧的interval,只有此时才有清理的必要 + while (!vcf.knownSites.empty()) { + auto& intv = vcf.knownSites.front(); + if (intv.right < startPos) + vcf.knownSites.pop_front(); + else + break; + } continue; + } + vcf.knownSites.clear(); // 清空,因为后面会读入覆盖bam的所有vcf位点 // 读取新的interval int64_t fpos, flen; endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos); Interval readIntv(startPos, endPos); vcf.index.SearchInterval(startPos, endPos, &fpos, &flen); + // fprintf(gf[thid * 2 + idx], "%s %d %ld %ld %ld\n", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid, fpos, flen); + if (flen > 0) { vcf.inStm.seekg(fpos, ios::beg); if (flen > vcf.bufLen) { @@ -149,22 +157,30 @@ struct RecalFuncs { string id, ref; ss_line >> stid >> pos >> id >> ref; tid = sam_hdr_name2tid(samHdr, stid.c_str()); - int64_t varStart = BamWrap::bam_global_pos(tid, pos); + int64_t varStart = BamWrap::bam_global_pos(tid, pos - 1); // vcf中的pos是1-based,程序中计算是0-based Interval varIntv(varStart, varStart + ref.size() - 1); if (varIntv.overlaps(readIntv)) { - vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size() - 1)); // 闭区间 + vcf.knownSites.push_back(varIntv); // 闭区间 } get_line_from_buf(buf, flen, &cur, &line); } } } + // fprintf(gf[0], "%s %d %ld ", bam_get_qname(sd.bw->b), sd.bw->b->core.flag, sd.rid); for (auto& vcf : vcfs) { for (auto& intv : vcf.knownSites) { // knownSite is outside clipping window for the read, ignore +#if 1 if (intv.right < sd.softStart()) continue; if (intv.left > sd.softEnd()) break; +#else + if (intv.right < sd.softStart() || intv.left > sd.softEnd()) + continue; +#endif + //fprintf(gf[0], "%ld %ld ", intv.left, intv.right); + // 计算对应的位点 ReadIdxCigar rcStart = sd.getReadIndexForReferenceCoordinate(intv.contigLeft()); int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx; @@ -180,7 +196,9 @@ struct RecalFuncs { knownSites[i] = true; } } + //idx += 1; } + //fprintf(gf[0], "\n"); } // 应该是计算一段数据的平均值 diff --git a/src/util/debug.h b/src/util/debug.h index 5f875fe..5308f2f 100644 --- a/src/util/debug.h +++ b/src/util/debug.h @@ -20,7 +20,7 @@ //////////////////////////////////////////////////////////////// -#define DEBUG_FILE_NUM 5 +#define DEBUG_FILE_NUM 8 extern FILE* gf[DEBUG_FILE_NUM]; diff --git a/src/util/linear_index.cpp b/src/util/linear_index.cpp index c2924c4..d12a273 100644 --- a/src/util/linear_index.cpp +++ b/src/util/linear_index.cpp @@ -30,7 +30,11 @@ void LinearIndex::SearchInterval(int64_t start, int64_t end, int64_t* file_pos, } int sb_idx, eb_idx; if (stid == stid_origin) { - sb_idx = spos / idx_[stid].binWidth; + // 下面修改的这两行代码与原版不同的,是查找所有包含[spos, epos]的变异信息 + spos -= idx_[stid].longestFeature; // 保证起始位置小于spos,但是结束位置大于spos的变异包含进来 + spos = spos >= 0 ? spos : 0; + sb_idx = spos / idx_[stid].binWidth; // 找到spos对应的桶 + sb_idx = sb_idx >= 0 ? sb_idx : 0; if (sb_idx >= idx_[stid].size()) { // 开始区域没有对应的block sb_idx = idx_[stid].size() - 1; Block& sb = idx_[stid][sb_idx];