From 25f079b9365e38ef82d93a007f4d106c9eb63be3 Mon Sep 17 00:00:00 2001 From: zzh Date: Wed, 24 Dec 2025 12:47:26 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=B2=E8=A1=8C=E7=89=88=E6=9C=AC=E8=BF=98?= =?UTF-8?q?=E5=B7=AE=E6=9C=80=E5=90=8E=E4=B8=80=E6=AD=A5=EF=BC=8C=E5=B0=86?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=E5=90=88=E5=B9=B6=E5=88=B0=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E6=B1=87=E6=80=BB=E8=A1=A8=E4=B8=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bqsr/baq.cpp | 2 +- src/bqsr/baq.h | 2 +- src/bqsr/bqsr_entry.cpp | 172 ++++++++++++++++++++++++++++++++++++--- src/bqsr/bqsr_pipeline.h | 2 +- src/bqsr/covariate.cpp | 22 ++--- src/bqsr/covariate.h | 14 ++-- src/util/bam_wrap.h | 83 ++++++++++++++++++- src/util/profiling.cpp | 16 ++-- src/util/profiling.h | 3 + 9 files changed, 274 insertions(+), 42 deletions(-) diff --git a/src/bqsr/baq.cpp b/src/bqsr/baq.cpp index d2339a1..a7c7488 100644 --- a/src/bqsr/baq.cpp +++ b/src/bqsr/baq.cpp @@ -6,7 +6,7 @@ vector BAQ::qual2prob(256); // 质量分数转化概率 vector>> BAQ::EPSILONS(256, vector>(256, vector(SAM_MAX_PHRED_SCORE + 1))); // [ref][read][qual] // 计算baq数组,返回成功与否 -bool BAQ::calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector& baqArray) { +bool BAQ::calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector& baqArray) { // 检测ref是否覆盖了read if (ref.size() < refOffset + ad.read_len) { spdlog::error("BAQ calculation error: reference sequence length {} is less than required length {} (refOffset {} + read_len {})", diff --git a/src/bqsr/baq.h b/src/bqsr/baq.h index f7c4b2e..247217a 100644 --- a/src/bqsr/baq.h +++ b/src/bqsr/baq.h @@ -84,5 +84,5 @@ struct BAQ { double calcEpsilon(uint8_t ref, uint8_t read, uint8_t qualB) { return EPSILONS[ref][read][qualB]; } // 计算baq数组,返回成功与否 - bool calcBAQFromHMM(BamWrap* bw, ReadAdditionData& ad, string ref, int refOffset, vector& baqArray); + bool calcBAQFromHMM(BamWrap* bw, SamData& ad, string ref, int refOffset, vector& baqArray); }; \ No newline at end of file diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index 1f9e4d5..d39e5fa 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -36,6 +36,9 @@ Date : 2023/10/23 using std::deque; #define BAM_BLOCK_SIZE 16L * 1024 * 1024 +#define MAX_SITES_INTERVAL 100000 + +const uint8_t NO_BAQ_UNCERTAINTY = (uint8_t)'@'; const char cBaseToChar[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; @@ -210,7 +213,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar } // 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量 -void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAdditionData &ad) { +void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) { int start, stop; // Determine the read coordinate to start and stop hard clipping if (refStart < 0) { @@ -235,7 +238,7 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, ReadAddi } // 计算切掉adapter之后,ref相对原始ref的偏移量 -void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) { +void calculateRefOffset(BamWrap *bw, SamData &ad) { const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; int i = 0; @@ -253,16 +256,68 @@ void calculateRefOffset(BamWrap *bw, ReadAdditionData &ad) { } // 计算clip处理之后,剩余的碱基 -void calculateReadBases(BamWrap* bw, ReadAdditionData& ad) { +void calculateReadBases(BamWrap* bw, SamData& ad) { ad.bases.resize(ad.read_len); + ad.quals.resize(ad.read_len); uint8_t* seq = bam_get_seq(bw->b); + uint8_t* quals = bam_get_qual(bw->b); for (int i = 0; i < ad.read_len; ++i) { ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)]; + ad.quals[i] = quals[i]; } } +// 计算read两端clip之后的softstart和softend +void calculateSoftStartEnd(BamWrap* bw, SamData& ad) { + int64_t softStart = bw->b->core.pos + ad.ref_offset; + int64_t softEnd = softStart - 1; // 闭区间 + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + int cigar_start = ad.cigar_start; + int cigar_end = ad.cigar_end; + bool rightTail = false; + for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == ad.cigar_start) len -= ad.first_cigar_clip; + if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip; + // if (c == 'S' || c == 'I') + if (c == 'S') + softStart -= len; + else if (c != 'H') + rightTail = true; + if (rightTail) { + if (consumeRefBases(c) || c == 'S') + softEnd += len; + } + } + ad.softStart() = softStart; + ad.softEnd() = softEnd; +} + +// 计算clip之后的cigar +void calculateCigar(BamWrap* bw, SamData& ad) { + ad.cigars.clear(); + const uint32_t* cigar = bam_get_cigar(bw->b); + const bam1_core_t& bc = bw->b->core; + int cigar_start = ad.cigar_start; + int cigar_end = ad.cigar_end; + for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { + const char c = bam_cigar_opchr(cigar[i]); + int len = bam_cigar_oplen(cigar[i]); + if (i == ad.cigar_start) + len -= ad.first_cigar_clip; + if (i == ad.cigar_end - 1) + len -= ad.last_cigar_clip; + ad.cigars.push_back({c, len}); + } + //for(auto &cigar : ad.cigars) { + // spdlog::info("op: {}, len: {}", cigar.op, cigar.len); + //} +} + // 计算read两端softclip的碱基数量,可能会修改ad里的clip值 -void calculateSoftClip(BamWrap *bw, ReadAdditionData &ad) { +void calculateSoftClip(BamWrap *bw, SamData &ad) { const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; int readIndex = ad.left_clip; @@ -324,10 +379,12 @@ inline void updateIndel(vector &isIndel, int index) { } // 计算该read的每个碱基位置是否是SNP或Indel -int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector &isSNP, vector &isIns, vector &isDel) { +int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &isSNP, vector &isIns, vector &isDel) { // 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤 Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间 + PROF_START(ref); read_ref_base(aux, interval.left, interval); + PROF_END(gprof[GP_read_ref], ref); string refBases(aux.ref_seq); // spdlog::info("ref: {}, {}, {} - {}", aux.ref_seq, aux.ref_len, bw->contig_pos(), bw->contig_end_pos()); @@ -381,13 +438,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, ReadAdditionData &ad, vector } // 简单计算baq数组,就是全部赋值为'@' (64) -bool flatBAQArray(BamWrap* bw, ReadAdditionData& ad, vector& baqArray) { +bool flatBAQArray(BamWrap* bw, SamData& ad, vector& baqArray) { baqArray.resize(ad.read_len, (int)'@'); return true; } // 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了 -bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, ReadAdditionData& ad, vector& baqArray) { +bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector& baqArray) { baqArray.resize(ad.read_len, 0); return true; } @@ -404,7 +461,7 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li } // 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 -void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector &vcfs) { +void calculateKnownSites(BamWrap* bw, SamData& ad, vector &vcfs, vector knownSites) { int tid = bw->contig_id(); uint64_t startPos = bw->start_pos(); // 闭区间 uint64_t endPos = bw->end_pos(); // 闭区间 @@ -427,6 +484,8 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector &v //spdlog::info("before intervals : {}", vcf.knownSites.size()); // 读取新的interval int64_t fpos, flen; + endPos = std::max(startPos + MAX_SITES_INTERVAL, endPos); + Interval readIntv(startPos, endPos); vcf.index.SearchInterval(startPos, endPos, &fpos, &flen); //spdlog::info("file index: {}, {}", fpos, flen); if (flen > 0) { @@ -448,7 +507,9 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector &v string id, ref; ss_line >> stid >> pos >> id >> ref; tid = sam_hdr_name2tid(nsgv::gInBamHeader, stid.c_str()); - if (tid >= 0 && pos > 0) { + int64_t varStart = BamWrap::bam_global_pos(tid, pos); + Interval varIntv(varStart, varStart + ref.size() - 1); + if (readIntv.overlaps(varIntv)) { vcf.knownSites.push_back(Interval(tid, pos - 1, pos - 1 + ref.size())); //spdlog::info("intv-1 {}, {}, {}", tid, pos, ref.size()); } @@ -461,6 +522,74 @@ void calculateKnownSites(BamWrap* bw, ReadAdditionData& ad, vector &v //} } //exit(0); + knownSites.resize(ad.read_len); + endPos = bw->end_pos(); + + for(auto &vcf : vcfs) { + for (auto &intv : vcf.knownSites) { + // knownSite is outside clipping window for the read, ignore + if (intv.right < ad.softStart()) + continue; + if (intv.left > ad.softEnd()) + break; + // 计算对应的位点 + ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left); + int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx; + if (rcStart.cigarOp == 'D') { + --featureStartOnRead; + } + ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right); + int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx; + if (featureStartOnRead > ad.read_len) { + featureStartOnRead = featureEndOnRead = ad.read_len; + } + for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) { + knownSites[i] = true; + } + } + } + +} + +// 应该是计算一段数据的平均值 +static void calculateAndStoreErrorsInBlock(int i, int blockStartIndex, vector& errorArr, vector& fracErrs) { + int totalErrors = 0; + for (int j = max(0, blockStartIndex - 1); j <= i; j++) { + totalErrors += errorArr[j]; + } + for (int j = max(0, blockStartIndex - 1); j <= i; j++) { + fracErrs[j] = ((double)totalErrors) / ((double)(i - max(0, blockStartIndex - 1) + 1)); + } +} + +// 应该是用来处理BAQ的,把不等于特定BAQ分数的碱基作为一段数据统一处理 +void calculateFractionalErrorArray(vector& errorArr, vector& baqArr, vector& fracErrs) { + fracErrs.resize(baqArr.size()); + // errorArray和baqArray必须长度相同 + const int BLOCK_START_UNSET = -1; + bool inBlock = false; + int blockStartIndex = BLOCK_START_UNSET; + int i; + for (i = 0; i < fracErrs.size(); ++i) { + if (baqArr[i] == NO_BAQ_UNCERTAINTY) { // 不等于NO_BAQ_UNCERTAINTY的碱基当成一段数据来处理 + if (!inBlock) { + fracErrs[i] = errorArr[i]; + } else { + calculateAndStoreErrorsInBlock(i, blockStartIndex, errorArr, fracErrs); + inBlock = false; // reset state variables + blockStartIndex = BLOCK_START_UNSET; // reset state variables + } + } else { + inBlock = true; + if (blockStartIndex == BLOCK_START_UNSET) { + blockStartIndex = i; + } + } + } + // 处理最后一个block + if (inBlock) { + calculateAndStoreErrorsInBlock(i - 1, blockStartIndex, errorArr, fracErrs); + } } // 串行bqsr @@ -513,7 +642,7 @@ int SerialBQSR() { // 3. 如果bam文件之前做过bqsr,tag中包含OQ(originnal quality,原始质量分数),检查用户参数里是否指定用原始质量分数进行bqsr,如果是则将质量分数替换为OQ,否则忽略OQ,先忽略 // 4. 对read的两端进行检测,去除(hardclip)adapter BamWrap *bw = bams[i]; - ReadAdditionData ad; + SamData ad; ad.read_len = BamWrap::BamEffectiveLength(bw->b); ad.cigar_end = bw->b->core.n_cigar; if (ad.read_len <= 0) continue; @@ -535,6 +664,9 @@ int SerialBQSR() { calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少 calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基 + // 计算clip之后,两端的softstart和softend + calculateSoftStartEnd(bw, ad); + calculateCigar(bw, ad); //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq, // ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str()); @@ -560,7 +692,9 @@ int SerialBQSR() { // 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了 // 8. 计算这条read对应的协变量 + PROF_START(covariate); CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true); + PROF_END(gprof[GP_covariate], covariate); test = readCovariates[1][0][0] + readCovariates[2][1][3]; int end_pos = bw->contig_end_pos(); //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, @@ -568,7 +702,23 @@ int SerialBQSR() { // 9. 计算这条read需要跳过的位置 vector skip(ad.read_len, 0); - calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr); + PROF_START(known_sites); + calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skip); + for (int ii = 0; ii < ad.read_len; ++ii) { + skip[ii] = + skip[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || ad.quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; + } + PROF_END(gprof[GP_read_vcf], known_sites); + + // 10. 根据BAQ进一步处理snp,indel,得到处理后的数据 + vector snpErrors, insErrors, delErrors; + calculateFractionalErrorArray(isSNP, baqArray, snpErrors); + calculateFractionalErrorArray(isIns, baqArray, insErrors); + calculateFractionalErrorArray(isDel, baqArray, delErrors); + + // aggregate all of the info into our info object, and update the data + // 11. 合并之前计算的数据,得到info,并更新bqsr table数据 + } #if 0 diff --git a/src/bqsr/bqsr_pipeline.h b/src/bqsr/bqsr_pipeline.h index 9d6a6e6..9c1b6be 100644 --- a/src/bqsr/bqsr_pipeline.h +++ b/src/bqsr/bqsr_pipeline.h @@ -9,7 +9,7 @@ struct ReadData { vector bams; // read step output int64_t bamStartIdx = 0; // bam,bam - int64_t taskSeq = 0; // + int64_t taskSeq = 0; // }; struct GenREData { diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index 4c15958..ede899c 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -22,16 +22,16 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE; // for CovariateUtils // 对一条read计算协变量(该协变量被上一个read用过) -void CovariateUtils::ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, +void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { - //ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues); - //BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues); - //ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues); - //CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues); } // ReadGroupCovariate 协变量的方法 -void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { +void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { uint8_t *rgStr = bam_aux_get(bw->b, "RG"); char* rgVal = nullptr; if (rgStr) rgVal = bam_aux2Z(rgStr); @@ -47,7 +47,7 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr } // BaseQualityCovariate 协变量的方法 -void BaseQualityCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, +void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { // 在前面的处理过后,quals应该和base长度一致了 #define __bq_set_cov(ins, del) \ @@ -98,7 +98,7 @@ static char SimpleComplement(const char base) { } // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) -void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail) { +void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) { uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip; if (bw->GetReadNegativeStrandFlag()) { // 反向互补 @@ -218,7 +218,7 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i } } -void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { +void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { const int originalReadLength = ad.read_len; // store the original bases and then write Ns over low quality ones @@ -270,7 +270,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t * @param maxCycle max value of the base to compute the key for * (this method throws UserException if the computed absolute value of the cycle number is higher than this value). */ -int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle) { +int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) { const bool isNegStrand = bw->GetReadNegativeStrandFlag(); const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2); const int readLength = ad.read_len; @@ -299,7 +299,7 @@ int CycleCovariate::CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNu } // Used to pick out the covariate's value from attributes of the read -void CycleCovariate::RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { +void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { const int readLength = ad.read_len; // Note: duplicate the loop to void checking recordIndelValues on every iteration if (recordIndelValues) { diff --git a/src/bqsr/covariate.h b/src/bqsr/covariate.h index 0c9edc0..860d6f8 100644 --- a/src/bqsr/covariate.h +++ b/src/bqsr/covariate.h @@ -69,7 +69,7 @@ struct CovariateUtils { } // 对一条read计算协变量(该协变量被上一个read用过) - static void ComputeCovariates(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Read group协变量 @@ -78,13 +78,13 @@ struct ReadGroupCovariate { static map RgToId; // read group name到id的映射 static map IdToRg; // id到read group name的映射 - static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Base quality协变量 struct BaseQualityCovariate { static constexpr int index = 1; // 在协变量数组中的索引位置 - static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Context协变量 @@ -162,14 +162,14 @@ struct ContextCovariate { } // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) - static void GetStrandedClippedBytes(BamWrap* bw, ReadAdditionData& ad, string& clippedBases, uint8_t lowQTail); + static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail); // Creates a int representation of a given dna string. static int KeyFromContext(const string& dna, const int start, const int end); // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector& keys); // 设置协变量的值 - static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Cycle协变量 @@ -203,7 +203,7 @@ struct CycleCovariate { } // Computes the encoded value of CycleCovariate's key for the given position at the read. - static int CycleKey(BamWrap* bw, ReadAdditionData& ad, const int baseNumber, const bool indel, const int maxCycle); + static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle); - static void RecordValues(BamWrap* bw, ReadAdditionData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; \ No newline at end of file diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index 387f19e..9175f15 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -11,16 +11,61 @@ #include #include #include +#include #include +#include #include #include #include using namespace std; +struct Cigar { + char op = '0'; + int len = 0; + // 该操作符是否消耗read的碱基 + static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; } + // 该操作符是否消耗参考基因组的碱基 + static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; } +}; + +struct ReadIdxCigar { + int readIdx = 0; // 在read序列中的位置 + char cigarOp = '0'; // 当前位置对应的cigar +}; + +// 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作 +template +struct FastArray { + vector arr; + size_t idx; + void clear() { idx = 0; } + size_t size() { return idx; } + void push_back(const T& val) { + if (idx < arr.size()) { + arr[idx++] = val; + } else { + arr.push_back(val); + idx++; + } + } + struct iterator { + typename std::vector::iterator it; + iterator(typename std::vector::iterator _it) : it(_it) {} + iterator& operator++() { ++it; return *this;} + iterator& operator--() { --it; return *this;} + T& operator*() const { return *it; } + bool operator!=(const iterator& other) const { return it != other.it; } + bool operator==(const iterator& other) const { return it == other.it; } + }; + + iterator begin() { return arr.begin(); } + iterator end() { return arr.begin() + idx; } +}; + // 对原始bam数据的补充,比如对两端进行hardclip等 -struct ReadAdditionData { +struct SamData { int read_len = 0; // read长度,各种clip之后的长度 int cigar_start = 0; // cigar起始位置,闭区间 int cigar_end = 0; // cigar结束位置,开区间 @@ -29,7 +74,41 @@ struct ReadAdditionData { int left_clip = 0; // 左侧被切掉的碱基长度 int right_clip = 0; // 右侧被切掉的碱基长度 int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量 - string bases; // 处理之后的read的碱基 + + // 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑 + static constexpr int READ_INDEX_NOT_FOUND = -1; + string bases; // 处理之后的read的碱基 + vector quals; // 对应的质量分数 + int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间 + int64_t end_pos; // 同上,闭区间 + FastArray cigars; + int64_t& softStart() { return start_pos; } + int64_t& softEnd() { return end_pos; } + + // functions + ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) { + ReadIdxCigar rc; + if (refPos < start_pos) + return rc; + int firstReadPosOfElement = 0; // inclusive + int firstRefPosOfElement = start_pos; // inclusive + int lastReadPosOfElement = 0; // exclusive + int lastRefPosOfElement = start_pos; // exclusive + // advance forward through all the cigar elements until we bracket the reference coordinate + for (auto& cigar : cigars) { + firstReadPosOfElement = lastReadPosOfElement; + firstRefPosOfElement = lastRefPosOfElement; + lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0; + lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0; + if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element + int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0); + rc.cigarOp = cigar.op; + rc.readIdx = readPosAtRefCoord; + return rc; + } + } + return rc; + } }; /* diff --git a/src/util/profiling.cpp b/src/util/profiling.cpp index 17833a2..00ede23 100644 --- a/src/util/profiling.cpp +++ b/src/util/profiling.cpp @@ -54,11 +54,11 @@ int DisplayProfiling(int nthread) { // PRINT_GP(sort_wait); // PRINT_GP(markdup_wait); // PRINT_GP(intersect_wait); - PRINT_GP(read); - PRINT_GP(gen); - PRINT_GP(sort); - PRINT_GP(markdup); - PRINT_GP(intersect); + PRINT_GP(read_ref); + PRINT_GP(read_vcf); + PRINT_GP(covariate); + //PRINT_GP(markdup); + //PRINT_GP(intersect); // PRINT_GP(merge_result); // PRINT_GP(sort_pair); // PRINT_GP(sort_frag); @@ -68,10 +68,10 @@ int DisplayProfiling(int nthread) { // PRINT_GP(merge_markdup); // PRINT_GP(merge_update); // PRINT_GP(merge_add); - PRINT_GP(markdup_all); + //PRINT_GP(markdup_all); // PRINT_GP(final_read); - PRINT_GP(write); - PRINT_GP(whole_process); + //PRINT_GP(write); + // PRINT_GP(whole_process); // PRINT_TP(gen, nthread); // PRINT_TP(sort_frag, nthread); diff --git a/src/util/profiling.h b/src/util/profiling.h index 8986d32..713ea87 100644 --- a/src/util/profiling.h +++ b/src/util/profiling.h @@ -40,6 +40,9 @@ extern uint64_t gprof[LIM_GLOBAL_PROF_TYPE]; enum { GP_0 = 0, GP_1, GP_2, GP_3, GP_4, GP_5, GP_6, GP_7, GP_8, GP_9, GP_10 }; enum { GP_read_wait = 11, + GP_covariate, + GP_read_ref, + GP_read_vcf, GP_gen_wait, GP_sort_wait, GP_markdup_wait,