diff --git a/.gitignore b/.gitignore index 453cda5..bb8d38e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ /build /test build.sh +debug_build.sh +r1.sh run.sh test.sh *.log diff --git a/src/bqsr/bqsr_entry.cpp b/src/bqsr/bqsr_entry.cpp index 2a09a29..ee073b5 100644 --- a/src/bqsr/bqsr_entry.cpp +++ b/src/bqsr/bqsr_entry.cpp @@ -221,7 +221,7 @@ PosAndOperator getReadIndexForReferenceCoordinate(BamWrap *bw, int alignmentStar } // 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量 -void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &ad) { +void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData &sd) { int start, stop; // Determine the read coordinate to start and stop hard clipping if (refStart < 0) { @@ -230,65 +230,65 @@ void clipByReferenceCoordinates(BamWrap *bw, int refStart, int refStop, SamData // if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here // is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine. stop = stopPosAndOperator.readPosAtRefCoord - (consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1); - ad.left_clip = stop + 1; - ad.cigar_start = stopPosAndOperator.cigarIndex; - ad.first_cigar_clip = ad.left_clip - stopPosAndOperator.preCigarLen; + sd.left_clip = stop + 1; + sd.cigar_start = stopPosAndOperator.cigarIndex; + sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen; } else { if (refStop >= 0) return; // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion avoids // overclipping PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart); start = startPosAndOperator.readPosAtRefCoord; - ad.right_clip = bw->b->core.l_qseq - start; - ad.cigar_end = startPosAndOperator.cigarIndex + 1; - ad.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start; + sd.right_clip = bw->b->core.l_qseq - start; + sd.cigar_end = startPosAndOperator.cigarIndex + 1; + sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start; } } // 计算切掉adapter之后,ref相对原始ref的偏移量 -void calculateRefOffset(BamWrap *bw, SamData &ad) { +void calculateRefOffset(BamWrap *bw, SamData &sd) { const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; int i = 0; - for (i = 0; i < ad.cigar_start; ++i) { + for (i = 0; i < sd.cigar_start; ++i) { const char c = bam_cigar_opchr(cigar[i]); int len = bam_cigar_oplen(cigar[i]); if (consumeRefBases(c)) { - ad.ref_offset += len; + sd.ref_offset += len; } } const char c = bam_cigar_opchr(cigar[i]); if (consumeRefBases(c)) { - ad.ref_offset += ad.first_cigar_clip; + sd.ref_offset += sd.first_cigar_clip; } } // 计算clip处理之后,剩余的碱基 -void calculateReadBases(BamWrap* bw, SamData& ad) { - ad.bases.resize(ad.read_len); - ad.base_quals.resize(ad.read_len); +void calculateReadBases(BamWrap* bw, SamData& sd) { + sd.bases.resize(sd.read_len); + sd.base_quals.resize(sd.read_len); uint8_t* seq = bam_get_seq(bw->b); uint8_t* quals = bam_get_qual(bw->b); - for (int i = 0; i < ad.read_len; ++i) { - ad.bases[i] = cBaseToChar[bam_seqi(seq, i + ad.left_clip)]; - ad.base_quals[i] = quals[i + ad.left_clip]; + for (int i = 0; i < sd.read_len; ++i) { + sd.bases[i] = cBaseToChar[bam_seqi(seq, i + sd.left_clip)]; + sd.base_quals[i] = quals[i + sd.left_clip]; } } // 计算read两端clip之后的softstart和softend -void calculateSoftStartEnd(BamWrap* bw, SamData& ad) { - int64_t softStart = bw->b->core.pos + ad.ref_offset; +void calculateSoftStartEnd(BamWrap* bw, SamData& sd) { + int64_t softStart = bw->b->core.pos + sd.ref_offset; int64_t softEnd = softStart - 1; // 闭区间 const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; - int cigar_start = ad.cigar_start; - int cigar_end = ad.cigar_end; + int cigar_start = sd.cigar_start; + int cigar_end = sd.cigar_end; bool rightTail = false; - for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { + for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { const char c = bam_cigar_opchr(cigar[i]); int len = bam_cigar_oplen(cigar[i]); - if (i == ad.cigar_start) len -= ad.first_cigar_clip; - if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip; + if (i == sd.cigar_start) len -= sd.first_cigar_clip; + if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; // if (c == 'S' || c == 'I') if (c == 'S') softStart -= len; @@ -299,47 +299,53 @@ void calculateSoftStartEnd(BamWrap* bw, SamData& ad) { softEnd += len; } } - ad.softStart() = softStart; - ad.softEnd() = softEnd; + sd.softStart() = softStart; + sd.softEnd() = softEnd; } // 计算clip之后的cigar -void calculateCigar(BamWrap* bw, SamData& ad) { - ad.cigars.clear(); +void calculateCigar(BamWrap* bw, SamData& sd) { + sd.cigars.clear(); const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; - int cigar_start = ad.cigar_start; - int cigar_end = ad.cigar_end; - for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { - const char c = bam_cigar_opchr(cigar[i]); + int cigar_start = sd.cigar_start; + int cigar_end = sd.cigar_end; + for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { + char c = bam_cigar_opchr(cigar[i]); int len = bam_cigar_oplen(cigar[i]); - if (i == ad.cigar_start) - len -= ad.first_cigar_clip; - if (i == ad.cigar_end - 1) - len -= ad.last_cigar_clip; - ad.cigars.push_back({c, len}); + if (i == sd.cigar_start) + len -= sd.first_cigar_clip; + if (i == sd.cigar_end - 1) + len -= sd.last_cigar_clip; + //if ((i == sd.cigar_start || i == sd.cigar_end - 1) && c == 'D') // 跳过开头的deletion + if (i == sd.cigar_start && c == 'D') { // 跳过开头的deletion + c = 'H'; + // sd.ref_offset += len; + sd.start_pos += len; + } + sd.cigars.push_back({c, len}); } - //for(auto &cigar : ad.cigars) { + //for(auto &cigar : sd.cigars) { // spdlog::info("op: {}, len: {}", cigar.op, cigar.len); //} } // 计算read两端softclip的碱基数量,可能会修改ad里的clip值 -void calculateSoftClip(BamWrap *bw, SamData &ad) { +void calculateSoftClip(BamWrap *bw, SamData &sd) { const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; - int readIndex = ad.left_clip; + int readIndex = sd.left_clip; int cutLeft = -1; // first position to hard clip (inclusive) int cutRight = -1; // first position to hard clip (inclusive) - int cigar_start = ad.cigar_start; - int cigar_end = ad.cigar_end; + int cigar_start = sd.cigar_start; + int cigar_end = sd.cigar_end; bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail - for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { + for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { const char c = bam_cigar_opchr(cigar[i]); int len = bam_cigar_oplen(cigar[i]); - if (i == ad.cigar_start) len -= ad.first_cigar_clip; - if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip; + if (i == sd.cigar_start) len -= sd.first_cigar_clip; + if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; if (c == 'S') { if (rightTail) { cutRight = readIndex; @@ -356,14 +362,14 @@ void calculateSoftClip(BamWrap *bw, SamData &ad) { } } if (cutRight >= 0) { - ad.right_clip = bw->b->core.l_qseq - cutRight; - ad.cigar_end = cigar_end; - ad.last_cigar_clip = 0; + sd.right_clip = bw->b->core.l_qseq - cutRight; + sd.cigar_end = cigar_end; + sd.last_cigar_clip = 0; } if (cutLeft >= 0) { - ad.left_clip = cutLeft + 1; - ad.cigar_start = cigar_start; - ad.first_cigar_clip = 0; + sd.left_clip = cutLeft + 1; + sd.cigar_start = cigar_start; + sd.first_cigar_clip = 0; } } @@ -387,9 +393,10 @@ inline void updateIndel(vector &isIndel, int index) { } // 计算该read的每个碱基位置是否是SNP或Indel -int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &isSNP, vector &isIns, vector &isDel) { +int calculateIsSNPOrIndel(AuxVar& aux, SamData &sd, vector &isSNP, vector &isIns, vector &isDel) { // 1. 读取参考基因组,先看看串行运行性能,稍后可以将读入ref和vcf合并起来做成一个并行流水线步骤 - Interval interval{bw->start_pos() + ad.ref_offset, bw->end_pos()}; // 闭区间 + //Interval interval{bw->start_pos() + sd.ref_offset, bw->end_pos()}; // 闭区间 + Interval interval{sd.start_pos, sd.end_pos}; // 闭区间 PROF_START(ref); read_ref_base(aux, interval.left, interval); PROF_END(gprof[GP_read_ref], ref); @@ -398,18 +405,13 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is // 2. 遍历cigar,计算每个碱基是否是SNP或Indel int readPos = 0, refPos = 0, nEvents = 0; - const uint32_t* cigar = bam_get_cigar(bw->b); - const bam1_core_t& bc = bw->b->core; - uint8_t* seq = bam_get_seq(bw->b); - for (int i = ad.cigar_start; i < ad.cigar_end; ++i) { - const char c = bam_cigar_opchr(cigar[i]); - int len = bam_cigar_oplen(cigar[i]); - if (i == ad.cigar_start) len -= ad.first_cigar_clip; - if (i == ad.cigar_end - 1) len -= ad.last_cigar_clip; + for (int i = 0; i 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); } isSNP[readPos] = snpInt; nEvents += snpInt; @@ -418,14 +420,14 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is } } else if (c == 'D') { // 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del - int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1; + int index = sd.bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1; updateIndel(isDel, index); refPos += len; } else if (c == 'N') { refPos += len; } else if (c == 'I') { // 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins - bool forwardStrandRead = !bw->GetReadNegativeStrandFlag(); + bool forwardStrandRead = !sd.bw->GetReadNegativeStrandFlag(); if (forwardStrandRead) { updateIndel(isIns, readPos - 1); } @@ -437,6 +439,47 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is readPos += len; } } + + // const uint32_t* cigar = bam_get_cigar(bw->b); + // const bam1_core_t& bc = bw->b->core; + // uint8_t* seq = bam_get_seq(bw->b); + // for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { + // const char c = bam_cigar_opchr(cigar[i]); + // int len = bam_cigar_oplen(cigar[i]); + // if (i == sd.cigar_start) len -= sd.first_cigar_clip; + // if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; + // if (c == 'M' || c == '=' || c == 'X') { + // for (int j = 0; j < len; ++j) { + // // 按位置将read和ref碱基进行比较,不同则是snp,注意read起始位置要加上left_clip + // int snpInt = cBaseToChar[bam_seqi(seq, readPos + sd.left_clip)] == refBases[refPos] ? 0 : 1; + // // if (snpInt > 0) { spdlog::info("snp {}, readpos: {}", snpInt, readPos); } + // isSNP[readPos] = snpInt; + // nEvents += snpInt; + // readPos++; + // refPos++; + // } + // } else if (c == 'D') { + // // 应该是在上一个消耗碱基的cigar的最后一个位置,标记Del + // int index = bw->GetReadNegativeStrandFlag() ? readPos : readPos - 1; + // updateIndel(isDel, index); + // refPos += len; + // } else if (c == 'N') { + // refPos += len; + // } else if (c == 'I') { + // // 与Del不同,Ins应该是在下一个cigar开始的位置,标记Ins + // bool forwardStrandRead = !bw->GetReadNegativeStrandFlag(); + // if (forwardStrandRead) { + // updateIndel(isIns, readPos - 1); + // } + // readPos += len; + // if (!forwardStrandRead) { + // updateIndel(isIns, readPos); + // } + // } else if (c == 'S') { + // readPos += len; + // } + // } + nEvents += std::accumulate(isIns.begin(), isIns.end(), 0) + std::accumulate(isDel.begin(), isDel.end(), 0); // spdlog::info("nEvents: {}", nEvents); @@ -448,14 +491,14 @@ int calculateIsSNPOrIndel(AuxVar& aux, BamWrap *bw, SamData &ad, vector &is } // 简单计算baq数组,就是全部赋值为'@' (64) -bool flatBAQArray(BamWrap* bw, SamData& ad, vector& baqArray) { - baqArray.resize(ad.read_len, (uint8_t)'@'); +bool flatBAQArray(SamData& sd, vector& baqArray) { + baqArray.resize(sd.read_len, (uint8_t)'@'); return true; } // 计算真实的baq数组,耗时更多,好像enable-baq参数默认是关闭的,那就先不实现这个了 -bool calculateBAQArray(AuxVar& aux, BAQ& baq, BamWrap* bw, SamData& ad, vector& baqArray) { - baqArray.resize(ad.read_len, 0); +bool calculateBAQArray(AuxVar& aux, BAQ& baq, SamData& sd, vector& baqArray) { + baqArray.resize(sd.read_len, 0); return true; } @@ -471,7 +514,8 @@ static void get_line_from_buf(char* buf, int64_t total, int64_t* cur, string* li } // 计算与read有交叉的已知位点信息, 应该要判断一下,是按照read的范围去读取vcf,还是按照一个batch read的范围去读取 -void calculateKnownSites(BamWrap* bw, SamData& ad, vector &vcfs, vector &knownSites) { +void calculateKnownSites(SamData& sd, vector &vcfs, vector &knownSites) { + BamWrap* bw = sd.bw; int tid = bw->contig_id(); uint64_t startPos = bw->start_pos(); // 闭区间 uint64_t endPos = bw->end_pos(); // 闭区间 @@ -532,28 +576,28 @@ void calculateKnownSites(BamWrap* bw, SamData& ad, vector &vcfs, vect //} } //exit(0); - knownSites.resize(ad.read_len); + knownSites.resize(sd.read_len); endPos = bw->end_pos(); for(auto &vcf : vcfs) { for (auto &intv : vcf.knownSites) { // knownSite is outside clipping window for the read, ignore - if (intv.right < ad.softStart()) + if (intv.right < sd.softStart()) continue; - if (intv.left > ad.softEnd()) + if (intv.left > sd.softEnd()) break; // 计算对应的位点 - ReadIdxCigar rcStart = ad.getReadIndexForReferenceCoordinate(intv.left); + ReadIdxCigar rcStart = sd.getReadIndexForReferenceCoordinate(intv.left); int featureStartOnRead = rcStart.readIdx == SamData::READ_INDEX_NOT_FOUND ? 0 : rcStart.readIdx; if (rcStart.cigarOp == 'D') { --featureStartOnRead; } - ReadIdxCigar rcEnd = ad.getReadIndexForReferenceCoordinate(intv.right); - int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? ad.read_len : rcEnd.readIdx; - if (featureStartOnRead > ad.read_len) { - featureStartOnRead = featureEndOnRead = ad.read_len; + ReadIdxCigar rcEnd = sd.getReadIndexForReferenceCoordinate(intv.right); + int featureEndOnRead = rcEnd.readIdx == SamData::READ_INDEX_NOT_FOUND ? sd.read_len : rcEnd.readIdx; + if (featureStartOnRead > sd.read_len) { + featureStartOnRead = featureEndOnRead = sd.read_len; } - for (int i = max(0, featureStartOnRead); i < min(ad.read_len, featureEndOnRead + 1); ++i) { + for (int i = max(0, featureStartOnRead); i < min(sd.read_len, featureEndOnRead + 1); ++i) { knownSites[i] = true; } } @@ -620,6 +664,9 @@ void updateRecalTablesForRead(ReadRecalInfo &info) { int readLength = read.read_len; for (int offset = 0; offset < readLength; ++offset) { + //if (read.rid == 46114) { + // fprintf(gf[3], "%d %d\n", offset, info.skips[offset] ? 1 : 0); + //} if (!info.skips[offset]) { //if (true){ // 不跳过当前位置 for (int idx = 0; idx < nsgv::gEventTypes.size(); ++idx) { @@ -634,6 +681,9 @@ void updateRecalTablesForRead(ReadRecalInfo &info) { // 处理base quality score协变量 // RecalUtils::IncrementDatum3keys(qualityScoreTable, qual, isError, readGroup, baseQuality, event.index); + if (read.rid == 46114) { + // fprintf(gf[3], "%d %d %f\n", offset, baseQuality, isError); + } qualityScoreTable[readGroup][baseQuality][event.index].increment(1, isError, baseQuality); auto &d = qualityScoreTable[readGroup][baseQuality][event.index]; @@ -653,6 +703,29 @@ void updateRecalTablesForRead(ReadRecalInfo &info) { } } } + // fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1); + // for (auto& arr1 : qualityScoreTable.data) { + // for (size_t si = 0; si < arr1.size(); ++si) { + // for (auto &val : arr1[si]) { + // if (val.numObservations > 0) + // fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality); + // } + // } + // } + // fprintf(gf[3], "\n"); + + // fprintf(gf[3], "%ld %d %ld\n", read.rid, read.read_len, read.start_pos+1); + // for (auto& arr1 : contextTable.data) { + // for (size_t si = 0; si < arr1.size(); ++si) { + // for (auto &arr2 : arr1[si]) { + // for (auto& val : arr2) { + // if (val.numObservations > 0) + // fprintf(gf[3], "%ld %f %f ", val.numObservations, val.getNumMismatches(), val.reportedQuality); + // } + // } + // } + // } + // fprintf(gf[3], "\n"); } // 数据总结 @@ -756,51 +829,51 @@ int SerialBQSR() { // 4. 对read的两端进行检测,去除(hardclip)adapter // spdlog::info("bam idx: {}", i); BamWrap* bw = bams[i]; - SamData ad; - ad.rid = i; - ad.bw = bw; - ad.read_len = BamWrap::BamEffectiveLength(bw->b); - ad.cigar_end = bw->b->core.n_cigar; - if (ad.read_len <= 0) continue; + SamData sd; + sd.rid = i + readNumSum; + sd.bw = bw; + sd.read_len = BamWrap::BamEffectiveLength(bw->b); + sd.cigar_end = bw->b->core.n_cigar; + if (sd.read_len <= 0) continue; int adapter_boundary = bw->GetAdapterBoundary(); if (bw->IsAdapterInRead(adapter_boundary)) { // adapter在read范围内 if (bw->GetReadNegativeStrandFlag()) { // 反链 - clipByReferenceCoordinates(bw, -1, adapter_boundary, ad); + clipByReferenceCoordinates(bw, -1, adapter_boundary, sd); } else { // 正链 - clipByReferenceCoordinates(bw, adapter_boundary, -1, ad); + clipByReferenceCoordinates(bw, adapter_boundary, -1, sd); } } - ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度 + sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 // 5. 然后再去除softclip部分 - calculateSoftClip(bw, ad); - ad.read_len = bw->b->core.l_qseq - ad.left_clip - ad.right_clip; // 更新read长度 - if (ad.read_len <= 0) continue; + calculateSoftClip(bw, sd); + sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 + if (sd.read_len <= 0) continue; - calculateRefOffset(bw, ad); // 计算ref_offset,就是相对比对的position,要将ref右移多少 - calculateReadBases(bw, ad); // 计算clip处理之后,剩余的碱基 + calculateRefOffset(bw, sd); // 计算ref_offset,就是相对比对的position,要将ref右移多少 + calculateReadBases(bw, sd); // 计算clip处理之后,剩余的碱基 // 计算clip之后,两端的softstart和softend - calculateSoftStartEnd(bw, ad); - calculateCigar(bw, ad); + calculateSoftStartEnd(bw, sd); + calculateCigar(bw, sd); //spdlog::info("read-len {} - {}: clip left {}, right {}, ref offset: {}, cigar range: [{}, {}), cigar: {}", bw->b->core.l_qseq, - // ad.read_len, ad.left_clip, ad.right_clip, ad.ref_offset, ad.cigar_start, ad.cigar_end, bw->cigar_str()); + // sd.read_len, sd.left_clip, sd.right_clip, sd.ref_offset, sd.cigar_start, sd.cigar_end, bw->cigar_str()); // 6. 更新每个read的platform信息,好像没啥用,暂时忽略 - vector isSNP(ad.read_len, 0); // 该位置是否是SNP位置,0不是,1是 - vector isIns(ad.read_len, 0); // 该位置是否是插入位置,0不是,1是 - vector isDel(ad.read_len, 0); // 该位置是否是删除位置,0不是,1是 - const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], bw, ad, isSNP, isIns, isDel); + vector isSNP(sd.read_len, 0); // 该位置是否是SNP位置,0不是,1是 + vector isIns(sd.read_len, 0); // 该位置是否是插入位置,0不是,1是 + vector isDel(sd.read_len, 0); // 该位置是否是删除位置,0不是,1是 + const int nErrors = calculateIsSNPOrIndel(nsgv::gAuxVars[0], sd, isSNP, isIns, isDel); - /*fprintf(gf[0], "%d\t", ad.read_len); - for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]); + /*fprintf(gf[0], "%d\t", sd.read_len); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", isSNP[ii]); fprintf(gf[0], "\n"); - fprintf(gf[1], "%d\t", ad.read_len); - for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]); + fprintf(gf[1], "%d\t", sd.read_len); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[1], "%d ", isIns[ii]); fprintf(gf[1], "\n"); - fprintf(gf[2], "%d\t", ad.read_len); - for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]); + fprintf(gf[2], "%d\t", sd.read_len); + for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[2], "%d ", isDel[ii]); fprintf(gf[2], "\n"); */ @@ -816,21 +889,21 @@ int SerialBQSR() { vector baqArray; bool baqCalculated = false; if (nErrors == 0 || !nsgv::gBqsrArg.enableBAQ) { - baqCalculated = flatBAQArray(bw, ad, baqArray); + baqCalculated = flatBAQArray(sd, baqArray); } else { - // baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, bw, ad, baqArray); + // baqCalculated = calculateBAQArray(nsgv::gAuxVars[0], baq, sd, baqArray); } if (!baqCalculated) continue; // 到这里,基本的数据都准备好了,后续就是进行bqsr的统计了 // 8. 计算这条read对应的协变量 PROF_START(covariate); - CovariateUtils::ComputeCovariates(bw, ad, nsgv::gInBamHeader, readCovariates, true); + CovariateUtils::ComputeCovariates(sd, nsgv::gInBamHeader, readCovariates, true); PROF_END(gprof[GP_covariate], covariate); - // fprintf(gf[3], "%ld %ld %d %ld\n", ad.rid, readCovariates.size(), ad.read_len, readCovariates[0][0].size()); + // fprintf(gf[3], "%ld %ld %d %ld\n", sd.rid, readCovariates.size(), sd.read_len, readCovariates[0][0].size()); // for (auto &arr1 : readCovariates) { - // for (size_t si = 0; si < ad.read_len; ++si) { + // for (size_t si = 0; si < sd.read_len; ++si) { // for (auto &val : arr1[si]) { // fprintf(gf[3], "%d ", val); // } @@ -838,22 +911,33 @@ int SerialBQSR() { // } // fprintf(gf[3], "\n"); + // fprintf(gf[3], "%ld %d\n", sd.rid, sd.read_len); + // { + // auto& arr1 = readCovariates[0]; + // { + // for (int pos = 0; pos < sd.read_len; ++pos) { + // fprintf(gf[3], "%d %d\n", pos, arr1[pos][2]); + // } + // } + // } + // fprintf(gf[3], "\n"); + //test = readCovariates[1][0][0] + readCovariates[2][1][3]; //int end_pos = bw->contig_end_pos(); //spdlog::info("adapter: {}, read: {}, {}, strand: {}", adapter_boundary, bw->contig_pos(), end_pos, // bw->GetReadNegativeStrandFlag() ? "reverse" : "forward"); // for (auto val : isSNP) { if (val > 0) spdlog::info("snp err val-1: {}", val); } // 9. 计算这条read需要跳过的位置 - vector skips(ad.read_len, 0); + vector skips(sd.read_len, 0); PROF_START(known_sites); - calculateKnownSites(bw, ad, nsgv::gAuxVars[0].vcfArr, skips); - for (int ii = 0; ii < ad.read_len; ++ii) { - skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[ad.bases[ii]] == -1) || - ad.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; + calculateKnownSites(sd, nsgv::gAuxVars[0].vcfArr, skips); + for (int ii = 0; ii < sd.read_len; ++ii) { + skips[ii] = skips[ii] || (ContextCovariate::baseIndexMap[sd.bases[ii]] == -1) || + sd.base_quals[ii] < nsgv::gBqsrArg.PRESERVE_QSCORES_LESS_THAN; } PROF_END(gprof[GP_read_vcf], known_sites); - // fprintf(gf[0], "%d\t", ad.read_len); - // for (int ii = 0; ii < ad.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0); + // fprintf(gf[0], "%d\t", sd.read_len); + // for (int ii = 0; ii < sd.read_len; ++ii) fprintf(gf[0], "%d ", skips[ii] ? 1 : 0); // fprintf(gf[0], "\n"); // 10. 根据BAQ进一步处理snp,indel,得到处理后的数据 @@ -864,12 +948,12 @@ int SerialBQSR() { calculateFractionalErrorArray(isDel, baqArray, delErrors); // for (auto val : isSNP) { if (val > 0) spdlog::info("snp val: {}", val); } - //spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), ad.read_len); + //spdlog::info("snp errors size: {}, read len: {}", snpErrors.size(), sd.read_len); //for (auto val : snpErrors) { if (val > 0) spdlog::info("snp err val: {}", val); } // aggregate all of the info into our info object, and update the data // 11. 合并之前计算的数据,得到info,并更新bqsr table数据 - ReadRecalInfo info(ad, readCovariates, skips, snpErrors, insErrors, delErrors); + ReadRecalInfo info(sd, readCovariates, skips, snpErrors, insErrors, delErrors); int m = 0; // for (auto err : snpErrors) { if (isSNP[m] > 0 || err > 0) spdlog::info("snp err: {} : {}", isSNP[m++], err); } //exit(0); diff --git a/src/bqsr/covariate.cpp b/src/bqsr/covariate.cpp index 429973d..2e4ee2c 100644 --- a/src/bqsr/covariate.cpp +++ b/src/bqsr/covariate.cpp @@ -23,17 +23,17 @@ int CycleCovariate::MAXIMUM_CYCLE_VALUE; // for CovariateUtils // 对一条read计算协变量(该协变量被上一个read用过) -void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, +void CovariateUtils::ComputeCovariates(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { - ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues); - BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues); - ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues); - CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues); + ReadGroupCovariate::RecordValues(sd, header, values, recordIndelValues); + BaseQualityCovariate::RecordValues(sd, header, values, recordIndelValues); + ContextCovariate::RecordValues(sd, header, values, recordIndelValues); + CycleCovariate::RecordValues(sd, header, values, recordIndelValues); } // ReadGroupCovariate 协变量的方法 -void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { - uint8_t *rgStr = bam_aux_get(bw->b, "RG"); +void ReadGroupCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { + uint8_t *rgStr = bam_aux_get(sd.bw->b, "RG"); char* rgVal = nullptr; if (rgStr) rgVal = bam_aux2Z(rgStr); int key = 0; @@ -42,34 +42,35 @@ void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* heade } else { key = RgToId[rgVal]; } - for (int i = 0; i < ad.read_len; ++i) { + for (int i = 0; i < sd.read_len; ++i) { CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values); } } // BaseQualityCovariate 协变量的方法 -void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, +void BaseQualityCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { // 在前面的处理过后,quals应该和base长度一致了 #define __bq_set_cov(ins, del) \ do { \ - for (int i = 0; i < ad.read_len; ++i) { \ - CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \ + for (int i = 0; i < sd.read_len; ++i) { \ + CovariateUtils::SetCovariate(quals[i], (ins), (del), i, BaseQualityCovariate::index, values); \ } \ } while (0) const int INDEL_QUAL = 45; - uint8_t* quals = bam_get_qual(bw->b); + // uint8_t* quals = bam_get_qual(bw->b); + auto &quals = sd.base_quals; if (recordIndelValues) { - uint8_t* insQualPtr = bam_aux_get(bw->b, "BI"); // base qualities for insertions - uint8_t* delQualPtr = bam_aux_get(bw->b, "BD"); // base qualities for deletions + uint8_t* insQualPtr = bam_aux_get(sd.bw->b, "BI"); // base qualities for insertions + uint8_t* delQualPtr = bam_aux_get(sd.bw->b, "BD"); // base qualities for deletions if (insQualPtr == nullptr && delQualPtr == nullptr) { __bq_set_cov(INDEL_QUAL, INDEL_QUAL); } else if (insQualPtr == nullptr) { - uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr); + uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr) + sd.left_clip; __bq_set_cov(INDEL_QUAL, delQuals[i]); } else { - uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr); + uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr) + sd.left_clip; __bq_set_cov(insQuals[i], INDEL_QUAL); } } else { @@ -144,14 +145,11 @@ static void ClipLowQualEndsWithN(string& bases, const FastArray &quals, } // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) -void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) { - uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip; - - if (bw->GetReadNegativeStrandFlag()) { // 反向互补 - for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]); +void ContextCovariate::GetStrandedClippedBytes(SamData& sd, string& clippedBases, uint8_t lowQTail) { + if (sd.bw->GetReadNegativeStrandFlag()) { // 反向互补 + for (int i = 0; i < sd.read_len; ++i) clippedBases[i] = SimpleComplement(sd.bases[sd.read_len - 1 - i]); } - - ClipLowQualEndsWithN(clippedBases, ad.base_quals, lowQTail, bw->GetReadNegativeStrandFlag()); + ClipLowQualEndsWithN(clippedBases, sd.base_quals, lowQTail, sd.bw->GetReadNegativeStrandFlag()); } /** @@ -243,12 +241,13 @@ void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const i } } -void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { - const int originalReadLength = ad.read_len; +void ContextCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { + const int originalReadLength = sd.read_len; // store the original bases and then write Ns over low quality ones - string strandedClippedBases(ad.bases); - GetStrandedClippedBytes(bw, ad, strandedClippedBases, 30); // 注意这里的lowQualTail数值 + string strandedClippedBases(sd.bases); + // GetStrandedClippedBytes(bw, sd, strandedClippedBases, 30); // 注意这里的lowQualTail数值 + GetStrandedClippedBytes(sd, strandedClippedBases, lowQualTail); // 命名我之前看到过这个30的? // spdlog::info("bases: {}", strandedClippedBases); vector nBasePairContextAtEachCycle; GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask, nBasePairContextAtEachCycle); @@ -266,7 +265,7 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, } } - const bool negativeStrand = bw->GetReadNegativeStrandFlag(); + const bool negativeStrand = sd.bw->GetReadNegativeStrandFlag(); // Note: duplicated the loop to avoid checking recordIndelValues on each iteration if (recordIndelValues) { vector indelKeys; @@ -295,10 +294,10 @@ void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, * @param maxCycle max value of the base to compute the key for * (this method throws UserException if the computed absolute value of the cycle number is higher than this value). */ -int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) { - const bool isNegStrand = bw->GetReadNegativeStrandFlag(); - const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2); - const int readLength = ad.read_len; +int CycleCovariate::CycleKey(SamData& sd, const int baseNumber, const bool indel, const int maxCycle) { + const bool isNegStrand = sd.bw->GetReadNegativeStrandFlag(); + const bool isSecondInPair = (sd.bw->b->core.flag & BAM_FPAIRED) && (sd.bw->b->core.flag & BAM_FREAD2); + const int readLength = sd.read_len; const int readOrderFactor = isSecondInPair ? -1 : 1; int increment; @@ -324,18 +323,18 @@ int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, con } // Used to pick out the covariate's value from attributes of the read -void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { - const int readLength = ad.read_len; +void CycleCovariate::RecordValues(SamData& sd, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) { + const int readLength = sd.read_len; // Note: duplicate the loop to void checking recordIndelValues on every iteration if (recordIndelValues) { for (int i = 0; i < readLength; i++) { - const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE); - const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE); + const int substitutionKey = CycleKey(sd, i, false, MAXIMUM_CYCLE_VALUE); + const int indelKey = CycleKey(sd, i, true, MAXIMUM_CYCLE_VALUE); CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values); } } else { for (int i = 0; i < readLength; i++) { - const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE); + const int substitutionKey = CycleKey(sd, i, false, MAXIMUM_CYCLE_VALUE); CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values); } } diff --git a/src/bqsr/covariate.h b/src/bqsr/covariate.h index 2c73aa0..d7d7f2b 100644 --- a/src/bqsr/covariate.h +++ b/src/bqsr/covariate.h @@ -71,7 +71,7 @@ struct CovariateUtils { } // 对一条read计算协变量(该协变量被上一个read用过) - static void ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void ComputeCovariates(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Read group协变量 @@ -80,13 +80,13 @@ struct ReadGroupCovariate { static map RgToId; // read group name到id的映射 static map IdToRg; // id到read group name的映射 - static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Base quality协变量 struct BaseQualityCovariate { static constexpr int index = 1; // 在协变量数组中的索引位置 - static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Context协变量 @@ -210,14 +210,14 @@ struct ContextCovariate { } // 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N) - static void GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail); + static void GetStrandedClippedBytes(SamData& ad, string& clippedBases, uint8_t lowQTail); // Creates a int representation of a given dna string. static int KeyFromContext(const string& dna, const int start, const int end); // For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context). static void GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector& keys); // 设置协变量的值 - static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // Cycle协变量 @@ -264,9 +264,9 @@ struct CycleCovariate { } // Computes the encoded value of CycleCovariate's key for the given position at the read. - static int CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle); + static int CycleKey(SamData& ad, const int baseNumber, const bool indel, const int maxCycle); - static void RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); + static void RecordValues(SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues); }; // 好像不需要 diff --git a/src/util/bam_wrap.h b/src/util/bam_wrap.h index 8424bea..00e7feb 100644 --- a/src/util/bam_wrap.h +++ b/src/util/bam_wrap.h @@ -81,7 +81,7 @@ struct SamData { int cigar_end = 0; // cigar结束位置,开区间 int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧 int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧 - int left_clip = 0; // 左侧被切掉的碱基长度 + int left_clip = 0; // 左侧被切掉的碱基长度,BI和BD质量分数也会用到 int right_clip = 0; // 右侧被切掉的碱基长度 int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量 diff --git a/src/util/read_utils.h b/src/util/read_utils.h new file mode 100644 index 0000000..e69de29