/* Description: read (sam record) 相关的工具函数,比如用于clipping低质量碱基等 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2025/12/29 */ #pragma once #include "bam_wrap.h" #include "sam_data.h" #include "base_utils.h" // 用于对read进行各种转换操作,比如clipping等。注意这里都是逻辑操作,最后需要调用SamData.applyTransformations()来真正应用这些修改 struct ReadTransformer { // 给定一个ref位置,在read内部找到对应的位置和操作符 struct PosAndOperator { int readPosAtRefCoord = -1; // read中的位置 char cigarOperator = '0'; // cigar操作符 int cigarIndex = -1; // cigar索引 int cigarLen = 0; int preCigarLen = 0; // 截止cigar之前的,消耗read base的长度 }; /** * Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of * the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned. * Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips. * * @param alignmentStart The soft start of the read on the reference * @param cigar The read's cigar * @param refCoord The target reference coordinate * @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED}; * if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read * coordinate; if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last cigar element * is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}. */ static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) { PosAndOperator po; if (refCoord < alignmentStart) { return po; } int firstReadPosOfElement = 0; // inclusive int firstRefPosOfElement = alignmentStart; // inclusive int lastReadPosOfElement = 0; // exclusive int lastRefPosOfElement = alignmentStart; // exclusive // advance forward through all the cigar elements until we bracket the reference coordinate const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; const int idx = bc.n_cigar - 1; if (idx < 0) return po; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); firstReadPosOfElement = lastReadPosOfElement; firstRefPosOfElement = lastRefPosOfElement; lastReadPosOfElement += BaseUtils::consumeReadBases(c) ? len : 0; lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0; if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element int readPosAtRefCoord = firstReadPosOfElement + (BaseUtils::consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0); return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement}; } } return po; } // 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量 static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) { int start, stop; // Determine the read coordinate to start and stop hard clipping if (refStart < 0) { if (refStop < 0) return; PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop); // if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here // is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine. stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1); sd.left_clip = stop + 1; sd.cigar_start = stopPosAndOperator.cigarIndex; sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen; } else { if (refStop >= 0) return; // unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion // avoids overclipping PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart); start = startPosAndOperator.readPosAtRefCoord; sd.right_clip = bw->b->core.l_qseq - start; sd.cigar_end = startPosAndOperator.cigarIndex + 1; sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start; } } // 切掉adaper序列,注意这里的clipping只是逻辑上的,实际并没有修改bam record static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) { int adapter_boundary = bw->GetAdapterBoundary(); if (bw->IsAdapterInRead(adapter_boundary)) { // adapter在read范围内 if (bw->GetReadNegativeStrandFlag()) { // 反链 clipByReferenceCoordinates(bw, -1, adapter_boundary, sd); } else { // 正链 clipByReferenceCoordinates(bw, adapter_boundary, -1, sd); } } sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 } // 计算read两端softclip的碱基数量,切掉softclip序列 static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) { const uint32_t* cigar = bam_get_cigar(bw->b); const bam1_core_t& bc = bw->b->core; int readIndex = sd.left_clip; int cutLeft = -1; // first position to hard clip (inclusive) int cutRight = -1; // first position to hard clip (inclusive) int cigar_start = sd.cigar_start; int cigar_end = sd.cigar_end; bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail for (int i = sd.cigar_start; i < sd.cigar_end; ++i) { const char c = bam_cigar_opchr(cigar[i]); int len = bam_cigar_oplen(cigar[i]); if (i == sd.cigar_start) len -= sd.first_cigar_clip; if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip; if (c == 'S') { if (rightTail) { cutRight = readIndex; cigar_end = i; } else { cutLeft = readIndex + len - 1; cigar_start = i + 1; } } else if (c != 'H') { rightTail = true; } if (BaseUtils::consumeReadBases(c)) { readIndex += len; } } if (cutRight >= 0) { sd.right_clip = bw->b->core.l_qseq - cutRight; sd.cigar_end = cigar_end; sd.last_cigar_clip = 0; } if (cutLeft >= 0) { sd.left_clip = cutLeft + 1; sd.cigar_start = cigar_start; sd.first_cigar_clip = 0; } sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度 } };