FastBQSR/src/util/read_transformer.h

154 lines
7.7 KiB
C
Raw Normal View History

/*
Description: read (sam record) clipping
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/29
*/
#pragma once
#include "bam_wrap.h"
#include "sam_data.h"
#include "base_utils.h"
// Logical transformations (clipping and the like) applied to a read (SAM record).
// Nothing in this struct edits the underlying BAM record: every routine only updates the
// clip bookkeeping stored in SamData, and SamData.applyTransformations() must be called
// afterwards to actually apply the modifications.
struct ReadTransformer {
    // Result of looking up a reference coordinate inside a read: the read position together
    // with the CIGAR element that covers the coordinate. The default-constructed value means
    // "not found" (readPosAtRefCoord == -1 and cigarIndex == -1).
    struct PosAndOperator {
        int readPosAtRefCoord = -1;  // 0-based position in the read; -1 when not found
        char cigarOperator = '0';    // operator character of the covering CIGAR element
        int cigarIndex = -1;         // index of the covering CIGAR element; -1 when not found
        int cigarLen = 0;            // length of the covering CIGAR element
        int preCigarLen = 0;         // read bases consumed by the CIGAR elements before it
    };

    /**
     * Find the 0-based index within the read bases that corresponds to a given reference
     * position, along with the CIGAR element containing that base. If the reference
     * coordinate occurs within an element that consumes no read bases (e.g. a deletion),
     * the first read index after that element is returned.
     * Note that soft-clipped bases are treated as if they align with the reference, which is
     * useful for hard-clipping reads that carry soft clips.
     *
     * @param bw             The read; its CIGAR is taken from bw->b
     * @param alignmentStart The soft start of the read on the reference
     * @param refCoord       The target reference coordinate
     * @return A default-constructed PosAndOperator (readPosAtRefCoord == -1, cigarIndex == -1)
     *         if the reference coordinate lies before alignmentStart, if the read has no CIGAR
     *         elements, or if no CIGAR element covers the coordinate; otherwise the read
     *         position plus the operator, index and length of the covering element and the
     *         number of read bases consumed before that element.
     */
    static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
        const PosAndOperator notFound;
        if (refCoord < alignmentStart) {
            return notFound;
        }
        const uint32_t* cigar = bam_get_cigar(bw->b);
        // An empty CIGAR simply skips the loop below and yields "not found".
        const int numCigar = static_cast<int>(bw->b->core.n_cigar);
        int lastReadPosOfElement = 0;              // exclusive
        int lastRefPosOfElement = alignmentStart;  // exclusive
        // advance forward through all the cigar elements until we bracket the reference coordinate
        for (int i = 0; i < numCigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            const bool consumesRead = BaseUtils::consumeReadBases(c);
            const int firstReadPosOfElement = lastReadPosOfElement;  // inclusive
            const int firstRefPosOfElement = lastRefPosOfElement;    // inclusive
            lastReadPosOfElement += consumesRead ? len : 0;
            // soft clips are counted as reference-consuming on purpose (see the note above)
            lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0;
            if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {  // refCoord falls within this cigar element
                const int readPosAtRefCoord = firstReadPosOfElement + (consumesRead ? (refCoord - firstRefPosOfElement) : 0);
                return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
            }
        }
        return notFound;
    }

    /**
     * Logically hard-clip one tail of the read at a reference coordinate (used for adapter
     * clipping). Exactly one of refStart / refStop must be non-negative:
     *   - refStart < 0, refStop >= 0: clip the left tail up to and including refStop; updates
     *     sd.left_clip, sd.cigar_start and sd.first_cigar_clip.
     *   - refStart >= 0, refStop < 0: clip the right tail starting at refStart; updates
     *     sd.right_clip, sd.cigar_end and sd.last_cigar_clip.
     * If both are negative, both are non-negative, or the coordinate cannot be located in the
     * read, sd is left untouched. Nothing is returned; the result is recorded in sd only.
     *
     * @param bw       The read being clipped
     * @param refStart Reference coordinate where right-tail clipping starts, or a negative value
     * @param refStop  Reference coordinate where left-tail clipping stops (inclusive), or a negative value
     * @param sd       Clip bookkeeping to update
     */
    static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
        // Determine the read coordinate to start and stop hard clipping
        if (refStart < 0) {
            if (refStop < 0)
                return;
            const PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
            // The coordinate is not covered by the read: do not clip. Without this guard the
            // "not found" defaults (-1) would be written into sd as negative clip values.
            if (stopPosAndOperator.cigarIndex < 0)
                return;
            // if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we compute here
            // is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
            const int stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
            sd.left_clip = stop + 1;
            sd.cigar_start = stopPosAndOperator.cigarIndex;
            sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
        } else {
            if (refStop >= 0)
                return;
            // unlike the above case where we clip the start of the read, here we clip the end and returning the base to the right of a deletion
            // avoids overclipping
            const PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
            // Same guard as above: leave sd untouched when the coordinate is not in the read.
            if (startPosAndOperator.cigarIndex < 0)
                return;
            const int start = startPosAndOperator.readPosAtRefCoord;
            sd.right_clip = bw->b->core.l_qseq - start;
            sd.cigar_end = startPosAndOperator.cigarIndex + 1;
            sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
        }
    }

    // Clip off the adapter sequence. The clipping is only logical; the bam record itself is
    // not modified. When the adapter boundary lies inside the read, a negative-strand read is
    // clipped on its left tail and a positive-strand read on its right tail. sd.read_len is
    // refreshed in every case.
    static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
        const int adapter_boundary = bw->GetAdapterBoundary();
        if (bw->IsAdapterInRead(adapter_boundary)) {
            // the adapter lies within the read
            if (bw->GetReadNegativeStrandFlag()) {  // negative strand
                clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
            } else {  // positive strand
                clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
            }
        }
        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // update the read length
    }

    // Work out how many soft-clipped bases sit at each end of the read and (logically) clip
    // them off. Only the CIGAR window [sd.cigar_start, sd.cigar_end) is scanned, and the
    // partial clips already recorded in sd.first_cigar_clip / sd.last_cigar_clip are honoured,
    // so this can run after clipping already recorded in sd. sd.read_len is refreshed at the end.
    static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
        const uint32_t* cigar = bam_get_cigar(bw->b);
        int readIndex = sd.left_clip;  // read position at which the current element begins
        int cutLeft = -1;              // last position of the left tail to hard clip (inclusive); -1 = nothing to clip
        int cutRight = -1;             // first position of the right tail to hard clip (inclusive); -1 = nothing to clip
        int cigar_start = sd.cigar_start;
        int cigar_end = sd.cigar_end;
        bool rightTail = false;  // trigger to stop clipping the left tail and start cutting the right tail
        for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            int len = bam_cigar_oplen(cigar[i]);
            // the boundary elements may already be partially clipped
            if (i == sd.cigar_start) len -= sd.first_cigar_clip;
            if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
            if (c == 'S') {
                if (rightTail) {
                    cutRight = readIndex;
                    cigar_end = i;
                } else {
                    cutLeft = readIndex + len - 1;
                    cigar_start = i + 1;
                }
            } else if (c != 'H') {
                // any element other than a soft or hard clip ends the left tail
                rightTail = true;
            }
            if (BaseUtils::consumeReadBases(c)) {
                readIndex += len;
            }
        }
        if (cutRight >= 0) {
            sd.right_clip = bw->b->core.l_qseq - cutRight;
            sd.cigar_end = cigar_end;
            sd.last_cigar_clip = 0;
        }
        if (cutLeft >= 0) {
            sd.left_clip = cutLeft + 1;
            sd.cigar_start = cigar_start;
            sd.first_cigar_clip = 0;
        }
        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // update the read length
    }
};