FastBQSR/src/util/read_transformer.h

154 lines
7.7 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: read (sam record) 相关的工具函数比如用于clipping低质量碱基等
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2025/12/29
*/
#pragma once
#include "bam_wrap.h"
#include "sam_data.h"
#include "base_utils.h"
// Helpers that apply logical transformations (adapter clipping, soft-clip removal, ...) to a read.
// Nothing here modifies the BAM record itself: every function only updates the clip bookkeeping
// stored in SamData. Per the original author's note, SamData.applyTransformations() must be called
// afterwards to actually apply the changes.
struct ReadTransformer {
    // Result of locating a reference coordinate inside a read. A default-constructed value
    // (readPosAtRefCoord == -1, cigarIndex == -1) means "coordinate not found".
    struct PosAndOperator {
        int readPosAtRefCoord = -1;  // 0-based index into the read bases; -1 when not found
        char cigarOperator = '0';    // operator character of the containing cigar element; '0' when not found
        int cigarIndex = -1;         // index of the containing cigar element; -1 when not found
        int cigarLen = 0;            // length of the containing cigar element
        int preCigarLen = 0;         // read bases consumed by all cigar elements before this one
    };

    /**
     * Find the 0-based index within the read bases that corresponds to a given reference
     * coordinate, together with the cigar element that contains it.
     *
     * Soft-clipped bases are treated as if they align with the reference (an 'S' element advances
     * the reference position here), which is what makes this usable for hard-clipping reads that
     * carry soft clips. For an element that does not consume read bases (e.g. a deletion) the
     * returned read index is the first read index after that element.
     *
     * @param bw             wrapper around the BAM record whose cigar is walked
     * @param alignmentStart reference coordinate assigned to the first cigar element
     *                       (callers in this file pass the read's soft start)
     * @param refCoord       target reference coordinate
     * @return the matching position and cigar element; a default-constructed PosAndOperator
     *         (readPosAtRefCoord == -1, cigarIndex == -1) when refCoord lies before
     *         alignmentStart, the record has no cigar, or no cigar element brackets refCoord
     */
    static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
        PosAndOperator po;
        if (refCoord < alignmentStart) {
            return po;
        }

        const uint32_t* cigar = bam_get_cigar(bw->b);
        // n_cigar is unsigned in htslib; convert once so the loop compares like with like.
        const int numCigar = static_cast<int>(bw->b->core.n_cigar);

        int lastReadPosOfElement = 0;              // exclusive end of the current element, in read space
        int lastRefPosOfElement = alignmentStart;  // exclusive end of the current element, in reference space

        // Advance through the cigar elements until one brackets the reference coordinate.
        for (int i = 0; i < numCigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            const bool consumesRead = BaseUtils::consumeReadBases(c);

            const int firstReadPosOfElement = lastReadPosOfElement;  // inclusive
            const int firstRefPosOfElement = lastRefPosOfElement;    // inclusive
            lastReadPosOfElement += consumesRead ? len : 0;
            lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0;

            if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) {
                // refCoord falls within this cigar element.
                const int readPosAtRefCoord = firstReadPosOfElement + (consumesRead ? (refCoord - firstRefPosOfElement) : 0);
                return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
            }
        }
        return po;
    }

    /**
     * Record a hard clip of one end of the read, expressed in reference coordinates.
     * Exactly one of refStart / refStop must be non-negative; otherwise nothing is done.
     *
     *  - refStart < 0, refStop >= 0: clip the left end of the read up to and including refStop;
     *    updates sd.left_clip, sd.cigar_start and sd.first_cigar_clip.
     *  - refStart >= 0, refStop < 0: clip from refStart to the right end of the read;
     *    updates sd.right_clip, sd.cigar_end and sd.last_cigar_clip.
     *
     * If the coordinate cannot be located inside the read, sd is left untouched (no clipping).
     * The function returns nothing; results are reported only through sd.
     */
    static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
        if (refStart < 0) {
            if (refStop < 0) {
                return;
            }
            const PosAndOperator stopPo = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
            if (stopPo.cigarIndex < 0) {
                // Coordinate not found in the read: writing the -1 sentinels into sd would yield a
                // negative left_clip / cigar_start, so skip clipping entirely.
                return;
            }
            // If refStop falls in a deletion, the lookup returns the position after the deletion.
            // The stop used here is inclusive, so step back by one to avoid over-clipping by one
            // base. As a result the deletion itself is not clipped, which is fine.
            const int stop = stopPo.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPo.cigarOperator) ? 0 : 1);
            sd.left_clip = stop + 1;
            sd.cigar_start = stopPo.cigarIndex;
            sd.first_cigar_clip = sd.left_clip - stopPo.preCigarLen;
        } else {
            if (refStop >= 0) {
                return;
            }
            // Unlike the case above, here the end of the read is clipped, and getting the base to
            // the right of a deletion is exactly what avoids over-clipping.
            const PosAndOperator startPo = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
            if (startPo.cigarIndex < 0) {
                // Coordinate not found in the read: leave sd unchanged rather than recording a
                // right_clip larger than the read.
                return;
            }
            const int start = startPo.readPosAtRefCoord;
            sd.right_clip = bw->b->core.l_qseq - start;
            sd.cigar_end = startPo.cigarIndex + 1;
            sd.last_cigar_clip = startPo.preCigarLen + startPo.cigarLen - start;
        }
    }

    // Clip away the adapter sequence. The clipping is purely logical: only sd is updated, the BAM
    // record is not modified. sd.read_len is refreshed whether or not an adapter was found.
    static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
        const int adapter_boundary = bw->GetAdapterBoundary();
        if (bw->IsAdapterInRead(adapter_boundary)) {
            // The adapter boundary lies inside the read.
            if (bw->GetReadNegativeStrandFlag()) {
                // Reverse strand: clip the left end up to the boundary.
                clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
            } else {
                // Forward strand: clip from the boundary to the right end.
                clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
            }
        }
        sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip;  // refresh the read length
    }

    // Work out how many soft-clipped bases sit at each end of the (already logically clipped) read
    // and extend the clip bookkeeping in sd so that they are cut away as well.
    // Only the cigar window [sd.cigar_start, sd.cigar_end) is examined, with the first / last
    // element shortened by sd.first_cigar_clip / sd.last_cigar_clip.
    static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
        const uint32_t* cigar = bam_get_cigar(bw->b);
        const int readLength = bw->b->core.l_qseq;

        int readIndex = sd.left_clip;       // read position at the start of the current element
        int cutLeft = -1;                   // last position of the left soft clip (inclusive)
        int cutRight = -1;                  // first position of the right soft clip (inclusive)
        int cigar_start = sd.cigar_start;   // candidate new window start
        int cigar_end = sd.cigar_end;       // candidate new window end (exclusive)
        bool rightTail = false;             // set once a non-clip element is seen: later 'S' belongs to the right tail

        for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            int len = bam_cigar_oplen(cigar[i]);
            // Account for bases already removed from the boundary elements by earlier clipping.
            if (i == sd.cigar_start) {
                len -= sd.first_cigar_clip;
            }
            if (i == sd.cigar_end - 1) {
                len -= sd.last_cigar_clip;
            }

            if (c == 'S') {
                if (rightTail) {
                    cutRight = readIndex;
                    cigar_end = i;
                } else {
                    cutLeft = readIndex + len - 1;
                    cigar_start = i + 1;
                }
            } else if (c != 'H') {
                rightTail = true;
            }

            if (BaseUtils::consumeReadBases(c)) {
                readIndex += len;
            }
        }

        if (cutRight >= 0) {
            sd.right_clip = readLength - cutRight;
            sd.cigar_end = cigar_end;
            sd.last_cigar_clip = 0;
        }
        if (cutLeft >= 0) {
            sd.left_clip = cutLeft + 1;
            sd.cigar_start = cigar_start;
            sd.first_cigar_clip = 0;
        }
        sd.read_len = readLength - sd.left_clip - sd.right_clip;  // refresh the read length
    }
};