154 lines
7.7 KiB
C
154 lines
7.7 KiB
C
/*
 Description: Utility functions for reads (SAM records), e.g. for clipping low-quality bases

 Copyright : All rights reserved by ICT

 Author : Zhang Zhonghai
 Date : 2025/12/29
*/
|
||
#pragma once
|
||
|
||
#include "bam_wrap.h"
|
||
#include "sam_data.h"
|
||
#include "base_utils.h"
|
||
|
||
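// Applies various transformations to a read, such as clipping. Note that these are logical operations only; SamData.applyTransformations() must be called afterwards to actually apply the changes.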
|
||
struct ReadTransformer {
|
||
// Result of looking up a reference coordinate inside a read: the matching position
// within the read and the CIGAR element that contains it.
// The field order matters: the struct is brace-initialized positionally by the lookup code.
struct PosAndOperator {
    int readPosAtRefCoord = -1;  // position within the read; stays -1 until a lookup fills it in
    char cigarOperator = '0';    // CIGAR operator character of the matching element ('0' is the placeholder default)
    int cigarIndex = -1;         // index of the matching CIGAR element; stays -1 until a lookup fills it in
    int cigarLen = 0;            // length of the matching CIGAR element
    int preCigarLen = 0;         // number of read bases consumed by the CIGAR elements before the matching one
};
|
||
|
||
/**
|
||
* Find the 0-based index within a read base array corresponding to a given 0-based position in the reference, along with the cigar operator of
|
||
* the element containing that base. If the reference coordinate occurs within a deletion, the first index after the deletion is returned.
|
||
* Note that this treats soft-clipped bases as if they align with the reference, which is useful for hard-clipping reads with soft clips.
|
||
*
|
||
* @param alignmentStart The soft start of the read on the reference
|
||
* @param cigar The read's cigar
|
||
* @param refCoord The target reference coordinate
|
||
* @return If the reference coordinate occurs before the read start or after the read end {@code CLIPPING_GOAL_NOT_REACHED};
|
||
* if the reference coordinate falls within an alignment block of the read's cigar, the corresponding read
|
||
* coordinate; if the reference coordinate falls within a deletion, the first read coordinate after the deletion. Note: if the last cigar element
|
||
* is a deletion (which isn't meaningful), it returns {@code CLIPPING_GOAL_NOT_REACHED}.
|
||
*/
|
||
static PosAndOperator getReadIndexForReferenceCoordinate(BamWrap* bw, int alignmentStart, int refCoord) {
|
||
PosAndOperator po;
|
||
if (refCoord < alignmentStart) {
|
||
return po;
|
||
}
|
||
int firstReadPosOfElement = 0; // inclusive
|
||
int firstRefPosOfElement = alignmentStart; // inclusive
|
||
int lastReadPosOfElement = 0; // exclusive
|
||
int lastRefPosOfElement = alignmentStart; // exclusive
|
||
|
||
// advance forward through all the cigar elements until we bracket the reference coordinate
|
||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||
const bam1_core_t& bc = bw->b->core;
|
||
const int idx = bc.n_cigar - 1;
|
||
if (idx < 0)
|
||
return po;
|
||
for (int i = 0; i < bc.n_cigar; ++i) {
|
||
const char c = bam_cigar_opchr(cigar[i]);
|
||
const int len = bam_cigar_oplen(cigar[i]);
|
||
firstReadPosOfElement = lastReadPosOfElement;
|
||
firstRefPosOfElement = lastRefPosOfElement;
|
||
lastReadPosOfElement += BaseUtils::consumeReadBases(c) ? len : 0;
|
||
lastRefPosOfElement += (BaseUtils::consumeRefBases(c) || c == 'S') ? len : 0;
|
||
if (firstRefPosOfElement <= refCoord && refCoord < lastRefPosOfElement) { // refCoord falls within this cigar element
|
||
int readPosAtRefCoord = firstReadPosOfElement + (BaseUtils::consumeReadBases(c) ? (refCoord - firstRefPosOfElement) : 0);
|
||
return PosAndOperator{readPosAtRefCoord, c, i, len, firstReadPosOfElement};
|
||
}
|
||
}
|
||
return po;
|
||
}
|
||
|
||
// 根据adapter位置,对read进行hardclip,返回左侧或右侧减掉的base数量
|
||
static void clipByReferenceCoordinates(BamWrap* bw, int refStart, int refStop, SamData& sd) {
|
||
int start, stop;
|
||
// Determine the read coordinate to start and stop hard clipping
|
||
if (refStart < 0) {
|
||
if (refStop < 0)
|
||
return;
|
||
PosAndOperator stopPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStop);
|
||
// if the refStop falls in a deletion, the above method returns the position after the deletion. Since the stop we return here
|
||
// is inclusive, we decrement the stop to avoid overclipping by one base. As a result we do not clip the deletion, which is fine.
|
||
stop = stopPosAndOperator.readPosAtRefCoord - (BaseUtils::consumeReadBases(stopPosAndOperator.cigarOperator) ? 0 : 1);
|
||
sd.left_clip = stop + 1;
|
||
sd.cigar_start = stopPosAndOperator.cigarIndex;
|
||
sd.first_cigar_clip = sd.left_clip - stopPosAndOperator.preCigarLen;
|
||
} else {
|
||
if (refStop >= 0)
|
||
return;
|
||
// unlike the above case where we clip the start fo the read, here we clip the end and returning the base to the right of a deletion
|
||
// avoids overclipping
|
||
PosAndOperator startPosAndOperator = getReadIndexForReferenceCoordinate(bw, bw->GetSoftStart(), refStart);
|
||
start = startPosAndOperator.readPosAtRefCoord;
|
||
sd.right_clip = bw->b->core.l_qseq - start;
|
||
sd.cigar_end = startPosAndOperator.cigarIndex + 1;
|
||
sd.last_cigar_clip = startPosAndOperator.preCigarLen + startPosAndOperator.cigarLen - start;
|
||
}
|
||
}
|
||
|
||
// 切掉adaper序列,注意这里的clipping只是逻辑上的,实际并没有修改bam record
|
||
static void hardClipAdaptorSequence(BamWrap* bw, SamData& sd) {
|
||
int adapter_boundary = bw->GetAdapterBoundary();
|
||
if (bw->IsAdapterInRead(adapter_boundary)) {
|
||
// adapter在read范围内
|
||
if (bw->GetReadNegativeStrandFlag()) { // 反链
|
||
clipByReferenceCoordinates(bw, -1, adapter_boundary, sd);
|
||
} else { // 正链
|
||
clipByReferenceCoordinates(bw, adapter_boundary, -1, sd);
|
||
}
|
||
}
|
||
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
||
}
|
||
|
||
// 计算read两端softclip的碱基数量,切掉softclip序列
|
||
static void hardClipSoftClippedBases(BamWrap* bw, SamData& sd) {
|
||
const uint32_t* cigar = bam_get_cigar(bw->b);
|
||
const bam1_core_t& bc = bw->b->core;
|
||
int readIndex = sd.left_clip;
|
||
int cutLeft = -1; // first position to hard clip (inclusive)
|
||
int cutRight = -1; // first position to hard clip (inclusive)
|
||
int cigar_start = sd.cigar_start;
|
||
int cigar_end = sd.cigar_end;
|
||
bool rightTail = false; // trigger to stop clipping the left tail and start cutting the right tail
|
||
|
||
for (int i = sd.cigar_start; i < sd.cigar_end; ++i) {
|
||
const char c = bam_cigar_opchr(cigar[i]);
|
||
int len = bam_cigar_oplen(cigar[i]);
|
||
if (i == sd.cigar_start) len -= sd.first_cigar_clip;
|
||
if (i == sd.cigar_end - 1) len -= sd.last_cigar_clip;
|
||
if (c == 'S') {
|
||
if (rightTail) {
|
||
cutRight = readIndex;
|
||
cigar_end = i;
|
||
} else {
|
||
cutLeft = readIndex + len - 1;
|
||
cigar_start = i + 1;
|
||
}
|
||
} else if (c != 'H') {
|
||
rightTail = true;
|
||
}
|
||
if (BaseUtils::consumeReadBases(c)) {
|
||
readIndex += len;
|
||
}
|
||
}
|
||
if (cutRight >= 0) {
|
||
sd.right_clip = bw->b->core.l_qseq - cutRight;
|
||
sd.cigar_end = cigar_end;
|
||
sd.last_cigar_clip = 0;
|
||
}
|
||
if (cutLeft >= 0) {
|
||
sd.left_clip = cutLeft + 1;
|
||
sd.cigar_start = cigar_start;
|
||
sd.first_cigar_clip = 0;
|
||
}
|
||
sd.read_len = bw->b->core.l_qseq - sd.left_clip - sd.right_clip; // 更新read长度
|
||
}
|
||
}; |