FastBQSR/src/util/bam_wrap.h

538 lines
19 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: 读入sam/bam时开辟一个大的buf存放这些数据
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2019/11/27
*/
#pragma once
#include <htslib/sam.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <map>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
// One CIGAR element: an operation character together with its run length.
struct Cigar {
    char op{'0'};  // operation character; defaults to the placeholder '0'
    int len{0};    // run length of the operation

    // Does this operation consume bases of the read? (M, =, X, I, S)
    static bool ConsumeReadBases(char cigar) {
        switch (cigar) {
        case 'M':
        case '=':
        case 'X':
        case 'I':
        case 'S':
            return true;
        default:
            return false;
        }
    }

    // Does this operation consume bases of the reference? (M, =, X, D, N)
    static bool ConsumeRefBases(char cigar) {
        switch (cigar) {
        case 'M':
        case '=':
        case 'X':
        case 'D':
        case 'N':
            return true;
        default:
            return false;
        }
    }
};
// Pairs a position inside a read with the CIGAR operation found there.
struct ReadIdxCigar {
    int readIdx{0};     // position within the read sequence
    char cigarOp{'0'};  // CIGAR operation at that position; defaults to the placeholder '0'
};
// Array whose clear() only resets the logical length: the backing vector keeps
// its elements and capacity, so refilling it avoids repeated allocation and
// deallocation.
template <class T>
struct FastArray {
    std::vector<T> arr;  // backing storage; entries at or beyond idx are stale leftovers
    size_t idx = 0;      // logical element count (was uninitialised before, making a
                         // default-constructed FastArray unusable until clear()/resize())

    // Forget the contents without touching the backing storage.
    void clear() { idx = 0; }
    // Number of logical elements.
    size_t size() const { return idx; }
    // True when there are no logical elements.
    bool empty() const { return idx == 0; }
    // Reserve capacity in the backing storage; the logical length is unchanged.
    void reserve(size_t _size) { arr.reserve(_size); }
    // Resize the backing storage and set the logical length to _size.
    void resize(size_t _size) {
        arr.resize(_size);
        idx = _size;
    }
    // Append val, overwriting a stale slot when one is available and growing
    // the backing storage otherwise.
    void push_back(const T& val) {
        if (idx < arr.size()) {
            arr[idx] = val;
        } else {
            arr.push_back(val);
        }
        ++idx;
    }
    // Unchecked element access into the backing storage.
    inline T& operator[](size_t pos) { return arr[pos]; }
    inline const T& operator[](size_t pos) const { return arr[pos]; }

    // Minimal iterator over the logical range [0, idx); enough for range-for.
    struct iterator {
        typename std::vector<T>::iterator it;
        iterator(typename std::vector<T>::iterator _it) : it(_it) {}
        iterator& operator++() { ++it; return *this; }
        iterator& operator--() { --it; return *this; }
        T& operator*() const { return *it; }
        bool operator!=(const iterator& other) const { return it != other.it; }
        bool operator==(const iterator& other) const { return it == other.it; }
    };
    iterator begin() { return arr.begin(); }
    // One past the last logical element (not the end of the backing storage).
    iterator end() { return arr.begin() + idx; }
};
// Supplementary per-read data layered on top of the raw BAM record, e.g. the
// result of hard-clipping both ends of the read.
struct BamWrap;  // defined later in this header as a struct; keep the class-key consistent
struct SamData {
    int read_len = 0;          // read length after all clipping
    int cigar_start = 0;       // first cigar index, inclusive
    int cigar_end = 0;         // last cigar index, exclusive
    int first_cigar_clip = 0;  // bases clipped from the first cigar element (left side)
    int last_cigar_clip = 0;   // bases clipped from the last cigar element (right side)
    int left_clip = 0;         // number of bases clipped on the left
    int right_clip = 0;        // number of bases clipped on the right
    int ref_offset = 0;        // offset from the original alignment position contig_pos after adapter
                               // and soft-clip removal (soft clips should not affect it)

    // Data used during the BQSR computation is cached below; the intent noted by
    // the author is to precompute it later and simplify the current logic.
    static constexpr int READ_INDEX_NOT_FOUND = -1;

    BamWrap* bw = nullptr;
    int64_t start_pos = 0;  // soft clips are already removed, so this "soft start" is the first
                            // aligned position after clipping, inclusive
    int64_t end_pos = 0;    // same as above for the end, inclusive
    std::string bases;              // read bases after processing
    FastArray<uint8_t> base_quals;  // matching base qualities
    FastArray<uint8_t> ins_quals;   // insertion qualities, BI (most reads probably lack them)
    FastArray<uint8_t> del_quals;   // deletion qualities, BD (most reads probably lack them)
    FastArray<Cigar> cigars;

    // Mutable access to start_pos / end_pos.
    int64_t& softStart() { return start_pos; }
    int64_t& softEnd() { return end_pos; }

    // Finds the read index that corresponds to reference coordinate refPos by
    // walking `cigars` from start_pos.
    //
    // Returns the read index and the cigar operation of the element that covers
    // refPos. When the element does not consume read bases, the index is the
    // read position at which that element starts. If refPos lies before
    // start_pos or beyond the last element, a default-constructed ReadIdxCigar
    // is returned.
    // NOTE(review): READ_INDEX_NOT_FOUND is declared above but is not used for
    // the not-found case; confirm what callers test for before changing that.
    ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) {
        ReadIdxCigar rc;
        if (refPos < start_pos)
            return rc;

        int firstReadPosOfElement = 0;  // inclusive
        int lastReadPosOfElement = 0;   // exclusive
        // Reference coordinates stay 64-bit: start_pos is int64_t and storing it
        // in an int would truncate large coordinates.
        int64_t firstRefPosOfElement = start_pos;  // inclusive
        int64_t lastRefPosOfElement = start_pos;   // exclusive

        // Advance through the cigar elements until one brackets refPos.
        for (auto& cigar : cigars) {
            firstReadPosOfElement = lastReadPosOfElement;
            firstRefPosOfElement = lastRefPosOfElement;

            const bool consumesRead = Cigar::ConsumeReadBases(cigar.op);
            if (consumesRead)
                lastReadPosOfElement += cigar.len;
            // Soft clips are counted on the reference axis here as well.
            if (Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S')
                lastRefPosOfElement += cigar.len;

            if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) {
                // refPos falls inside this element; the offset is bounded by cigar.len.
                const int offsetInElement = consumesRead ? static_cast<int>(refPos - firstRefPosOfElement) : 0;
                rc.cigarOp = cigar.op;
                rc.readIdx = firstReadPosOfElement + offsetInElement;
                return rc;
            }
        }
        return rc;
    }
};
/*
 Naming convention used by the member functions below: lower_case_with_underscores
 names (static or not) describe something that behaves like an attribute of the
 BAM record, while UpperCamelCase names describe a value obtained by computation.
*/
/*
 * Wrapper around one SAM/BAM read.
 */
struct BamWrap {
    // A global position is the contig id shifted left by MAX_CONTIG_LEN_SHIFT
    // bits, combined with the position inside the contig.
    static constexpr int MAX_CONTIG_LEN_SHIFT = 40;
    static constexpr int READ_MAX_LENGTH = 200;
    static constexpr int READ_MAX_DEPTH = 1000;  // only used to size initial storage; deeper data is fine

    // Keep the member list short to limit the per-read memory footprint.
    bam1_t *b;
    int64_t end_pos_;  // global end position relative to the reference, inclusive

    // Global start position.
    inline int64_t start_pos() const { return bam_global_pos(b); }
    // Global end position (inclusive).
    inline int64_t end_pos() const { return end_pos_; }
    // Length of the span on the reference, NOT the number of bases in the read.
    // Returned as 32-bit so spans above 32767 are no longer truncated.
    inline int32_t read_len() const { return static_cast<int32_t>(end_pos_ - start_pos() + 1); }
    // Contig id.
    inline int32_t contig_id() const { return b->core.tid; }
    // Start position inside the contig.
    inline int32_t contig_pos() const { return static_cast<int32_t>(b->core.pos); }
    // End position inside the contig.
    inline int32_t contig_end_pos() const { return bam_pos(end_pos_); }
    // Number of bases stored for the read. Returned as 32-bit to match
    // l_qseq, so long reads are no longer truncated.
    inline int32_t seq_len() const { return b->core.l_qseq; }

    /* Disabled helpers, kept for reference.
    // start position including a leading softclip
    inline int32_t softclip_start() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        const char c = bam_cigar_opchr(cigar[0]);
        const int len = bam_cigar_oplen(cigar[0]);
        if (c == 'S')
            return bc.pos - len;
        return bc.pos;
    }
    inline int64_t global_softclip_start() {
        return softclip_start() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
    }
    // end position including a trailing softclip, inclusive
    inline int32_t softclip_end() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        const int idx = bc.n_cigar - 1;
        if (idx < 0) return bam_pos(end_pos_);
        const char c = bam_cigar_opchr(cigar[idx]);
        const int len = bam_cigar_oplen(cigar[idx]);
        if (c == 'S')
            return bam_pos(end_pos_) + len;
        return bam_pos(end_pos_);
    }
    inline int64_t global_softclip_end() {
        return softclip_end() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT);
    }
    // length of the softclip on the right
    inline int32_t right_softclip_len() {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]);
        const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]);
        if (c == 'S')
            return len;
        return 0;
    }
    */

    // Returns the read bases as text. Base codes 1, 2, 4 and 8 become
    // 'A', 'C', 'G' and 'T'; every other code is reported as 'N'.
    inline std::string sequence() const {
        static constexpr char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N',
                                                  'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};
        const uint8_t *seq = bam_get_seq(b);
        const int32_t baseCount = b->core.l_qseq;
        std::string bases;
        if (baseCount > 0)
            bases.reserve(static_cast<size_t>(baseCount));
        for (int32_t i = 0; i < baseCount; ++i)
            bases.push_back(base_to_char[bam_seqi(seq, i)]);
        return bases;  // returned by value; no std::move so copy elision can apply
    }

    // Query (read) name.
    inline const char *query_name() const { return bam_get_qname(b); }

    // Returns the CIGAR as text: <length><op> for each element, in order.
    inline std::string cigar_str() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        std::string text;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            const int len = bam_cigar_oplen(cigar[i]);
            text += std::to_string(len);
            text += c;
        }
        return text;
    }

    // Memory footprint in bytes: this wrapper, the bam1_t and its data buffer.
    // Returned as 32-bit so records larger than 32767 bytes are not truncated.
    inline int32_t length() const { return static_cast<int32_t>(sizeof(*this) + sizeof(bam1_t) + b->l_data); }

    // Total length of all insertions ('I') in the CIGAR.
    inline int32_t insert_cigar_len() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int32_t total = 0;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            if (bam_cigar_opchr(cigar[i]) == 'I')
                total += bam_cigar_oplen(cigar[i]);
        }
        return total;
    }

    // Total length of all deletions ('D') in the CIGAR.
    inline int32_t del_cigar_len() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int32_t total = 0;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            if (bam_cigar_opchr(cigar[i]) == 'D')
                total += bam_cigar_oplen(cigar[i]);
        }
        return total;
    }

    // Global end position of the read on the reference (inclusive):
    // pos + (sum of D, N, M, =, X lengths) - 1, combined with the contig id.
    // NOTE(review): with an empty CIGAR the in-contig part is pos - 1; when pos
    // is 0 that value is negative and the OR below also sets the contig bits.
    static inline int64_t BamEndPos(const bam1_t *b) {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int64_t offset = -1;  // -1 turns the exclusive end into an inclusive one
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X')
                offset += bam_cigar_oplen(cigar[i]);
        }
        const int64_t contigBits = static_cast<int64_t>(bc.tid) << MAX_CONTIG_LEN_SHIFT;
        return contigBits | (static_cast<int64_t>(bc.pos) + offset);
    }

    // Effective read length: the number of read bases that are neither soft-
    // nor hard-clipped, i.e. the sum of the M, I, = and X lengths.
    // 'N' (reference skip) consumes no read bases and is therefore not counted;
    // it used to be included, which inflated the result for spliced alignments.
    static inline int BamEffectiveLength(const bam1_t *b) {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int effective_len = 0;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            if (c == 'I' || c == 'M' || c == '=' || c == 'X')
                effective_len += bam_cigar_oplen(cigar[i]);
        }
        return effective_len;
    }

    // True when the fragment size can be trusted: isize is non-zero, the read
    // is paired, both mates are mapped, the mates are on opposite strands, and
    // the mate lies on the expected side of this read.
    bool HasWellDefinedFragmentSize() const {
        const bam1_core_t &bc = b->core;
        const bool reverse = (bc.flag & BAM_FREVERSE) != 0;
        const bool mateReverse = (bc.flag & BAM_FMREVERSE) != 0;
        if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || (bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP) ||
            reverse == mateReverse)
            return false;
        if (reverse)
            return contig_end_pos() > bc.mpos;  // reverse strand: this read must end after the mate start
        return bc.pos <= bc.mpos + bc.isize;    // forward strand: mate start + isize must not precede our start
    }

    // Adapter boundary of the read in contig coordinates, or INT_MIN when the
    // fragment size is not well defined.
    int GetAdapterBoundary() const {
        if (!HasWellDefinedFragmentSize())
            return INT_MIN;
        const bam1_core_t &bc = b->core;
        if (bc.flag & BAM_FREVERSE)
            return static_cast<int>(bc.mpos - 1);
        const int64_t isize = bc.isize;
        const int64_t insertSize = isize < 0 ? -isize : isize;
        return static_cast<int>(bc.pos + insertSize);  // GATK 4.0 differs from GATK 3.5, which adds 1 here
    }

    // True when adapterBoundary is valid and lies inside [contig_pos, contig_end_pos].
    bool IsAdapterInRead(int adapterBoundary) const {
        return (adapterBoundary != INT_MIN && (adapterBoundary >= contig_pos() && adapterBoundary <= contig_end_pos()));
    }

    // Length of the leading insertion: the first 'I' reached after skipping
    // only H and S elements, or 0 when another operation comes first.
    inline int GetHeadInsertLen() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            if (c == 'I')
                return bam_cigar_oplen(cigar[i]);
            if (c != 'H' && c != 'S')
                break;
        }
        return 0;
    }

    // Start position in the contig with leading soft clips added back
    // (adjacent H and S elements are handled, should that ever occur).
    // The author's note says a leading I should be treated like S; that
    // variant is currently disabled (see the commented-out condition).
    inline int64_t GetSoftStart() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int64_t softStart = bc.pos;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            // if (c == 'S' || c == 'I')
            if (c == 'S')
                softStart -= bam_cigar_oplen(cigar[i]);
            else if (c != 'H')
                break;
        }
        return softStart;
    }

    /**
     * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
     *
     * Note: GetUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
     *
     * @return the unclipped end of the read taking soft clips (but not hard clips) into account
     */
    inline int64_t GetSoftEnd() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int64_t softEnd = contig_end_pos();
        bool foundAlignedBase = false;
        for (uint32_t i = bc.n_cigar; i-- > 0;) {  // walk the CIGAR from the last element backwards
            const char c = bam_cigar_opchr(cigar[i]);
            // if (c == 'S' || c == 'I')
            if (c == 'S') {
                softEnd += bam_cigar_oplen(cigar[i]);
            } else if (c != 'H') {
                foundAlignedBase = true;
                break;
            }
        }
        if (!foundAlignedBase) {  // for example 64H14S, the soft end is actually the same as the alignment end
            softEnd = contig_end_pos();
        }
        return softEnd;
    }

    // Unclipped start position: leading soft AND hard clips are added back.
    inline int64_t GetUnclippedStart() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int64_t start = bc.pos;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            const char c = bam_cigar_opchr(cigar[i]);
            if (c != 'S' && c != 'H')
                break;
            start -= bam_cigar_oplen(cigar[i]);
        }
        return start;
    }

    // Unclipped end position (inclusive): trailing soft AND hard clips are added back.
    inline int64_t GetUnclippedEnd() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int64_t unclippedEnd = bam_endpos(b);
        for (uint32_t i = bc.n_cigar; i-- > 0;) {
            const char c = bam_cigar_opchr(cigar[i]);
            if (c != 'S' && c != 'H')
                break;
            unclippedEnd += bam_cigar_oplen(cigar[i]);
        }
        return unclippedEnd - 1;  // the running value is one past the end; -1 makes it inclusive
    }

    /* Sum of base qualities */
    /** Calculates a score for the read which is the sum of scores over Q15. */
    inline int GetSumOfBaseQualities() const {
        const uint8_t *qual = bam_get_qual(b);
        int score = 0;
        for (int32_t i = 0; i < b->core.l_qseq; ++i) {
            if (qual[i] >= 15)
                score += qual[i];
        }
        return score;
    }

    /* Flag tests */
    /* the read is unmapped */
    inline bool GetReadUnmappedFlag() const { return (b->core.flag & BAM_FUNMAP) != 0; }
    /* Template having multiple segments in sequencing */
    inline bool GetReadPairedFlag() const { return (b->core.flag & BAM_FPAIRED) != 0; }
    /**
     * the read fails platform/vendor quality checks.
     */
    inline bool GetReadFailsVendorQualityCheckFlag() const { return (b->core.flag & BAM_FQCFAIL) != 0; }
    /**
     * the mate is unmapped.
     */
    bool GetMateUnmappedFlag() const { return (b->core.flag & BAM_FMUNMAP) != 0; }
    /**
     * @return whether the alignment is secondary (an alternative alignment of
     * the read).
     */
    bool IsSecondaryAlignment() const { return (b->core.flag & BAM_FSECONDARY) != 0; }
    /**
     * @return whether the alignment is supplementary (a split alignment such as
     * a chimeric alignment).
     */
    bool GetSupplementaryAlignmentFlag() const { return (b->core.flag & BAM_FSUPPLEMENTARY) != 0; }
    /*
     * Tests if this record is either a secondary and/or supplementary
     * alignment;
     */
    bool IsSecondaryOrSupplementary() const { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); }
    /**
     * the read is the first read in a pair.
     */
    bool GetFirstOfPairFlag() const { return (b->core.flag & BAM_FREAD1) != 0; }
    /**
     * strand of the query (false for forward; true for reverse strand).
     */
    bool GetReadNegativeStrandFlag() const { return (b->core.flag & BAM_FREVERSE) != 0; }
    /**
     * strand of the mate (false for forward; true for reverse strand).
     */
    bool GetMateNegativeStrandFlag() const { return (b->core.flag & BAM_FMREVERSE) != 0; }

    /* Miscellaneous */
    // Number of reference bases covered by the CIGAR (sum of M, D, N, =, X lengths).
    inline int GetReferenceLength() const {
        const uint32_t *cigar = bam_get_cigar(b);
        const bam1_core_t &bc = b->core;
        int length = 0;
        for (uint32_t i = 0; i < bc.n_cigar; ++i) {
            switch (bam_cigar_opchr(cigar[i])) {
            case 'M':
            case 'D':
            case 'N':
            case '=':
            case 'X':
                length += bam_cigar_oplen(cigar[i]);
                break;
            default:
                break;
            }
        }
        return length;
    }

    // Global position of a record: contig id in the high bits, alignment
    // position in the low bits.
    static inline int64_t bam_global_pos(const bam1_t *b) {
        return ((static_cast<int64_t>(b->core.tid) << MAX_CONTIG_LEN_SHIFT) | static_cast<int64_t>(b->core.pos));
    }
    static inline int64_t bam_global_pos(int tid, int pos) {
        return ((static_cast<int64_t>(tid) << MAX_CONTIG_LEN_SHIFT) | static_cast<int64_t>(pos));
    }
    // Contig id encoded in a global position.
    static inline int32_t bam_tid(int64_t global_pos) {
        // Shifting discards the low (position) bits, so no explicit mask is needed.
        return static_cast<int32_t>(global_pos >> MAX_CONTIG_LEN_SHIFT);
    }
    // In-contig alignment position encoded in a global position.
    static inline int32_t bam_pos(int64_t global_pos) {
        const int64_t mask = (static_cast<int64_t>(1) << MAX_CONTIG_LEN_SHIFT) - 1;
        return static_cast<int32_t>(global_pos & mask);
    }

    // Sets or clears the duplicate flag on the record.
    void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); }
    // Sets (flag == true) or clears (flag == false) the given bit(s) in the record's flag field.
    void setFlag(bool flag, int bit) {
        if (flag)
            this->b->core.flag |= bit;
        else
            this->b->core.flag &= ~bit;
    }
};
// Map from a string key to a list of BamWrap pointers.
// NOTE(review): the alias name suggests the key is a sample name — confirm against callers.
using SampleBamMap = std::map<const std::string, std::vector<BamWrap *>>;