/* Description: 读入sam/bam时,开辟一个大的buf,存放这些数据 Copyright : All right reserved by ICT Author : Zhang Zhonghai Date : 2019/11/27 */ #pragma once #include #include #include #include #include #include #include #include #include using namespace std; struct Cigar { char op = '0'; int len = 0; // 该操作符是否消耗read的碱基 static bool ConsumeReadBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'I' || cigar == 'S'; } // 该操作符是否消耗参考基因组的碱基 static bool ConsumeRefBases(char cigar) { return cigar == 'M' || cigar == '=' || cigar == 'X' || cigar == 'D' || cigar == 'N'; } }; struct ReadIdxCigar { int readIdx = 0; // 在read序列中的位置 char cigarOp = '0'; // 当前位置对应的cigar }; // 不用经常释放array的内存空间,减少频繁的内存开辟和释放操作 template struct FastArray { vector arr; size_t idx; void clear() { idx = 0; } size_t size() { return idx; } bool empty() { return idx == 0; } void reserve(size_t _size) { arr.reserve(_size); } void resize(size_t _size) { arr.resize(_size); idx = _size; } void push_back(const T& val) { if (idx < arr.size()) { arr[idx++] = val; } else { arr.push_back(val); idx++; } } inline T& operator[](size_t pos) { return arr[pos]; } struct iterator { typename std::vector::iterator it; iterator(typename std::vector::iterator _it) : it(_it) {} iterator& operator++() { ++it; return *this;} iterator& operator--() { --it; return *this;} T& operator*() const { return *it; } bool operator!=(const iterator& other) const { return it != other.it; } bool operator==(const iterator& other) const { return it == other.it; } }; iterator begin() { return arr.begin(); } iterator end() { return arr.begin() + idx; } }; // 对原始bam数据的补充,比如对两端进行hardclip等 class BamWrap; struct SamData { int read_len = 0; // read长度,各种clip之后的长度 int cigar_start = 0; // cigar起始位置,闭区间 int cigar_end = 0; // cigar结束位置,开区间 int first_cigar_clip = 0; // 第一个cigar, clip的数量,切左侧 int last_cigar_clip = 0; // 最后一个cigar, clip的数量,切右侧 int left_clip = 0; // 左侧被切掉的碱基长度 int right_clip = 0; // 右侧被切掉的碱基长度 int ref_offset = 0; // 切除adapter和softclip之后(softclip应该不影响),相对原始ref比对位置(contig_pos)的偏移量 // 记录一下bqsr运算过程中用到的数据,回头提前计算一下,修正现在的复杂逻辑 static constexpr int READ_INDEX_NOT_FOUND = -1; BamWrap* bw; int64_t start_pos; // 因为soft clip都被切掉了,这里的softstart应该就是切掉之后的匹配位点,闭区间 int64_t end_pos; // 同上,闭区间 string bases; // 处理之后的read的碱基 FastArray base_quals; // 对应的质量分数 FastArray ins_quals; // insert质量分数, BI (大部分应该都没有) FastArray del_quals; // delete质量分数, BD (大部分应该都没有) FastArray cigars; int64_t& softStart() { return start_pos; } int64_t& softEnd() { return end_pos; } // functions ReadIdxCigar getReadIndexForReferenceCoordinate(int64_t refPos) { ReadIdxCigar rc; if (refPos < start_pos) return rc; int firstReadPosOfElement = 0; // inclusive int firstRefPosOfElement = start_pos; // inclusive int lastReadPosOfElement = 0; // exclusive int lastRefPosOfElement = start_pos; // exclusive // advance forward through all the cigar elements until we bracket the reference coordinate for (auto& cigar : cigars) { firstReadPosOfElement = lastReadPosOfElement; firstRefPosOfElement = lastRefPosOfElement; lastReadPosOfElement += Cigar::ConsumeReadBases(cigar.op) ? cigar.len : 0; lastRefPosOfElement += Cigar::ConsumeRefBases(cigar.op) || cigar.op == 'S' ? cigar.len : 0; if (firstRefPosOfElement <= refPos && refPos < lastRefPosOfElement) { // refCoord falls within this cigar element int readPosAtRefCoord = firstReadPosOfElement + (Cigar::ConsumeReadBases(cigar.op) ? (refPos - firstRefPosOfElement) : 0); rc.cigarOp = cigar.op; rc.readIdx = readPosAtRefCoord; return rc; } } return rc; } }; /* 这里的成员函数命名有点混乱,特此说明,小写加下划线的函数命名,无论是静态函数,还是普通成员函数,更侧重说明 这是类似bam的一个属性,而大写加驼峰命名的函数,更侧重说明这是通过计算得出的。 */ /* * sam read的封装 */ struct BamWrap { // 将contig左移后加上pos作为全局位置 const static int MAX_CONTIG_LEN_SHIFT = 40; // 将染色体id左移多少位,和位点拼合在一起 const static int READ_MAX_LENGTH = 200; const static int READ_MAX_DEPTH = 1000; // 这只是用来初始化空间用的,深度大于这个值也没关系 // 成员变量尽量少,减少占用内存空间 bam1_t *b; int64_t end_pos_; // bam的全局结束位置, 相对ref, 闭区间 // 全局开始位置 inline int64_t start_pos() { return bam_global_pos(b); } // 全局结束位置 inline int64_t end_pos() { return end_pos_; } // 和reference对应的序列长度,不是read包含碱基的个数 inline int16_t read_len() { return (end_pos_ - start_pos() + 1); } // contig id inline int32_t contig_id() { return b->core.tid; } // 在contig内的开始位置 inline int32_t contig_pos() { return b->core.pos; } // 在contig内部的结束位置 inline int32_t contig_end_pos() { return bam_pos(end_pos_); } // 序列的长度(AGTC字母个数) inline int16_t seq_len() { return b->core.l_qseq; } /* // 算上开头的softclip inline int32_t softclip_start() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; const char c = bam_cigar_opchr(cigar[0]); const int len = bam_cigar_oplen(cigar[0]); if (c == 'S') return bc.pos - len; return bc.pos; } inline int64_t global_softclip_start() { return softclip_start() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT); } // 算上结尾的softclip,闭区间 inline int32_t softclip_end() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; const int idx = bc.n_cigar - 1; if (idx < 0) return bam_pos(end_pos_); const char c = bam_cigar_opchr(cigar[idx]); const int len = bam_cigar_oplen(cigar[idx]); if (c == 'S') return bam_pos(end_pos_) + len; return bam_pos(end_pos_); } inline int64_t global_softclip_end() { return softclip_end() + ((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT); } // 右边softclip的长度 inline int32_t right_softclip_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; const char c = bam_cigar_opchr(cigar[bc.n_cigar - 1]); const int len = bam_cigar_oplen(cigar[bc.n_cigar - 1]); if (c == 'S') return len; return 0; } */ // 获取序列 inline std::string sequence() { ostringstream oss; char *seq = (char *)bam_get_seq(b); const bam1_core_t &bc = b->core; const char base_to_char[16] = {'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'}; for (int i = 0; i < bc.l_qseq; ++i) { char base = base_to_char[bam_seqi(seq, i)]; oss << base; } return std::move(oss.str()); } // 获取名字 inline const char *query_name() { return bam_get_qname(b); } // 获取cigar 字符串 inline string cigar_str() { ostringstream oss; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); oss << len << c; } return std::move(oss.str()); } // 占用的内存大小 inline int16_t length() { return sizeof(*this) + sizeof(bam1_t) + b->l_data; } // 获取cigar中insert的总长度 inline int32_t insert_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int ret = 0; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'I') ret += len; } return ret; } // 获取cigar中delete的总长度 inline int32_t del_cigar_len() { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int ret = 0; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'D') ret += len; } return ret; } // 计算sam read的终点位置,相对参考基因组 static inline int64_t BamEndPos(const bam1_t *b) { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int start_offset = -1; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'D' || c == 'N' || c == 'M' || c == '=' || c == 'X') start_offset += len; } return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)(b->core.pos + start_offset)); }; // 计算read的有效长度,即除了softclip和hardclip之外的长度 static inline int BamEffectiveLength(const bam1_t *b) { const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; int effective_len = 0; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'I' || c == 'N' || c == 'M' || c == '=' || c == 'X') effective_len += len; } return effective_len; }; bool HasWellDefinedFragmentSize() { const bam1_core_t &bc = b->core; bool hasWellDefinedFragmentSize = true; if (bc.isize == 0 || !(bc.flag & BAM_FPAIRED) || ((bc.flag & BAM_FUNMAP) || (bc.flag & BAM_FMUNMAP)) || ((bool)(bc.flag & BAM_FREVERSE) == (bool)(bc.flag & BAM_FMREVERSE))) { hasWellDefinedFragmentSize = false; } else if (bc.flag & BAM_FREVERSE) { hasWellDefinedFragmentSize = contig_end_pos() > bc.mpos ? true : false; } else { hasWellDefinedFragmentSize = bc.pos <= bc.mpos + bc.isize ? true : false; } return hasWellDefinedFragmentSize; } // 计算bam的adapterBoundary int GetAdapterBoundary() { const bam1_core_t &bc = b->core; int adapterBoundary = INT_MIN; if (!HasWellDefinedFragmentSize()) adapterBoundary = INT_MIN; else if (bc.flag & BAM_FREVERSE) adapterBoundary = bc.mpos - 1; else adapterBoundary = bc.pos + abs(bc.isize); // GATK4.0 和 GATK3.5不一样,3.5的这里+1 return adapterBoundary; } // 检测adapter boundary是否在read范围内 bool IsAdapterInRead(int adapterBoundary) { return (adapterBoundary != INT_MIN && (adapterBoundary >= contig_pos() && adapterBoundary <= contig_end_pos())); } // 获取开头的I的长度 inline int GetHeadInsertLen() { int insLen = 0; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'I') { insLen = len; break; } else if (c != 'H' && c != 'S') break; } return insLen; } // 获取soft clip开始位置(能处理H和S相连的情况,有这种情况么?, // 注意开头的I要当做S?) inline int64_t GetSoftStart() { int64_t softStart = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); // if (c == 'S' || c == 'I') if (c == 'S') softStart -= len; else if (c != 'H') break; } return softStart; } /** * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. * * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips. * * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ inline int64_t GetSoftEnd() { int64_t softEnd = contig_end_pos(); const uint32_t* cigar = bam_get_cigar(b); const bam1_core_t& bc = b->core; bool foundAlignedBase = false; for (int i = bc.n_cigar - 1; i >= 0; --i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); // if (c == 'S' || c == 'I') if (c == 'S') softEnd += len; else if (c != 'H') { foundAlignedBase = true; break; } } if (!foundAlignedBase) { // for example 64H14S, the soft end is actually the same as the alignment end softEnd = contig_end_pos(); } return softEnd; } // 获取unclipped开始位置(包括hardclip) inline int64_t GetUnclippedStart() { int64_t start = b->core.pos; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'S' || c == 'H') start -= len; else break; } return start; } // 获取unclipped结束位置(包括hardclip) inline int64_t GetUnclippedEnd() { int64_t end_pos = bam_endpos(b); const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; for (int i = bc.n_cigar - 1; i >= 0; --i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); if (c == 'S' || c == 'H') end_pos += len; else break; } return end_pos - 1; } /* 获取碱基质量分数的加和 */ /** Calculates a score for the read which is the sum of scores over Q15. */ inline int GetSumOfBaseQualities() { int score = 0; uint8_t *qual = bam_get_qual(b); for (int i = 0; i < b->core.l_qseq; ++i) { if (qual[i] >= 15) score += qual[i]; } return score; } /* 与flag相关的检测 */ /* 没有比对上 unmapped */ inline bool GetReadUnmappedFlag() { return b->core.flag & BAM_FUNMAP; } /* Template having multiple segments in sequencing */ inline bool GetReadPairedFlag() { return b->core.flag & BAM_FPAIRED; } /** * the read fails platform/vendor quality checks. */ inline bool GetReadFailsVendorQualityCheckFlag() { return b->core.flag & BAM_FQCFAIL; } /** * the mate is unmapped. */ bool GetMateUnmappedFlag() { return b->core.flag & BAM_FMUNMAP; } /** * @return whether the alignment is secondary (an alternative alignment of * the read). */ bool IsSecondaryAlignment() { return b->core.flag & BAM_FSECONDARY; } /** * @return whether the alignment is supplementary (a split alignment such as * a chimeric alignment). */ bool GetSupplementaryAlignmentFlag() { return b->core.flag & BAM_FSUPPLEMENTARY; } /* * Tests if this record is either a secondary and/or supplementary * alignment; */ bool IsSecondaryOrSupplementary() { return IsSecondaryAlignment() || GetSupplementaryAlignmentFlag(); } /** * the read is the first read in a pair. */ bool GetFirstOfPairFlag() { return b->core.flag & BAM_FREAD1; } /** * strand of the query (false for forward; true for reverse strand). */ bool GetReadNegativeStrandFlag() { return b->core.flag & BAM_FREVERSE; } /** * strand of the mate (false for forward; true for reverse strand). */ bool GetMateNegativeStrandFlag() { return b->core.flag & BAM_FMREVERSE; } /* 其他的一些信息 */ inline int GetReferenceLength() { int length = 0; const uint32_t *cigar = bam_get_cigar(b); const bam1_core_t &bc = b->core; for (int i = 0; i < bc.n_cigar; ++i) { const char c = bam_cigar_opchr(cigar[i]); const int len = bam_cigar_oplen(cigar[i]); switch (c) { case 'M': case 'D': case 'N': case '=': case 'X': length += len; break; default: break; } } return length; } // 计算bam的全局位置,算上染色体序号和比对位置 static inline int64_t bam_global_pos(bam1_t *b) { return (((int64_t)b->core.tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)b->core.pos); } static inline int64_t bam_global_pos(int tid, int pos) { return (((int64_t)tid << MAX_CONTIG_LEN_SHIFT) | (int64_t)pos); } // 根据全局位置获取bam的染色体序号 static inline int32_t bam_tid(int64_t global_pos) { const int64_t mask = ~(((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1); const int64_t high_tid = global_pos & mask; return (int32_t)(high_tid >> MAX_CONTIG_LEN_SHIFT); } // 根据全局位置获取bam的比对位置(染色体内) static inline int32_t bam_pos(int64_t global_pos) { const int64_t mask = ((int64_t)1 << MAX_CONTIG_LEN_SHIFT) - 1; return (int32_t)(global_pos & mask); } // 设置是否冗余的标记 void SetDuplicateReadFlag(bool flag) { setFlag(flag, BAM_FDUP); } void setFlag(bool flag, int bit) { if (flag) this->b->core.flag |= bit; else this->b->core.flag &= ~bit; } }; typedef std::map> SampleBamMap;