#include "covariate.h"

// Definitions of the static data members declared in covariate.h.
// NOTE(review): the template arguments below (vector<EventTypeValue>, map<string, int>,
// map<int, string>) were reconstructed from how the members are used in this file;
// confirm they match the declarations in covariate.h.

// for EventType
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
vector<EventTypeValue> EventType::EVENTS = {BASE_SUBSTITUTION, BASE_INSERTION, BASE_DELETION};

// static variables for ContextCovariate
int ContextCovariate::mismatchesContextSize;
int ContextCovariate::indelsContextSize;
int ContextCovariate::mismatchesKeyMask;
int ContextCovariate::indelsKeyMask;
uint8_t ContextCovariate::lowQualTail;
int ContextCovariate::baseIndexMap[256];

// for ReadGroupCovariate
map<string, int> ReadGroupCovariate::RgToId;  // mapping from read group name to id
map<int, string> ReadGroupCovariate::IdToRg;  // mapping from id to read group name

// for CycleCovariate
int CycleCovariate::MAXIMUM_CYCLE_VALUE;

// for CovariateUtils

/**
 * Computes all covariates for a single read and stores them in `values`.
 * The matrix is reused between reads, so it may still hold the previous read's values on entry.
 *
 * @param bw                the BAM record wrapper of the read
 * @param ad                per-read data (bases, read_len, left_clip) prepared by the caller
 * @param header            the SAM header (forwarded to each covariate)
 * @param values            output matrix receiving the covariate keys
 * @param recordIndelValues whether insertion/deletion keys are computed in addition to substitution keys
 */
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                       bool recordIndelValues) {
    ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
}

// ReadGroupCovariate methods

/**
 * Records the read-group key for every position of the read.
 * The key is looked up in RgToId using the read's "RG" tag; if the tag is missing or unknown,
 * an error is logged and key 0 is used.
 */
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                      bool recordIndelValues) {
    uint8_t* rgStr = bam_aux_get(bw->b, "RG");
    char* rgVal = nullptr;
    if (rgStr) rgVal = bam_aux2Z(rgStr);

    int key = 0;
    bool found = false;
    if (rgVal != nullptr) {
        // single lookup instead of find() followed by operator[]
        const auto it = RgToId.find(rgVal);
        if (it != RgToId.end()) {
            key = it->second;
            found = true;
        }
    }
    if (!found) {
        spdlog::error("The RG tag value for read can not be found in header!");
    }

    for (int i = 0; i < ad.read_len; ++i) {
        CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
    }
}

// BaseQualityCovariate methods

/**
 * Records the base-quality covariate for every position of the read.
 *
 * The substitution key is the base quality at `i + ad.left_clip`. When recordIndelValues is set,
 * the insertion/deletion keys come from the "BI"/"BD" tags, each falling back independently to
 * the default indel quality (45) when its tag is absent or is not a string tag. When
 * recordIndelValues is not set, both indel keys are 0.
 */
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                        bool recordIndelValues) {
    // after the earlier processing, quals should have the same length as the bases
    const int INDEL_QUAL = 45;
    const uint8_t* quals = bam_get_qual(bw->b);

    const uint8_t* insQuals = nullptr;  // base qualities for insertions
    const uint8_t* delQuals = nullptr;  // base qualities for deletions
    if (recordIndelValues) {
        if (uint8_t* insQualPtr = bam_aux_get(bw->b, "BI")) {
            insQuals = reinterpret_cast<const uint8_t*>(bam_aux2Z(insQualPtr));
        }
        if (uint8_t* delQualPtr = bam_aux_get(bw->b, "BD")) {
            delQuals = reinterpret_cast<const uint8_t*>(bam_aux2Z(delQualPtr));
        }
    }

    // NOTE(review): the tag bytes are used as-is and indexed by i (not i + ad.left_clip), as in the
    // previous implementation. GATK decodes BI/BD as Phred+33 strings; confirm whether an offset of 33
    // and/or the left-clip offset should be applied here.
    for (int i = 0; i < ad.read_len; ++i) {
        int insKey = 0;
        int delKey = 0;
        if (recordIndelValues) {
            insKey = insQuals ? insQuals[i] : INDEL_QUAL;
            delKey = delQuals ? delQuals[i] : INDEL_QUAL;
        }
        CovariateUtils::SetCovariate(quals[i + ad.left_clip], insKey, delKey, i, BaseQualityCovariate::index, values);
    }
}

// ContextCovariate methods

// Returns the complement of an A/C/G/T base (upper-case result); any other character is returned unchanged.
static char SimpleComplement(const char base) {
    switch (base) {
        case 'A':
        case 'a':
            return 'T';
        case 'C':
        case 'c':
            return 'G';
        case 'G':
        case 'g':
            return 'C';
        case 'T':
        case 't':
            return 'A';
        default:
            return base;
    }
}

/**
 * Produces the read's bases with the low-quality ends replaced by 'N', in sequencing orientation
 * (reverse-complemented for reads on the negative strand).
 *
 * The low-quality tails are located in the stored orientation, the same orientation as the
 * qualities, and only then is the sequence reverse-complemented, so that the Ns stay attached
 * to the bases they were computed for.
 *
 * @param bw           the BAM record wrapper of the read
 * @param ad           per-read data; ad.bases[0..read_len) are the bases, qualities start at ad.left_clip
 * @param clippedBases in/out: must already hold a copy of ad.bases (at least read_len characters);
 *                     cleared if every base has quality <= lowQTail
 * @param lowQTail     qualities less than or equal to this value are treated as low quality
 */
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    const uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
    const int len = ad.read_len;

    // left side: first position (stored orientation) whose quality is above the threshold
    int left = 0;
    while (left < len && quals[left] <= lowQTail) ++left;
    if (left == len) {
        // the whole read is low quality
        clippedBases.clear();
        return;
    }

    // right side: last position whose quality is above the threshold (stops at `left` at the latest)
    int right = len - 1;
    while (right > left && quals[right] <= lowQTail) --right;

    if (bw->GetReadNegativeStrandFlag()) {
        // reverse complement, carrying the N mask along with the bases
        for (int i = 0; i < len; ++i) {
            const int src = len - 1 - i;
            clippedBases[i] = (src < left || src > right) ? 'N' : SimpleComplement(ad.bases[src]);
        }
    } else {
        for (int i = 0; i < left; ++i) clippedBases[i] = 'N';
        for (int i = right + 1; i < len; ++i) clippedBases[i] = 'N';
    }
}

/**
 * Creates an int representation of a given dna string.
 *
 * @param dna   the dna sequence
 * @param start the start position in the byte array (inclusive)
 * @param end   the end position in the array (exclusive)
 * @return the key representing the dna sequence, or -1 if the range contains a base
 *         that baseIndexMap maps to -1 (non-ACGT)
 */
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
    // the low LENGTH_BITS bits hold the context length; each base then takes 2 bits above that
    int key = end - start;
    int bitOffset = LENGTH_BITS;
    for (int i = start; i < end; i++) {
        const int baseIndex = baseIndexMap[dna[i] & 0xff];
        if (baseIndex == -1) {  // ignore non-ACGT bases
            return -1;
        }
        key |= (baseIndex << bitOffset);
        bitOffset += 2;
    }
    return key;
}

/**
 * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
 *
 * For example, for the read [AGCTG] and a context size of 2, the keys are
 * [-1, "AG", "GC", "CT", "TG" ]
 * with each string context encoded as an integer.
 *
 * @param bases       the bases in the read to build the context from
 * @param contextSize context size to use building the context
 * @param mask        mask for pulling out just the context bits
 * @param keys        output: resized to the read length; holds the (preceding) n-base context key at each position
 */
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask,
                                                    vector<int>& keys) {
    int readLength = bases.size();
    keys.resize(readLength);
    int keyIdx = 0;

    // the first contextSize-1 bases will not have enough previous context
    for (int i = 1; i < contextSize && i <= readLength; i++) {
        keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
    }

    if (readLength < contextSize) return;

    int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;

    // get (and add) the key for the context starting at the first base
    int currentKey = KeyFromContext(bases, 0, contextSize);
    keys[keyIdx++] = currentKey;

    // if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts
    // it affects
    int currentNPenalty = 0;
    if (currentKey == -1) {
        currentKey = 0;
        currentNPenalty = contextSize - 1;
        int offset = newBaseOffset;
        int baseIndex;
        // walks backwards from the end of the first context; terminates at the non-ACGT base that made the key -1
        // (`& 0xff` keeps the table index non-negative for chars >= 0x80, as in KeyFromContext)
        while ((baseIndex = baseIndexMap[bases[currentNPenalty] & 0xff]) != -1) {
            currentKey |= (baseIndex << offset);
            offset -= 2;
            currentNPenalty--;
        }
    }

    for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
        const int baseIndex = baseIndexMap[bases[currentIndex] & 0xff];
        if (baseIndex == -1) {  // ignore non-ACGT bases
            currentNPenalty = contextSize;
            currentKey = 0;  // reset the key
        } else {
            // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits,
            // and add the new base and the length in
            currentKey = (currentKey >> 2) & mask;
            currentKey |= (baseIndex << newBaseOffset);
            currentKey |= contextSize;
        }

        if (currentNPenalty == 0) {
            keys[keyIdx++] = currentKey;
        } else {
            currentNPenalty--;
            keys[keyIdx++] = -1;
        }
    }
}

/**
 * Records the context covariate for every position of the read.
 * Substitution keys use mismatchesContextSize; when recordIndelValues is set, insertion and deletion
 * keys both use indelsContextSize, otherwise they are 0.
 */
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                    bool recordIndelValues) {
    const int originalReadLength = ad.read_len;

    // store the original bases and then write Ns over low quality ones
    string strandedClippedBases(ad.bases);
    GetStrandedClippedBytes(bw, ad, strandedClippedBases, lowQualTail);
    // spdlog::info("bases: {}", strandedClippedBases);
    vector<int> nBasePairContextAtEachCycle;
    GetReadContextAtEachPosition(strandedClippedBases, mismatchesContextSize, mismatchesKeyMask,
                                 nBasePairContextAtEachCycle);

    const int readLengthAfterClipping = strandedClippedBases.size();

    // this is necessary to ensure that we don't keep historical data in the ReadCovariates values
    // since the context covariate may not span the entire set of values in read covariates
    // due to the clipping of the low quality bases
    if (readLengthAfterClipping != originalReadLength) {
        // don't bother zeroing out if we are going to overwrite the whole array
        for (int i = 0; i < originalReadLength; i++) {
            // this base has been clipped off, so zero out the covariate values here
            CovariateUtils::SetCovariate(0, 0, 0, i, ContextCovariate::index, values);
        }
    }

    const bool negativeStrand = bw->GetReadNegativeStrandFlag();

    // Note: duplicated the loop to avoid checking recordIndelValues on each iteration
    if (recordIndelValues) {
        vector<int> indelKeys;
        GetReadContextAtEachPosition(strandedClippedBases, indelsContextSize, indelsKeyMask, indelKeys);
        for (int i = 0; i < readLengthAfterClipping; i++) {
            const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping);
            const int indelKey = indelKeys[i];
            CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], indelKey, indelKey, readOffset,
                                         ContextCovariate::index, values);
        }
    } else {
        for (int i = 0; i < readLengthAfterClipping; i++) {
            const int readOffset = GetStrandedOffset(negativeStrand, i, readLengthAfterClipping);
            CovariateUtils::SetCovariate(nBasePairContextAtEachCycle[i], 0, 0, readOffset, ContextCovariate::index,
                                         values);
        }
    }
}

// CycleCovariate methods

/**
 * Computes the encoded value of CycleCovariate's key for the given position at the read.
 * Uses KeyFromCycle to do the encoding.
 *
 * The cycle counts from 1 upwards for forward-strand reads and from read_len downwards for
 * reverse-strand reads; it is negated for the second read of a pair.
 *
 * @param bw         the BAM record wrapper of the read
 * @param ad         per-read data (read_len is used)
 * @param baseNumber index of the base to compute the key for
 * @param indel      is this an indel key or a substitution key?
 * @param maxCycle   max value of the cycle, passed through to KeyFromCycle
 *                   (NOTE(review): how KeyFromCycle reacts to a cycle above this value is not visible here)
 * @return the encoded key, or -1 for indel keys within CUSHION_FOR_INDELS bases of either end of the read
 */
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
    const bool isNegStrand = bw->GetReadNegativeStrandFlag();
    const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
    const int readLength = ad.read_len;

    const int readOrderFactor = isSecondInPair ? -1 : 1;
    int increment;
    int cycle;
    if (isNegStrand) {
        cycle = readLength * readOrderFactor;
        increment = -1 * readOrderFactor;
    } else {
        cycle = readOrderFactor;
        increment = readOrderFactor;
    }

    cycle += baseNumber * increment;

    if (!indel) {
        return CycleCovariate::KeyFromCycle(cycle, maxCycle);
    }
    const int maxCycleForIndels = readLength - CUSHION_FOR_INDELS - 1;
    if (baseNumber < CUSHION_FOR_INDELS || baseNumber > maxCycleForIndels) {
        return -1;
    } else {
        return CycleCovariate::KeyFromCycle(cycle, maxCycle);
    }
}

// Used to pick out the covariate's value from attributes of the read
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                  bool recordIndelValues) {
    const int readLength = ad.read_len;
    // Note: duplicate the loop to avoid checking recordIndelValues on every iteration
    if (recordIndelValues) {
        for (int i = 0; i < readLength; i++) {
            const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
            const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE);
            CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
        }
    } else {
        for (int i = 0; i < readLength; i++) {
            const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
            CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
        }
    }
}