FastBQSR/src/bqsr/covariate.cpp

318 lines
13 KiB
C++
Raw Normal View History

#include "covariate.h"
// Definitions of the EventType static members: the three event kinds and the list that collects them.
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
vector<EventTypeValue> EventType::EVENTS = {BASE_SUBSTITUTION, BASE_INSERTION, BASE_DELETION};
// Static member variables of ContextCovariate (defined here without explicit initializers;
// where they are assigned is not visible in this file section).
int ContextCovariate::mismatchesContextSize;
int ContextCovariate::indelsContextSize;
int ContextCovariate::mismatchesKeyMask;
int ContextCovariate::indelsKeyMask;
uint8_t ContextCovariate::lowQualTail;
int ContextCovariate::baseIndexMap[256];
// Static members of ReadGroupCovariate
map<string, int> ReadGroupCovariate::RgToId; // mapping from read group name to id
map<int, string> ReadGroupCovariate::IdToRg; // mapping from id to read group name
// Static member of CycleCovariate
int CycleCovariate::MAXIMUM_CYCLE_VALUE;
// for CovariateUtils
// Computes all covariates for a single read by invoking each covariate's RecordValues in turn
// (read group, base quality, context, cycle) with the same arguments.
// NOTE(review): the original author's note says the covariate storage was used by the previous read,
// i.e. `values` is presumably reused across reads — confirm against the caller.
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
bool recordIndelValues) {
ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
}
// Methods of the ReadGroupCovariate covariate
// Resolves the read's "RG" aux tag to an id through RgToId and records that id as the substitution,
// insertion and deletion value for every position of the read. When the tag is absent or its value is
// not present in RgToId, an error is logged and id 0 is recorded instead.
// `header` and `recordIndelValues` are not used by this covariate.
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    uint8_t* rgTag = bam_aux_get(bw->b, "RG");
    char* rgName = (rgTag != nullptr) ? bam_aux2Z(rgTag) : nullptr;

    // One map lookup serves both the "is it known?" check and the id retrieval.
    int rgId = 0;
    const auto hit = (rgName != nullptr) ? RgToId.find(rgName) : RgToId.end();
    if (hit != RgToId.end()) {
        rgId = hit->second;
    } else {
        spdlog::error("The RG tag value for read can not be found in header!");
    }

    for (int pos = 0; pos < ad.read_len; ++pos) {
        CovariateUtils::SetCovariate(rgId, rgId, rgId, pos, ReadGroupCovariate::index, values);
    }
}
// Methods of the BaseQualityCovariate covariate
// Records, for every position of the (clipped) read, the base quality as the substitution value and the
// insertion / deletion qualities as the indel values.
//
// - The base quality for position i is read from the BAM quality array at i + ad.left_clip
//   (after the earlier processing the qualities are expected to line up with the bases).
// - When recordIndelValues is false, the indel values are recorded as 0.
// - When recordIndelValues is true, insertion qualities come from the "BI" tag and deletion qualities
//   from the "BD" tag; whichever tag is missing (or is not a string tag) falls back to INDEL_QUAL.
//   Both tags are honoured when both are present (previously the "BD" values were dropped whenever
//   "BI" existed).
//
// `header` is not used by this covariate.
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                        bool recordIndelValues) {
    const int INDEL_QUAL = 45;  // default quality used when a read carries no indel quality tag
    const uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;

    const uint8_t* insQuals = nullptr;
    const uint8_t* delQuals = nullptr;
    if (recordIndelValues) {
        uint8_t* insTag = bam_aux_get(bw->b, "BI");  // base qualities for insertions
        uint8_t* delTag = bam_aux_get(bw->b, "BD");  // base qualities for deletions
        // bam_aux2Z yields a null pointer when the tag is not of string type; treat that as "no tag".
        if (insTag != nullptr) insQuals = reinterpret_cast<const uint8_t*>(bam_aux2Z(insTag));
        if (delTag != nullptr) delQuals = reinterpret_cast<const uint8_t*>(bam_aux2Z(delTag));
    }

    // NOTE(review): the tag strings are indexed with i, while the base qualities are indexed with
    // i + ad.left_clip — if the tags cover the unclipped read this looks misaligned; confirm.
    // NOTE(review): the tag bytes are used as-is (no Phred+33 decoding) — confirm this is intended.
    const int missingIndelQual = recordIndelValues ? INDEL_QUAL : 0;
    for (int i = 0; i < ad.read_len; ++i) {
        const int insQual = (insQuals != nullptr) ? insQuals[i] : missingIndelQual;
        const int delQual = (delQuals != nullptr) ? delQuals[i] : missingIndelQual;
        CovariateUtils::SetCovariate(quals[i], insQual, delQual, i, BaseQualityCovariate::index, values);
    }
}
// Methods of the ContextCovariate covariate
// Returns the complement of a nucleotide: A<->T and C<->G. Lower-case input is accepted, but the
// complement is always returned in upper case. Any other character is returned unchanged.
static char SimpleComplement(const char base) {
    if (base == 'A' || base == 'a') return 'T';
    if (base == 'C' || base == 'c') return 'G';
    if (base == 'G' || base == 'g') return 'C';
    if (base == 'T' || base == 't') return 'A';
    return base;
}
// Produces the strand-oriented base string used for context computation, with the low-quality tails
// of the read replaced by 'N'.
//
// A tail is the maximal run of positions, starting from either end of the read, whose base quality is
// <= lowQTail. Qualities are read from the BAM quality array starting at ad.left_clip.
//
// @param bw            the BAM record wrapper (supplies qualities and the strand flag)
// @param ad            per-read data; ad.bases holds the read bases and ad.read_len their count
// @param clippedBases  in/out buffer. It must already hold at least ad.read_len characters; for
//                      forward-strand reads its existing content is kept outside the tails, for
//                      reverse-strand reads it is overwritten with the reverse complement of ad.bases.
//                      It is cleared (left empty) when every base of the read is low quality.
// @param lowQTail      quality threshold; bases with quality <= lowQTail at the ends are masked
//
// The tails are determined in the read's stored (forward) orientation first and only then mapped into
// the reverse-complemented string. Masking the already reversed string with forward-ordered qualities
// would put the N's on the wrong end of reverse-strand reads.
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    const int readLen = ad.read_len;
    const uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;

    // First position from the left whose quality is above the threshold.
    int left = 0;
    while (left < readLen && quals[left] <= lowQTail) ++left;
    if (left == readLen) {
        // The whole read (or an empty read) is low quality: nothing usable remains.
        clippedBases.clear();
        return;
    }

    // Last position from the right whose quality is above the threshold. quals[left] is known to be
    // above the threshold, so the scan can stop there.
    int right = readLen - 1;
    while (right > left && quals[right] <= lowQTail) --right;

    if (bw->GetReadNegativeStrandFlag()) {
        // Reverse complement, masking positions that fall in a tail of the forward-oriented read.
        for (int i = 0; i < readLen; ++i) {
            const int src = readLen - 1 - i;
            const bool inTail = (src < left) || (src > right);
            clippedBases[i] = inTail ? 'N' : SimpleComplement(ad.bases[src]);
        }
    } else {
        for (int i = 0; i < left; ++i) clippedBases[i] = 'N';
        for (int i = right + 1; i < readLen; ++i) clippedBases[i] = 'N';
    }
}
/**
* Creates a int representation of a given dna string.
*
* @param dna the dna sequence
* @param start the start position in the byte array (inclusive)
* @param end the end position in the array (exclusive)
* @return the key representing the dna sequence
*/
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
int key = end - start;
int bitOffset = LENGTH_BITS;
for (int i = start; i < end; i++) {
const int baseIndex = baseIndexMap[dna[i] & 0xff];
if (baseIndex == -1) { // ignore non-ACGT bases
return -1;
}
key |= (baseIndex << bitOffset);
bitOffset += 2;
}
return key;
}
/**
 * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
 *
 * For example, for the read [AGCTG] and a context size of 2, `keys` becomes
 * [-1, "AG", "GC", "CT", "TG" ]
 * with each string context encoded as an integer.
 *
 * @param bases       the bases in the read to build the context from
 * @param contextSize context size to use building the context
 * @param mask        mask for pulling out just the context bits
 * @param keys        output; resized to the read length and filled with the (preceding) n-base context
 *                    key at each position. Positions without enough preceding bases get
 *                    UNKNOWN_OR_ERROR_CONTEXT_CODE; positions whose context contains a non-ACGT base get -1.
 *
 * Every lookup into baseIndexMap masks the base with 0xff (as KeyFromContext does) so that a byte
 * >= 0x80 can never produce a negative table index on platforms where char is signed.
 */
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
    const int readLength = bases.size();
    keys.resize(readLength);
    int keyIdx = 0;
    // the first contextSize-1 bases will not have enough previous context
    for (int i = 1; i < contextSize && i <= readLength; i++) {
        keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
    }
    if (readLength < contextSize)
        return;

    // bit position at which the newest base of a context is stored
    const int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;

    // get (and add) the key for the context starting at the first base
    int currentKey = KeyFromContext(bases, 0, contextSize);
    keys[keyIdx++] = currentKey;

    // if the first key was -1 then there was a non-ACGT base in the context; figure out how many more
    // consecutive contexts it affects
    int currentNPenalty = 0;
    if (currentKey == -1) {
        currentKey = 0;
        currentNPenalty = contextSize - 1;
        int offset = newBaseOffset;
        int baseIndex;
        // Walk backwards from the end of the first context, collecting valid bases until the non-ACGT
        // base is reached. One exists within [0, contextSize) because KeyFromContext returned -1.
        while ((baseIndex = baseIndexMap[bases[currentNPenalty] & 0xff]) != -1) {
            currentKey |= (baseIndex << offset);
            offset -= 2;
            currentNPenalty--;
        }
    }

    for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
        const int baseIndex = baseIndexMap[bases[currentIndex] & 0xff];
        if (baseIndex == -1) {  // ignore non-ACGT bases
            currentNPenalty = contextSize;
            currentKey = 0;  // reset the key
        } else {
            // push this base's contribution onto the key: shift everything 2 bits, mask out the
            // non-context bits, and add the new base and the length in
            currentKey = (currentKey >> 2) & mask;
            currentKey |= (baseIndex << newBaseOffset);
            currentKey |= contextSize;
        }
        if (currentNPenalty == 0) {
            keys[keyIdx++] = currentKey;
        } else {
            // this context still overlaps a non-ACGT base
            currentNPenalty--;
            keys[keyIdx++] = -1;
        }
    }
}
// Records the context covariate for every position of the read.
// The substitution value is the mismatch-context key; the insertion and deletion values are both the
// indel-context key when recordIndelValues is set, and 0 otherwise. Keys are computed on the
// strand-oriented, low-quality-masked bases and written at the offset GetStrandedOffset returns.
// `header` is not used by this covariate.
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int fullLength = ad.read_len;

    // start from a copy of the read bases, then let the helper orient it and write Ns over low quality ends
    string maskedBases(ad.bases);
    GetStrandedClippedBytes(bw, ad, maskedBases, lowQualTail);

    vector<int> mismatchKeys;
    GetReadContextAtEachPosition(maskedBases, mismatchesContextSize, mismatchesKeyMask, mismatchKeys);
    const int keptLength = maskedBases.size();

    // If the masked string does not span the whole read, the loops below will not touch every position,
    // so wipe all positions first to avoid keeping historical data in `values`. When the lengths agree
    // everything is overwritten anyway and the wipe is skipped.
    if (keptLength != fullLength) {
        for (int pos = 0; pos < fullLength; ++pos) {
            CovariateUtils::SetCovariate(0, 0, 0, pos, ContextCovariate::index, values);
        }
    }

    const bool isReverse = bw->GetReadNegativeStrandFlag();

    // Two separate loops so that recordIndelValues is not tested on every iteration.
    if (!recordIndelValues) {
        for (int pos = 0; pos < keptLength; ++pos) {
            const int target = GetStrandedOffset(isReverse, pos, keptLength);
            CovariateUtils::SetCovariate(mismatchKeys[pos], 0, 0, target, ContextCovariate::index, values);
        }
        return;
    }

    vector<int> indelKeys;
    GetReadContextAtEachPosition(maskedBases, indelsContextSize, indelsKeyMask, indelKeys);
    for (int pos = 0; pos < keptLength; ++pos) {
        const int target = GetStrandedOffset(isReverse, pos, keptLength);
        const int indelKey = indelKeys[pos];
        CovariateUtils::SetCovariate(mismatchKeys[pos], indelKey, indelKey, target, ContextCovariate::index, values);
    }
}
// Methods of the CycleCovariate covariate
/**
 * Computes the encoded value of CycleCovariate's key for the given position of the read.
 * The cycle number is derived from the position, the strand and whether the read is the second of a
 * pair; KeyFromCycle then does the encoding.
 *
 * @param bw         the BAM record wrapper (supplies the strand and pairing flags)
 * @param ad         per-read data; ad.read_len is used as the read length
 * @param baseNumber index of the base to compute the key for
 * @param indel      is this an indel key or a substitution key?
 * @param maxCycle   maximum cycle value, forwarded to KeyFromCycle
 *                   (how an out-of-range cycle is handled is decided there, not in this function)
 * @return the encoded key, or -1 for an indel key whose position lies within CUSHION_FOR_INDELS
 *         bases of either end of the read
 */
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
    const bool reverse = bw->GetReadNegativeStrandFlag();
    const auto flags = bw->b->core.flag;
    const bool secondOfPair = (flags & BAM_FPAIRED) && (flags & BAM_FREAD2);
    const int len = ad.read_len;

    // Second-of-pair reads count with negative cycle numbers.
    const int sign = secondOfPair ? -1 : 1;
    // Reverse-strand reads start at the far end of the read and count back towards it.
    const int firstCycle = reverse ? len * sign : sign;
    const int step = reverse ? -sign : sign;
    const int cycle = firstCycle + baseNumber * step;

    if (indel) {
        // Indel keys are not produced for positions too close to either end of the read.
        const int lastIndelBase = len - CUSHION_FOR_INDELS - 1;
        if (baseNumber < CUSHION_FOR_INDELS || baseNumber > lastIndelBase) {
            return -1;
        }
    }
    return CycleCovariate::KeyFromCycle(cycle, maxCycle);
}
// Records the cycle covariate for every position of the read.
// The substitution value is the substitution cycle key; the insertion and deletion values are both the
// indel cycle key when recordIndelValues is set, and 0 otherwise.
// `header` is not used by this covariate.
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int len = ad.read_len;

    // Two separate loops so that recordIndelValues is not tested on every iteration.
    if (!recordIndelValues) {
        for (int pos = 0; pos < len; ++pos) {
            const int subKey = CycleKey(bw, ad, pos, false, MAXIMUM_CYCLE_VALUE);
            CovariateUtils::SetCovariate(subKey, 0, 0, pos, CycleCovariate::index, values);
        }
        return;
    }

    for (int pos = 0; pos < len; ++pos) {
        const int subKey = CycleKey(bw, ad, pos, false, MAXIMUM_CYCLE_VALUE);
        const int indelKey = CycleKey(bw, ad, pos, true, MAXIMUM_CYCLE_VALUE);
        CovariateUtils::SetCovariate(subKey, indelKey, indelKey, pos, CycleCovariate::index, values);
    }
}