FastBQSR/src/bqsr/covariate.cpp

318 lines
13 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include "covariate.h"
// Out-of-class definitions of the EventType constants.
// NOTE(review): the three initializer fields look like (index, one-letter code, display name) —
// confirm against the EventTypeValue declaration in covariate.h.
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
// EVENTS is initialized from the three values above, so it has to be defined after them.
vector<EventTypeValue> EventType::EVENTS = {BASE_SUBSTITUTION, BASE_INSERTION, BASE_DELETION};
// Static variables for ContextCovariate (defined here without explicit initializers).
int ContextCovariate::mismatchesContextSize;
int ContextCovariate::indelsContextSize;
int ContextCovariate::mismatchesKeyMask;
int ContextCovariate::indelsKeyMask;
uint8_t ContextCovariate::lowQualTail;
// Lookup table indexed by a base's byte value; an entry of -1 is treated as "not a valid base" by the code in this file.
int ContextCovariate::baseIndexMap[256];
// Static variables for ReadGroupCovariate
map<string, int> ReadGroupCovariate::RgToId; // maps read group name to id
map<int, string> ReadGroupCovariate::IdToRg; // maps id to read group name
// Static variable for CycleCovariate
int CycleCovariate::MAXIMUM_CYCLE_VALUE;
// ---- CovariateUtils ----
/**
 * Fills `values` with all covariates of one read.
 *
 * The four covariates are recorded in a fixed order: read group, base quality,
 * context, cycle. Every argument is forwarded unchanged to each of them.
 *
 * NOTE(review): the original comment suggests that `values` is a buffer that was
 * already used for the previous read — confirm against the caller.
 *
 * @param bw                 wrapper around the BAM record of the read
 * @param ad                 per-read data (bases, clipping offsets, read length)
 * @param header             SAM header
 * @param values             covariate matrix to fill
 * @param recordIndelValues  whether insertion/deletion covariates are recorded too
 */
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                       bool recordIndelValues) {
    ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
}
// ---- ReadGroupCovariate ----
/**
 * Records the read-group covariate for every position of the read.
 *
 * The read's "RG" aux tag is looked up in RgToId. The resulting id is written as the
 * mismatch, insertion and deletion key of every position in [0, ad.read_len).
 * If the tag is missing, is not a string, or names a read group that is not in RgToId,
 * an error is logged and key 0 is used instead (best-effort behaviour kept from the
 * original implementation).
 *
 * `header` and `recordIndelValues` are not used by this covariate.
 *
 * @param bw      wrapper around the BAM record of the read
 * @param ad      per-read data; only read_len is used here
 * @param values  covariate matrix to fill
 */
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    int key = 0;  // fallback key when the read group cannot be resolved
    bool resolved = false;

    if (uint8_t* rgTag = bam_aux_get(bw->b, "RG")) {
        if (const char* rgName = bam_aux2Z(rgTag)) {
            // One lookup per read: find() gives both the existence check and the id.
            const auto it = RgToId.find(rgName);
            if (it != RgToId.end()) {
                key = it->second;
                resolved = true;
            }
        }
    }
    if (!resolved) {
        spdlog::error("The RG tag value for read can not be found in header!");
    }

    for (int i = 0; i < ad.read_len; ++i) {
        CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
    }
}
// ---- BaseQualityCovariate ----
/**
 * Records the base-quality covariate for every position of the read.
 *
 * For position i the mismatch key is the base quality at i + ad.left_clip.
 * When recordIndelValues is true, the insertion key comes from the "BI" tag and the
 * deletion key from the "BD" tag; whichever tag is missing (or is not a string tag)
 * falls back to the default indel quality of 45. When recordIndelValues is false,
 * both indel keys are 0.
 *
 * Fix over the previous version: when BOTH tags were present, only BI was used and the
 * deletion quality was forced to the default. Each tag is now handled independently.
 *
 * NOTE(review): the original comment states that, after earlier processing, the
 * qualities should have the same length as the bases — confirm.
 * NOTE(review): BI/BD bytes are used as-is (no Phred+33 decoding) and are indexed by i
 * rather than i + ad.left_clip, exactly as before — confirm both against the producer
 * of these tags. A tag string shorter than ad.read_len would be read out of bounds.
 *
 * `header` is not used by this covariate.
 */
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                        bool recordIndelValues) {
    const int INDEL_QUAL = 45;
    const uint8_t* quals = bam_get_qual(bw->b);

    // Per-base indel qualities; nullptr means "use the default for every position".
    const uint8_t* insQuals = nullptr;
    const uint8_t* delQuals = nullptr;
    if (recordIndelValues) {
        if (uint8_t* insTag = bam_aux_get(bw->b, "BI")) {  // base qualities for insertions
            insQuals = reinterpret_cast<const uint8_t*>(bam_aux2Z(insTag));
        }
        if (uint8_t* delTag = bam_aux_get(bw->b, "BD")) {  // base qualities for deletions
            delQuals = reinterpret_cast<const uint8_t*>(bam_aux2Z(delTag));
        }
    }
    const int defaultIndelQual = recordIndelValues ? INDEL_QUAL : 0;

    for (int i = 0; i < ad.read_len; ++i) {
        const int insQual = insQuals ? insQuals[i] : defaultIndelQual;
        const int delQual = delQuals ? delQuals[i] : defaultIndelQual;
        CovariateUtils::SetCovariate(quals[i + ad.left_clip], insQual, delQual, i, BaseQualityCovariate::index, values);
    }
}
// ---- ContextCovariate ----
/**
 * Returns the complement of a nucleotide.
 *
 * A/a -> 'T', C/c -> 'G', G/g -> 'C', T/t -> 'A' (the result is always upper case).
 * Any other character is returned unchanged.
 */
static char SimpleComplement(const char base) {
    if (base == 'A' || base == 'a') return 'T';
    if (base == 'C' || base == 'c') return 'G';
    if (base == 'G' || base == 'g') return 'C';
    if (base == 'T' || base == 't') return 'A';
    return base;
}
/**
 * Produces the read's bases in strand orientation with the low-quality tails replaced by 'N'.
 *
 * A tail is the maximal run of bases at either end of the read whose quality is
 * <= lowQTail. Qualities are taken from the BAM record starting at ad.left_clip.
 * For reverse-strand reads the output is the reverse complement of ad.bases.
 * If every base is low quality, clippedBases is cleared (empty string).
 *
 * Fix over the previous version: the tails are located in the original orientation of the
 * qualities, so for reverse-strand reads the masked positions must be mirrored
 * together with the bases. Previously the Ns were written at un-mirrored indices, i.e.
 * on the wrong end of a reverse-complemented read.
 * NOTE(review): this assumes ad.bases and the BAM qualities share the same orientation — confirm.
 *
 * @param bw            wrapper around the BAM record (qualities, strand flag)
 * @param ad            per-read data (bases, left_clip, read_len)
 * @param clippedBases  in/out; must already hold at least ad.read_len characters
 *                      (the caller in this file passes a copy of ad.bases)
 * @param lowQTail      qualities <= this value count as low quality
 */
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    const int readLen = ad.read_len;
    const uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;

    // Locate the first and last base (original orientation) that are NOT low quality.
    int left = 0;
    while (left < readLen && quals[left] <= lowQTail) {
        ++left;
    }
    if (left == readLen) {  // the whole read is low quality
        clippedBases.clear();
        return;
    }
    int right = readLen - 1;
    while (right > left && quals[right] <= lowQTail) {  // quals[left] is known to be good
        --right;
    }

    const bool negativeStrand = bw->GetReadNegativeStrandFlag();
    if (negativeStrand) {  // reverse complement
        for (int i = 0; i < readLen; ++i) {
            clippedBases[i] = SimpleComplement(ad.bases[readLen - 1 - i]);
        }
    }

    // Write Ns over both tails; original index q maps to readLen - 1 - q after reversal.
    for (int q = 0; q < left; ++q) {
        clippedBases[negativeStrand ? readLen - 1 - q : q] = 'N';
    }
    for (int q = right + 1; q < readLen; ++q) {
        clippedBases[negativeStrand ? readLen - 1 - q : q] = 'N';
    }
}
/**
* Creates a int representation of a given dna string.
*
* @param dna the dna sequence
* @param start the start position in the byte array (inclusive)
* @param end the end position in the array (exclusive)
* @return the key representing the dna sequence
*/
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
int key = end - start;
int bitOffset = LENGTH_BITS;
for (int i = start; i < end; i++) {
const int baseIndex = baseIndexMap[dna[i] & 0xff];
if (baseIndex == -1) { // ignore non-ACGT bases
return -1;
}
key |= (baseIndex << bitOffset);
bitOffset += 2;
}
return key;
}
/**
 * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
 *
 * For example, for the read [AGCTG] and a context size of 2, the result is
 * [UNKNOWN_OR_ERROR_CONTEXT_CODE, "AG", "GC", "CT", "TG"]
 * with each string context encoded as an integer (see KeyFromContext).
 *
 * Positions whose context contains a base that maps to -1 in baseIndexMap get the key -1.
 *
 * Fix over the previous version: bases are masked with 0xff before indexing baseIndexMap,
 * as KeyFromContext already does, so a negative `char` value can no longer index
 * outside the 256-entry table.
 *
 * @param bases        the bases in the read to build the context from
 * @param contextSize  context size to use building the context
 * @param mask         mask for pulling out just the context bits
 * @param keys         output; resized to bases.size() and filled with the (preceding)
 *                     n-base context key for each position
 */
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
    const int readLength = bases.size();
    keys.resize(readLength);
    int keyIdx = 0;

    // the first contextSize-1 bases will not have enough previous context
    for (int i = 1; i < contextSize && i <= readLength; i++) {
        keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
    }
    if (readLength < contextSize) {
        return;
    }

    // bit position at which the newest base of a full context is stored
    const int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;

    // get (and add) the key for the context starting at the first base
    int currentKey = KeyFromContext(bases, 0, contextSize);
    keys[keyIdx++] = currentKey;

    // if the first key was -1 then there was a non-ACGT base in the context;
    // figure out how many more consecutive contexts it affects
    int currentNPenalty = 0;
    if (currentKey == -1) {
        currentKey = 0;
        currentNPenalty = contextSize - 1;
        int offset = newBaseOffset;
        int baseIndex;
        // Walk backwards from the end of the first context until the invalid base is hit.
        // The loop stays in range because KeyFromContext returned -1, i.e. such a base
        // exists in [0, contextSize).
        while ((baseIndex = baseIndexMap[bases[currentNPenalty] & 0xff]) != -1) {
            currentKey |= (baseIndex << offset);
            offset -= 2;
            currentNPenalty--;
        }
    }

    for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
        const int baseIndex = baseIndexMap[bases[currentIndex] & 0xff];
        if (baseIndex == -1) {  // ignore non-ACGT bases
            currentNPenalty = contextSize;
            currentKey = 0;  // reset the key
        } else {
            // push this base's contribution onto the key: shift everything 2 bits,
            // mask out the non-context bits, and add the new base and the length in
            currentKey = (currentKey >> 2) & mask;
            currentKey |= (baseIndex << newBaseOffset);
            currentKey |= contextSize;
        }
        if (currentNPenalty == 0) {
            keys[keyIdx++] = currentKey;
        } else {
            // still inside the window affected by an invalid base
            currentNPenalty--;
            keys[keyIdx++] = -1;
        }
    }
}
/**
 * Records the context covariate for every position of the read.
 *
 * Steps:
 *  1. Copy the bases and let GetStrandedClippedBytes produce the stranded, low-quality-masked
 *     version (it may come back empty).
 *  2. Compute the mismatch context key of every position of that string.
 *  3. If the string's length differs from the read length, reset all positions of this
 *     covariate to 0 so that no values from an earlier use of `values` survive.
 *  4. Store the keys; position i of the stranded string is written at the read offset
 *     returned by GetStrandedOffset. Indel keys are computed with the indel context size
 *     when recordIndelValues is true, otherwise 0 is stored for both.
 *
 * `header` is not used by this covariate.
 */
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int fullLength = ad.read_len;

    // work on a copy of the bases; low quality ones get overwritten with Ns
    string strandedBases(ad.bases);
    GetStrandedClippedBytes(bw, ad, strandedBases, lowQualTail);

    vector<int> mismatchKeys;
    GetReadContextAtEachPosition(strandedBases, mismatchesContextSize, mismatchesKeyMask, mismatchKeys);

    const int clippedLength = strandedBases.size();
    if (clippedLength != fullLength) {
        // The keys below will not cover the whole read, so clear every position first.
        // (Skipped when the whole array is about to be overwritten anyway.)
        int pos = 0;
        while (pos < fullLength) {
            CovariateUtils::SetCovariate(0, 0, 0, pos, ContextCovariate::index, values);
            ++pos;
        }
    }

    const bool onReverseStrand = bw->GetReadNegativeStrandFlag();

    // Two separate loops so that recordIndelValues is not tested per base.
    if (!recordIndelValues) {
        for (int pos = 0; pos < clippedLength; ++pos) {
            const int target = GetStrandedOffset(onReverseStrand, pos, clippedLength);
            CovariateUtils::SetCovariate(mismatchKeys[pos], 0, 0, target, ContextCovariate::index, values);
        }
        return;
    }

    vector<int> indelKeys;
    GetReadContextAtEachPosition(strandedBases, indelsContextSize, indelsKeyMask, indelKeys);
    for (int pos = 0; pos < clippedLength; ++pos) {
        const int target = GetStrandedOffset(onReverseStrand, pos, clippedLength);
        const int indelKey = indelKeys[pos];
        CovariateUtils::SetCovariate(mismatchKeys[pos], indelKey, indelKey, target, ContextCovariate::index, values);
    }
}
// ---- CycleCovariate ----
/**
 * Computes the encoded CycleCovariate key for one position of the read.
 *
 * The cycle number counts from 1 on forward-strand reads and from read_len downwards on
 * reverse-strand reads; for the second read of a pair the whole value is negated.
 * The cycle is then encoded by KeyFromCycle.
 *
 * For indel keys, positions closer than CUSHION_FOR_INDELS to either end of the read
 * yield -1 instead.
 *
 * @param bw          wrapper around the BAM record (strand and pairing flags)
 * @param ad          per-read data; only read_len is used here
 * @param baseNumber  index of the base to compute the key for
 * @param indel       is this an indel key or a substitution key?
 * @param maxCycle    forwarded to KeyFromCycle; how cycles beyond it are handled is
 *                    decided there (not visible in this file)
 * @return the encoded key, or -1 for an indel key inside the cushion
 */
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
    const bool reverseStrand = bw->GetReadNegativeStrandFlag();
    const bool secondOfPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
    const int length = ad.read_len;

    // +1 for first/unpaired reads, -1 for the second read of a pair
    const int sign = secondOfPair ? -1 : 1;
    const int firstCycle = reverseStrand ? length * sign : sign;
    const int step = reverseStrand ? -sign : sign;
    const int cycle = firstCycle + baseNumber * step;

    if (indel) {
        const int lastIndelPosition = length - CUSHION_FOR_INDELS - 1;
        const bool insideCushion = baseNumber < CUSHION_FOR_INDELS || baseNumber > lastIndelPosition;
        if (insideCushion) {
            return -1;
        }
    }
    return CycleCovariate::KeyFromCycle(cycle, maxCycle);
}
/**
 * Records the cycle covariate for every position of the read.
 *
 * The substitution key of position i is CycleKey(..., i, false, MAXIMUM_CYCLE_VALUE).
 * When recordIndelValues is true, the insertion and deletion keys are both
 * CycleKey(..., i, true, MAXIMUM_CYCLE_VALUE); otherwise both are 0.
 *
 * `header` is not used by this covariate.
 */
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int baseCount = ad.read_len;

    // Two separate loops so that recordIndelValues is not tested per base.
    if (!recordIndelValues) {
        for (int pos = 0; pos < baseCount; ++pos) {
            const int mismatchKey = CycleKey(bw, ad, pos, false, MAXIMUM_CYCLE_VALUE);
            CovariateUtils::SetCovariate(mismatchKey, 0, 0, pos, CycleCovariate::index, values);
        }
        return;
    }

    for (int pos = 0; pos < baseCount; ++pos) {
        const int mismatchKey = CycleKey(bw, ad, pos, false, MAXIMUM_CYCLE_VALUE);
        const int indelKey = CycleKey(bw, ad, pos, true, MAXIMUM_CYCLE_VALUE);
        CovariateUtils::SetCovariate(mismatchKey, indelKey, indelKey, pos, CycleCovariate::index, values);
    }
}