318 lines
13 KiB
C++
318 lines
13 KiB
C++
#include "covariate.h"
|
||
|
||
// for EventType
|
||
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
|
||
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
|
||
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
|
||
vector<EventTypeValue> EventType::EVENTS = {BASE_SUBSTITUTION, BASE_INSERTION, BASE_DELETION};
|
||
|
||
// static变量 for ContextCovariate
|
||
int ContextCovariate::mismatchesContextSize;
|
||
int ContextCovariate::indelsContextSize;
|
||
int ContextCovariate::mismatchesKeyMask;
|
||
int ContextCovariate::indelsKeyMask;
|
||
uint8_t ContextCovariate::lowQualTail;
|
||
int ContextCovariate::baseIndexMap[256];
|
||
|
||
// for ReadGroupCovariate
|
||
map<string, int> ReadGroupCovariate::RgToId; // read group name到id的映射
|
||
map<int, string> ReadGroupCovariate::IdToRg; // id到read group name的映射
|
||
|
||
// for cycleCovariate
|
||
int CycleCovariate::MAXIMUM_CYCLE_VALUE;
|
||
|
||
// for CovariateUtils
|
||
// 对一条read计算协变量(该协变量被上一个read用过)
|
||
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||
bool recordIndelValues) {
|
||
ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||
BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||
ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||
CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
|
||
}
|
||
|
||
// ReadGroupCovariate 协变量的方法
|
||
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||
uint8_t *rgStr = bam_aux_get(bw->b, "RG");
|
||
char* rgVal = nullptr;
|
||
if (rgStr) rgVal = bam_aux2Z(rgStr);
|
||
int key = 0;
|
||
if (rgVal == nullptr || RgToId.find(rgVal) == RgToId.end()) {
|
||
spdlog::error("The RG tag value for read can not be found in header!");
|
||
} else {
|
||
key = RgToId[rgVal];
|
||
}
|
||
for (int i = 0; i < ad.read_len; ++i) {
|
||
CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
|
||
}
|
||
}
|
||
|
||
// BaseQualityCovariate 协变量的方法
|
||
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
|
||
bool recordIndelValues) {
|
||
// 在前面的处理过后,quals应该和base长度一致了
|
||
#define __bq_set_cov(ins, del) \
|
||
do { \
|
||
for (int i = 0; i < ad.read_len; ++i) { \
|
||
CovariateUtils::SetCovariate(quals[i + ad.left_clip], (ins), (del), i, BaseQualityCovariate::index, values); \
|
||
} \
|
||
} while (0)
|
||
|
||
const int INDEL_QUAL = 45;
|
||
uint8_t* quals = bam_get_qual(bw->b);
|
||
if (recordIndelValues) {
|
||
uint8_t* insQualPtr = bam_aux_get(bw->b, "BI"); // base qualities for insertions
|
||
uint8_t* delQualPtr = bam_aux_get(bw->b, "BD"); // base qualities for deletions
|
||
if (insQualPtr == nullptr && delQualPtr == nullptr) {
|
||
__bq_set_cov(INDEL_QUAL, INDEL_QUAL);
|
||
} else if (insQualPtr == nullptr) {
|
||
uint8_t* delQuals = (uint8_t*)bam_aux2Z(delQualPtr);
|
||
__bq_set_cov(INDEL_QUAL, delQuals[i]);
|
||
} else {
|
||
uint8_t* insQuals = (uint8_t*)bam_aux2Z(insQualPtr);
|
||
__bq_set_cov(insQuals[i], INDEL_QUAL);
|
||
}
|
||
} else {
|
||
__bq_set_cov(0, 0);
|
||
}
|
||
}
|
||
|
||
// ContextCovariate methods

/**
 * Returns the complement of a nucleotide character.
 *
 * A/a -> 'T', C/c -> 'G', G/g -> 'C', T/t -> 'A'. The result for these is
 * always upper case. Any other character is returned unchanged.
 */
static char SimpleComplement(const char base) {
    if (base == 'A' || base == 'a') return 'T';
    if (base == 'C' || base == 'c') return 'G';
    if (base == 'G' || base == 'g') return 'C';
    if (base == 'T' || base == 't') return 'A';
    return base;  // non-ACGT symbols pass through untouched
}
|
||
|
||
// 获取去除低质量分数碱基之后的read碱基序列(将低质量分数的碱基变成N)
|
||
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
|
||
uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
|
||
|
||
if (bw->GetReadNegativeStrandFlag()) { // 反向互补
|
||
for (int i = 0; i < ad.read_len; ++i) clippedBases[i] = SimpleComplement(ad.bases[ad.read_len - 1 - i]);
|
||
}
|
||
|
||
// 处理左边
|
||
int left = 0;
|
||
for (; left < ad.read_len; ++left) {
|
||
if (quals[left] <= lowQTail)
|
||
clippedBases[left] = 'N';
|
||
else
|
||
break;
|
||
}
|
||
if (left == ad.read_len) {
|
||
clippedBases.clear();
|
||
return;
|
||
}
|
||
// 处理右边
|
||
int right = ad.read_len - 1;
|
||
for (; right >= 0; --right) {
|
||
if (quals[right] <= lowQTail)
|
||
clippedBases[right] = 'N';
|
||
else
|
||
break;
|
||
}
|
||
if (right < left)
|
||
clippedBases.clear();
|
||
}
|
||
|
||
/**
|
||
* Creates a int representation of a given dna string.
|
||
*
|
||
* @param dna the dna sequence
|
||
* @param start the start position in the byte array (inclusive)
|
||
* @param end the end position in the array (exclusive)
|
||
* @return the key representing the dna sequence
|
||
*/
|
||
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
|
||
int key = end - start;
|
||
int bitOffset = LENGTH_BITS;
|
||
for (int i = start; i < end; i++) {
|
||
const int baseIndex = baseIndexMap[dna[i] & 0xff];
|
||
if (baseIndex == -1) { // ignore non-ACGT bases
|
||
return -1;
|
||
}
|
||
key |= (baseIndex << bitOffset);
|
||
bitOffset += 2;
|
||
}
|
||
return key;
|
||
}
|
||
|
||
/**
|
||
* For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
|
||
*
|
||
* For example, for the read [AGCTG], return the list
|
||
* [-1, "AG", "GC", "CT", "TG" ]
|
||
* with each string context encoded as an integer.
|
||
*
|
||
* @param bases the bases in the read to build the context from
|
||
* @param contextSize context size to use building the context
|
||
* @param mask mask for pulling out just the context bits
|
||
*
|
||
* @return a list that has the same length as the read and contains the (preceding) n-base context at each position.
|
||
*
|
||
*/
|
||
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
|
||
int readLength = bases.size();
|
||
keys.resize(readLength);
|
||
int keyIdx = 0;
|
||
// the first contextSize-1 bases will not have enough previous context
|
||
for (int i = 1; i < contextSize && i <= readLength; i++) {
|
||
keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
|
||
}
|
||
if (readLength < contextSize)
|
||
return;
|
||
|
||
int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;
|
||
|
||
// get (and add) the key for the context starting at the first base
|
||
int currentKey = KeyFromContext(bases, 0, contextSize);
|
||
keys[keyIdx++] = currentKey;
|
||
|
||
// if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts it affects
|
||
int currentNPenalty = 0;
|
||
if (currentKey == -1) {
|
||
currentKey = 0;
|
||
currentNPenalty = contextSize - 1;
|
||
int offset = newBaseOffset;
|
||
int baseIndex;
|
||
while ((baseIndex = baseIndexMap[bases[currentNPenalty]]) != -1) {
|
||
currentKey |= (baseIndex << offset);
|
||
offset -= 2;
|
||
currentNPenalty--;
|
||
}
|
||
}
|
||
|
||
for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
|
||
const int baseIndex = baseIndexMap[bases[currentIndex]];
|
||
if (baseIndex == -1) { // ignore non-ACGT bases
|
||
currentNPenalty = contextSize;
|
||
currentKey = 0; // reset the key
|
||
} else {
|
||
// push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length
|
||
// in
|
||
currentKey = (currentKey >> 2) & mask;
|
||
currentKey |= (baseIndex << newBaseOffset);
|
||
currentKey |= contextSize;
|
||
}
|
||
|
||
if (currentNPenalty == 0) {
|
||
keys[keyIdx++] = currentKey;
|
||
} else {
|
||
currentNPenalty--;
|
||
keys[keyIdx++] = -1;
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
 * Records the context covariate for every base of the read.
 *
 * The read's bases are copied, low-quality ends are masked and the sequence is
 * strand-oriented by GetStrandedClippedBytes. Context keys are then computed
 * per position and written at the offset returned by GetStrandedOffset. With
 * `recordIndelValues` the insertion and deletion slots both get the indel
 * context key; otherwise they are 0.
 *
 * @param bw                 wrapper around the BAM record
 * @param ad                 per-read data (bases, read_len)
 * @param header             unused in this method
 * @param values             output matrix of covariate keys
 * @param recordIndelValues  whether indel context keys are computed
 */
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int fullLength = ad.read_len;

    // Work on a copy of the bases; low-quality ones get overwritten with N.
    string maskedBases(ad.bases);
    GetStrandedClippedBytes(bw, ad, maskedBases, lowQualTail);

    vector<int> mismatchKeys;
    GetReadContextAtEachPosition(maskedBases, mismatchesContextSize, mismatchesKeyMask, mismatchKeys);

    const int clippedLength = static_cast<int>(maskedBases.size());

    // The matrix still holds the previous read's values. If clipping changed
    // the length, the loops below will not cover every position, so clear the
    // whole column first. When the lengths match everything is overwritten
    // anyway and the clearing pass is skipped.
    if (clippedLength != fullLength) {
        for (int pos = 0; pos < fullLength; ++pos) {
            CovariateUtils::SetCovariate(0, 0, 0, pos, ContextCovariate::index, values);
        }
    }

    const bool reverseStrand = bw->GetReadNegativeStrandFlag();

    // Two separate loops so recordIndelValues is not tested per base.
    if (!recordIndelValues) {
        for (int pos = 0; pos < clippedLength; ++pos) {
            const int target = GetStrandedOffset(reverseStrand, pos, clippedLength);
            CovariateUtils::SetCovariate(mismatchKeys[pos], 0, 0, target, ContextCovariate::index, values);
        }
        return;
    }

    vector<int> indelKeys;
    GetReadContextAtEachPosition(maskedBases, indelsContextSize, indelsKeyMask, indelKeys);
    for (int pos = 0; pos < clippedLength; ++pos) {
        const int target = GetStrandedOffset(reverseStrand, pos, clippedLength);
        const int indelKey = indelKeys[pos];
        CovariateUtils::SetCovariate(mismatchKeys[pos], indelKey, indelKey, target, ContextCovariate::index, values);
    }
}
|
||
|
||
// CycleCovariate 协变量的方法
|
||
|
||
/**
|
||
* Computes the encoded value of CycleCovariate's key for the given position at the read.
|
||
* Uses keyFromCycle to do the encoding.
|
||
* @param baseNumber index of the base to compute the key for
|
||
* @param read the read
|
||
* @param indel is this an indel key or a substitution key?
|
||
* @param maxCycle max value of the base to compute the key for
|
||
* (this method throws UserException if the computed absolute value of the cycle number is higher than this value).
|
||
*/
|
||
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
|
||
const bool isNegStrand = bw->GetReadNegativeStrandFlag();
|
||
const bool isSecondInPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
|
||
const int readLength = ad.read_len;
|
||
|
||
const int readOrderFactor = isSecondInPair ? -1 : 1;
|
||
int increment;
|
||
int cycle;
|
||
if (isNegStrand) {
|
||
cycle = readLength * readOrderFactor;
|
||
increment = -1 * readOrderFactor;
|
||
} else {
|
||
cycle = readOrderFactor;
|
||
increment = readOrderFactor;
|
||
}
|
||
cycle += baseNumber * increment;
|
||
|
||
if (!indel) {
|
||
return CycleCovariate::KeyFromCycle(cycle, maxCycle);
|
||
}
|
||
const int maxCycleForIndels = readLength - CUSHION_FOR_INDELS - 1;
|
||
if (baseNumber < CUSHION_FOR_INDELS || baseNumber > maxCycleForIndels) {
|
||
return -1;
|
||
} else {
|
||
return CycleCovariate::KeyFromCycle(cycle, maxCycle);
|
||
}
|
||
}
|
||
|
||
// Used to pick out the covariate's value from attributes of the read
|
||
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
|
||
const int readLength = ad.read_len;
|
||
// Note: duplicate the loop to void checking recordIndelValues on every iteration
|
||
if (recordIndelValues) {
|
||
for (int i = 0; i < readLength; i++) {
|
||
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
|
||
const int indelKey = CycleKey(bw, ad, i, true, MAXIMUM_CYCLE_VALUE);
|
||
CovariateUtils::SetCovariate(substitutionKey, indelKey, indelKey, i, CycleCovariate::index, values);
|
||
}
|
||
} else {
|
||
for (int i = 0; i < readLength; i++) {
|
||
const int substitutionKey = CycleKey(bw, ad, i, false, MAXIMUM_CYCLE_VALUE);
|
||
CovariateUtils::SetCovariate(substitutionKey, 0, 0, i, CycleCovariate::index, values);
|
||
}
|
||
}
|
||
} |