FastBQSR/src/bqsr/covariate.cpp

#include "covariate.h"

// Out-of-class definitions of the static data members used by the covariate classes.

// for EventType: the three event kinds, each built from {index, one-letter code, description}
EventTypeValue EventType::BASE_SUBSTITUTION = {0, 'M', "Base Substitution"};
EventTypeValue EventType::BASE_INSERTION = {1, 'I', "Base Insertion"};
EventTypeValue EventType::BASE_DELETION = {2, 'D', "Base Deletion"};
// All event kinds, listed in the same order as their index values above
vector<EventTypeValue> EventType::EVENTS = {BASE_SUBSTITUTION, BASE_INSERTION, BASE_DELETION};

// static variables for ContextCovariate
// (no initializer here, so they start zero-initialized; presumably configured elsewhere before use)
int ContextCovariate::mismatchesContextSize;
int ContextCovariate::indelsContextSize;
int ContextCovariate::mismatchesKeyMask;
int ContextCovariate::indelsKeyMask;
uint8_t ContextCovariate::lowQualTail;
int ContextCovariate::baseIndexMap[256];

// for ReadGroupCovariate
map<string, int> ReadGroupCovariate::RgToId;  // maps read group name to id
map<int, string> ReadGroupCovariate::IdToRg;  // maps id to read group name

// for CycleCovariate
int CycleCovariate::MAXIMUM_CYCLE_VALUE;

// for CovariateUtils
// Compute the covariates for a single read (the covariate matrix `values` was already
// used by the previous read, so each covariate below writes its own entries into it).
// The arguments are forwarded unchanged to the four covariates, in this order:
// read group, base quality, context, cycle.
void CovariateUtils::ComputeCovariates(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                            bool recordIndelValues) {
    ReadGroupCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    BaseQualityCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    ContextCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
    CycleCovariate::RecordValues(bw, ad, header, values, recordIndelValues);
}

// ReadGroupCovariate methods

/**
 * Records the read-group covariate for every base of the read.
 *
 * The read's "RG" aux tag is looked up in RgToId; the resulting id is written as the
 * substitution, insertion and deletion key for each of the ad.read_len positions.
 * If the tag is missing or its value is not present in RgToId, an error is logged and
 * key 0 is used instead.
 *
 * @param bw                 wrapper around the BAM record whose "RG" tag is read
 * @param ad                 per-read data; only read_len is used here
 * @param header             unused here
 * @param values             covariate matrix written via CovariateUtils::SetCovariate
 * @param recordIndelValues  unused here (the same key is written for all three event types)
 */
void ReadGroupCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    uint8_t* rgTag = bam_aux_get(bw->b, "RG");
    const char* rgVal = rgTag ? bam_aux2Z(rgTag) : nullptr;

    // Single lookup: avoids the second tree walk (and second temporary string) that
    // find() followed by operator[] would cost on every read.
    const auto it = (rgVal != nullptr) ? RgToId.find(rgVal) : RgToId.end();

    int key = 0;
    if (it == RgToId.end()) {
        spdlog::error("The RG tag value for read can not be found in header!");
    } else {
        key = it->second;
    }

    for (int i = 0; i < ad.read_len; ++i) {
        CovariateUtils::SetCovariate(key, key, key, i, ReadGroupCovariate::index, values);
    }
}

// BaseQualityCovariate methods

/**
 * Records the base-quality covariate for every base of the read.
 *
 * The substitution key of position i is the base quality stored in the BAM record at
 * offset ad.left_clip + i. When recordIndelValues is false the insertion and deletion
 * keys are 0. Otherwise they are taken from the "BI" (insertion) and "BD" (deletion)
 * aux tags; each one falls back independently to INDEL_QUAL when its tag is absent or
 * is not a string tag.
 *
 * @param bw                 wrapper around the BAM record
 * @param ad                 per-read data; read_len and left_clip are used
 * @param header             unused here
 * @param values             covariate matrix written via CovariateUtils::SetCovariate
 * @param recordIndelValues  whether insertion/deletion keys should be recorded
 */
void BaseQualityCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values,
                                        bool recordIndelValues) {
    // Default quality used for an indel event type whose tag is not available.
    const int INDEL_QUAL = 45;

    // After the earlier processing, quals should have the same length as the bases.
    const uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;

    if (!recordIndelValues) {
        for (int i = 0; i < ad.read_len; ++i) {
            CovariateUtils::SetCovariate(quals[i], 0, 0, i, BaseQualityCovariate::index, values);
        }
        return;
    }

    uint8_t* insTag = bam_aux_get(bw->b, "BI");  // base qualities for insertions
    uint8_t* delTag = bam_aux_get(bw->b, "BD");  // base qualities for deletions

    // bam_aux2Z yields NULL for a tag that is not a string; treat that like a missing tag
    // instead of dereferencing it.
    const uint8_t* insQuals = insTag ? reinterpret_cast<const uint8_t*>(bam_aux2Z(insTag)) : nullptr;
    const uint8_t* delQuals = delTag ? reinterpret_cast<const uint8_t*>(bam_aux2Z(delTag)) : nullptr;

    // NOTE(review): the tag bytes are used as-is and indexed by i (no Phred+33 decoding,
    // no left_clip offset, no length check against read_len) -- confirm this matches how
    // BI/BD are produced upstream.
    for (int i = 0; i < ad.read_len; ++i) {
        // Each event type picks its own source, so a read carrying both tags keeps both.
        const int insQual = insQuals ? insQuals[i] : INDEL_QUAL;
        const int delQual = delQuals ? delQuals[i] : INDEL_QUAL;
        CovariateUtils::SetCovariate(quals[i], insQual, delQual, i, BaseQualityCovariate::index, values);
    }
}

// ContextCovariate methods

// Returns the complementary base, always upper-case, for A/C/G/T given in either case.
// Any other character is returned unchanged.
static char SimpleComplement(const char base) {
    if (base == 'A' || base == 'a') return 'T';
    if (base == 'C' || base == 'c') return 'G';
    if (base == 'G' || base == 'g') return 'C';
    if (base == 'T' || base == 't') return 'A';
    return base;
}

/**
 * Produces the strand-oriented read bases with the low-quality tails replaced by 'N'.
 *
 * The leading and trailing runs of bases whose quality is <= lowQTail are located in the
 * orientation the record is stored in (qualities start at offset ad.left_clip). Those
 * positions become 'N'. For a reverse-strand read the result is additionally
 * reverse-complemented, and the 'N' mask is mirrored together with the bases so that each
 * 'N' stays on the base whose quality was low.
 *
 * If every base is low quality, clippedBases is emptied.
 *
 * @param bw            wrapper around the BAM record (qualities and strand flag)
 * @param ad            per-read data; bases, read_len and left_clip are used
 * @param clippedBases  in/out buffer of at least ad.read_len characters. On the forward
 *                      strand only the tails are overwritten, so it is expected to already
 *                      hold the read bases (RecordValues passes a copy of ad.bases).
 * @param lowQTail      qualities less than or equal to this value count as low quality
 */
void ContextCovariate::GetStrandedClippedBytes(BamWrap* bw, SamData& ad, string& clippedBases, uint8_t lowQTail) {
    const uint8_t* quals = bam_get_qual(bw->b) + ad.left_clip;
    const int readLen = ad.read_len;

    // First index (from the left) whose quality is above the threshold.
    int left = 0;
    while (left < readLen && quals[left] <= lowQTail) ++left;
    if (left == readLen) {  // the whole read is low quality
        clippedBases.clear();
        return;
    }

    // Last index whose quality is above the threshold; quals[left] is known to be good,
    // so the scan never needs to go below `left`.
    int right = readLen - 1;
    while (right > left && quals[right] <= lowQTail) --right;

    if (bw->GetReadNegativeStrandFlag()) {
        // Reverse complement: output position i comes from stored position readLen - 1 - i,
        // and inherits that position's mask.
        for (int i = 0; i < readLen; ++i) {
            const int src = readLen - 1 - i;
            clippedBases[i] = (src < left || src > right) ? 'N' : SimpleComplement(ad.bases[src]);
        }
    } else {
        for (int i = 0; i < left; ++i) clippedBases[i] = 'N';
        for (int i = right + 1; i < readLen; ++i) clippedBases[i] = 'N';
    }
}

/**
 * Creates a int representation of a given dna string.
 *
 * @param dna    the dna sequence
 * @param start  the start position in the byte array (inclusive)
 * @param end    the end position in the array (exclusive)
 * @return the key representing the dna sequence
 */
int ContextCovariate::KeyFromContext(const string& dna, const int start, const int end) {
    int key = end - start;
    int bitOffset = LENGTH_BITS;
    for (int i = start; i < end; i++) {
        const int baseIndex = baseIndexMap[dna[i] & 0xff];
        if (baseIndex == -1) {  // ignore non-ACGT bases
            return -1;
        }
        key |= (baseIndex << bitOffset);
        bitOffset += 2;
    }
    return key;
}

/**
 * For each position of the read, calculate the n-base-pair *read* base context (as opposed to the reference context).
 *
 * For example, for the read [AGCTG] and a context size of 2, the keys are
 *   [-1, "AG", "GC", "CT", "TG" ]
 * with each string context encoded as an integer.
 *
 * @param bases       the bases in the read to build the context from
 * @param contextSize context size to use building the context
 * @param mask        mask for pulling out just the context bits
 * @param keys        output; resized to the read length and filled with the (preceding) n-base context key
 *                    at each position. Positions without enough preceding bases get
 *                    UNKNOWN_OR_ERROR_CONTEXT_CODE, and positions whose context contains a non-ACGT base get -1.
 */
void ContextCovariate::GetReadContextAtEachPosition(const string& bases, const int contextSize, const int mask, vector<int>& keys) {
    const int readLength = static_cast<int>(bases.size());
    keys.resize(readLength);
    int keyIdx = 0;
    // the first contextSize-1 bases will not have enough previous context
    for (int i = 1; i < contextSize && i <= readLength; i++) {
        keys[keyIdx++] = UNKNOWN_OR_ERROR_CONTEXT_CODE;
    }
    if (readLength < contextSize)
        return;

    // bit position at which the newest base of a full context is stored
    const int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS;

    // get (and add) the key for the context starting at the first base
    int currentKey = KeyFromContext(bases, 0, contextSize);
    keys[keyIdx++] = currentKey;

    // if the first key was -1 then there was an non-ACGT in the context; figure out how many more consecutive contexts it affects
    int currentNPenalty = 0;
    if (currentKey == -1) {
        currentKey = 0;
        currentNPenalty = contextSize - 1;
        int offset = newBaseOffset;
        int baseIndex;
        // Walk back from the end of the first context, keeping the valid bases seen so far.
        // The walk stops at the non-ACGT base that made the first key -1.
        // The index is masked to 0..255 so a negative (signed) char cannot index before the table.
        while ((baseIndex = baseIndexMap[bases[currentNPenalty] & 0xff]) != -1) {
            currentKey |= (baseIndex << offset);
            offset -= 2;
            currentNPenalty--;
        }
    }

    for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) {
        const int baseIndex = baseIndexMap[bases[currentIndex] & 0xff];
        if (baseIndex == -1) {  // ignore non-ACGT bases
            currentNPenalty = contextSize;
            currentKey = 0;  // reset the key
        } else {
            // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits,
            // and add the new base and the length in
            currentKey = (currentKey >> 2) & mask;
            currentKey |= (baseIndex << newBaseOffset);
            currentKey |= contextSize;
        }

        if (currentNPenalty == 0) {
            keys[keyIdx++] = currentKey;
        } else {
            // still within contextSize positions of a non-ACGT base: no valid context yet
            currentNPenalty--;
            keys[keyIdx++] = -1;
        }
    }
}

/**
 * Records the context covariate for every base of the read.
 *
 * A copy of the read bases is strand-oriented and low-quality-masked by
 * GetStrandedClippedBytes, context keys are computed on that copy, and each key is
 * written at the offset returned by GetStrandedOffset. Insertion and deletion keys use
 * the indel context size when recordIndelValues is true and are 0 otherwise.
 *
 * @param bw                 wrapper around the BAM record
 * @param ad                 per-read data; bases and read_len are used
 * @param header             unused here
 * @param values             covariate matrix written via CovariateUtils::SetCovariate
 * @param recordIndelValues  whether insertion/deletion keys should be recorded
 */
void ContextCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int fullLength = ad.read_len;

    // Work on a copy of the bases so low-quality ones can be overwritten with N.
    string contextBases(ad.bases);
    GetStrandedClippedBytes(bw, ad, contextBases, lowQualTail);

    vector<int> mismatchKeys;
    GetReadContextAtEachPosition(contextBases, mismatchesContextSize, mismatchesKeyMask, mismatchKeys);

    const int clippedLength = contextBases.size();

    // When clipping shortened the sequence, the loops below will not touch every position,
    // so clear all of them first to avoid keeping values left over from an earlier read.
    // (If the lengths match, everything is overwritten anyway.)
    if (clippedLength != fullLength) {
        for (int pos = 0; pos < fullLength; ++pos) {
            CovariateUtils::SetCovariate(0, 0, 0, pos, ContextCovariate::index, values);
        }
    }

    const bool onReverseStrand = bw->GetReadNegativeStrandFlag();

    // The two cases are kept as separate loops so recordIndelValues is tested only once.
    if (!recordIndelValues) {
        for (int pos = 0; pos < clippedLength; ++pos) {
            const int target = GetStrandedOffset(onReverseStrand, pos, clippedLength);
            CovariateUtils::SetCovariate(mismatchKeys[pos], 0, 0, target, ContextCovariate::index, values);
        }
        return;
    }

    vector<int> indelKeys;
    GetReadContextAtEachPosition(contextBases, indelsContextSize, indelsKeyMask, indelKeys);
    for (int pos = 0; pos < clippedLength; ++pos) {
        const int target = GetStrandedOffset(onReverseStrand, pos, clippedLength);
        const int indelKey = indelKeys[pos];
        CovariateUtils::SetCovariate(mismatchKeys[pos], indelKey, indelKey, target, ContextCovariate::index, values);
    }
}

// CycleCovariate methods

/**
 * Computes the encoded CycleCovariate key for one position of the read.
 *
 * The cycle counts up from 1 on the forward strand and down from the read length on the
 * reverse strand; it is negated when the read is the second of a pair. The final
 * encoding is delegated to KeyFromCycle(cycle, maxCycle).
 *
 * @param bw         wrapper around the BAM record (strand and pairing flags)
 * @param ad         per-read data; only read_len is used
 * @param baseNumber index of the base to compute the key for
 * @param indel      is this an indel key or a substitution key?
 * @param maxCycle   maximum cycle value, passed through to KeyFromCycle
 * @return the encoded key, or -1 for an indel key whose position lies within
 *         CUSHION_FOR_INDELS bases of either end of the read
 */
int CycleCovariate::CycleKey(BamWrap* bw, SamData& ad, const int baseNumber, const bool indel, const int maxCycle) {
    const bool onReverseStrand = bw->GetReadNegativeStrandFlag();
    const bool secondOfPair = (bw->b->core.flag & BAM_FPAIRED) && (bw->b->core.flag & BAM_FREAD2);
    const int readLength = ad.read_len;

    // +1 for a first/unpaired read, -1 for the second read of a pair
    const int sign = secondOfPair ? -1 : 1;

    // forward: 1, 2, 3, ...   reverse: readLength, readLength - 1, ...   (both times sign)
    const int cycle = onReverseStrand ? sign * (readLength - baseNumber) : sign * (1 + baseNumber);

    // Indel keys are not produced too close to either end of the read.
    const int lastIndelPosition = readLength - CUSHION_FOR_INDELS - 1;
    const bool insideCushion = baseNumber < CUSHION_FOR_INDELS || baseNumber > lastIndelPosition;
    if (indel && insideCushion) {
        return -1;
    }
    return CycleCovariate::KeyFromCycle(cycle, maxCycle);
}

/**
 * Records the cycle covariate for every base of the read.
 *
 * The substitution key comes from CycleKey with indel == false. The insertion and
 * deletion keys share one CycleKey result with indel == true when recordIndelValues is
 * set, and are 0 otherwise.
 *
 * @param bw                 wrapper around the BAM record
 * @param ad                 per-read data; read_len is used
 * @param header             unused here
 * @param values             covariate matrix written via CovariateUtils::SetCovariate
 * @param recordIndelValues  whether insertion/deletion keys should be recorded
 */
void CycleCovariate::RecordValues(BamWrap* bw, SamData& ad, sam_hdr_t* header, PerReadCovariateMatrix& values, bool recordIndelValues) {
    const int baseCount = ad.read_len;

    // The two cases are separate loops so recordIndelValues is not re-tested for every base.
    if (!recordIndelValues) {
        for (int pos = 0; pos < baseCount; ++pos) {
            const int mismatchKey = CycleKey(bw, ad, pos, false, MAXIMUM_CYCLE_VALUE);
            CovariateUtils::SetCovariate(mismatchKey, 0, 0, pos, CycleCovariate::index, values);
        }
        return;
    }

    for (int pos = 0; pos < baseCount; ++pos) {
        const int mismatchKey = CycleKey(bw, ad, pos, false, MAXIMUM_CYCLE_VALUE);
        const int indelKey = CycleKey(bw, ad, pos, true, MAXIMUM_CYCLE_VALUE);
        CovariateUtils::SetCovariate(mismatchKey, indelKey, indelKey, pos, CycleCovariate::index, values);
    }
}