picard_cpp/src/sam/markdups/md_funcs.cpp


#include "md_funcs.h"

#include <common/hts/bam_buf.h>
#include <common/utils/debug.h>
#include <common/utils/murmur3.h>
#include <common/utils/profiling.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <sam/utils/read_ends.h>
#include <stdint.h>

#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <math.h>
#include <unordered_map>

#include "dup_metrics.h"
#include "markdups_arg.h"
#include "shared_args.h"

using std::cerr;
using std::endl;
using std::map;
using std::set;
using std::unordered_map;
using std::vector;

/* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    auto &msIdx = *pmsIdx;
    if (msIdx.find(key) != msIdx.end())
        msIdx[key].clear();  // 清除该位点的冗余结果
}

/* 删除key位置的数据 */
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
    auto &msIdx = *pmsIdx;
    if (msIdx.find(key) != msIdx.end())
        msIdx.erase(key);
}

/*
 * 计算read的分数
 */
int16_t computeDuplicateScore(BamWrap &bw) {
    int16_t score = 0;
    switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
    case ns_md::SUM_OF_BASE_QUALITIES:
        // two (very) long reads worth of high-quality bases can go over
        // Short.MAX_VALUE/2 and risk overflow.
        score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
        break;
    case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
        if (!bw.GetReadUnmappedFlag())
            // no need to remember the score since this scoring mechanism is
            // symmetric
            score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
        break;
    case ns_md::RANDOM:
        // The RANDOM score gives the same score to both reads so that they get
        // filtered together. it's not critical do use the readName since the
        // scores from both ends get added, but it seem to be clearer this way.
        score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
        // subtract Short.MIN_VALUE/4 from it to end up with a number between
        // 0 and Short.MAX_VALUE/2. This number can be then discounted in case
        // the read is not passing filters. We need to stay far from overflow so
        // that when we add the two scores from the two read mates we do not
        // overflow since that could cause us to chose a failing read-pair
        // instead of a passing one.
        score -= INT16_MIN / 4;
    default:
        break;
    }
    // make sure that filter-failing records are heavily discounted. (the
    // discount can happen twice, once for each mate, so need to make sure we do
    // not subtract more than Short.MIN_VALUE overall.)
    score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;

    return score;
}

/*
 * Builds a read ends object that represents a single read.
 * 用来表示一个read的特征结构
 */
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
    auto &k = *pKey;
    auto &bc = bw.b->core;
    k.read1FirstOfPair = bw.GetFirstOfPairFlag();
    k.read1ReferenceIndex = bc.tid;
    k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
    k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;

    k.read1IndexInFile = index;
    k.score = computeDuplicateScore(bw);
    // Doing this lets the ends object know that it's part of a pair
    if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
        k.read2ReferenceIndex = bc.mtid;
    }
    // Fill in the location information for optical duplicates
    rnParser.AddLocationInformation(bw.query_name(), pKey);
    // cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
    // 计算位置key
    k.posKey =
        BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate);  // << 1 | k.orientation;
}

/**
 * Takes a list of ReadEndsForMarkDuplicates objects and identify the
 * representative read based on quality score. For all members of the duplicate
 * set, add the read1 index-in-file of the representative read to the records of
 * the first and second in a pair. This value becomes is used for the 'DI' tag.
 */
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}

/* 处理一组pairend的readends，标记冗余 */
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
                        DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
    if (vpRe.size() < 2) {
        if (vpRe.size() == 1) {
            // addSingletonToCount(libraryIdGenerator);
        }
        return;
    }
    // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
    auto &vDupIdx = dupIdx->AtPos(posKey);
    auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);

    int maxScore = 0;
    const ReadEnds *pBest = nullptr;
    /** All read ends should have orientation FF, FR, RF, or RR **/
    for (auto pe : vpRe)  // 找分数最高的readend
    {
        if (pe->score > maxScore || pBest == nullptr) {
            maxScore = pe->score;
            pBest = pe;
        }
    }
    if (!g_mdArg.READ_NAME_REGEX.empty())  // 检查光学冗余
    {
        // trackOpticalDuplicates
    }
    for (auto pe : vpRe)  // 对非best read标记冗余
    {
        if (pe != pBest)  // 非best
        {
            vDupIdx.push_back(pe->read1IndexInFile);  // 添加read1
            if (pe->read2IndexInFile != pe->read1IndexInFile)
                vDupIdx.push_back(pe->read2IndexInFile);  // 添加read2
        }
    }

    if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
        addRepresentativeReadIndex(vpRe);
    }
}

/* 处理一组非paired的readends，标记冗余 */
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
                            DupContainer<int64_t> *dupIdx) {
    auto &vDupIdx = dupIdx->AtPos(posKey);

    if (containsPairs) {
        for (auto pe : vpRe) {
            if (!pe->IsPaired()) {
                vDupIdx.push_back(pe->read1IndexInFile);
            }
        }
    } else {
        int maxScore = 0;
        const ReadEnds *pBest = nullptr;
        for (auto pe : vpRe) {
            if (pe->score > maxScore || pBest == nullptr) {
                maxScore = pe->score;
                pBest = pe;
            }
        }

        for (auto pe : vpRe) {
            if (pe != pBest) {
                vDupIdx.push_back(pe->read1IndexInFile);
            }
        }
    }
}

/* 处理位于某个坐标的pairend reads */
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
                        vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
                        DupContainer<int64_t> *opticalDupIdx) {
    if (readEnds.size() > 1) { // 有潜在的冗余
        vpCache.clear();
        //        std::sort(readEnds.begin(), readEnds.end());
        const ReadEnds *pReadEnd = nullptr;
        for (auto &re : readEnds) {
            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true))  // 跟前一个一样
                vpCache.push_back(&re);  // 处理一个潜在的冗余组
            else {
                markDuplicatePairs(posKey, vpCache, dupIdx,
                                   opticalDupIdx);  // 不一样
                vpCache.clear();
                vpCache.push_back(&re);
                pReadEnd = &re;
            }
        }
        markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
    }
}

/* 处理位于某个坐标的 reads */
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
                        vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
    if (readEnds.size() > 1)  // 有潜在的冗余
    {
        vpCache.clear();
        //        std::sort(readEnds.begin(), readEnds.end());
        const ReadEnds *pReadEnd = nullptr;
        bool containsPairs = false;
        bool containsFrags = false;
        for (auto &re : readEnds) {
            if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
                vpCache.push_back(&re);
                containsPairs = containsPairs || re.IsPaired();
                containsFrags = containsFrags || !re.IsPaired();
            } else {
                if (vpCache.size() > 1 && containsFrags) {
                    markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
                }
                vpCache.clear();
                vpCache.push_back(&re);
                pReadEnd = &re;
                containsPairs = re.IsPaired();
                containsFrags = !re.IsPaired();
            }
        }
        if (vpCache.size() > 1 && containsFrags) {
            markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
        }
    }
}

/* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
    auto &pairedEnds = *pPairedEnds;

    int64_t bamIdx = fragEnd.read1IndexInFile;
    const int matesRefIndex = fragEnd.read1ReferenceIndex;
    const int matesCoordinate = fragEnd.read1Coordinate;
    // Set orientationForOpticalDuplicates, which always goes by the first then
    // the second end for the strands.  NB: must do this before updating the
    // orientation later.
    if (fragEnd.read1FirstOfPair) {
        pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
            fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
    } else {
        pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
            pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
    }
    // If the other read is actually later, simply add the other read's data as
    // read2, else flip the reads
    if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
        (matesRefIndex == pairedEnds.read1ReferenceIndex &&
         matesCoordinate >= pairedEnds.read1Coordinate)) {
        pairedEnds.read2ReferenceIndex = matesRefIndex;
        pairedEnds.read2Coordinate = matesCoordinate;
        pairedEnds.read2IndexInFile = bamIdx;
        pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
                                                              fragEnd.IsNegativeStrand());

        // if the two read ends are in the same position, pointing in opposite
        // directions, the orientation is undefined and the procedure above will
        // depend on the order of the reads in the file. To avoid this, we set
        // it explicitly (to FR):
        if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
            pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
            pairedEnds.orientation == ReadEnds::RF) {
            pairedEnds.orientation = ReadEnds::FR;
        }
    } else {
        pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
        pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
        pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
        pairedEnds.read1ReferenceIndex = matesRefIndex;
        pairedEnds.read1Coordinate = matesCoordinate;
        pairedEnds.read1IndexInFile = bamIdx;
        pairedEnds.orientation = ReadEnds::GetOrientationByte(
            fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
        pairedEnds.posKey = fragEnd.posKey;
    }
    pairedEnds.score += fragEnd.score;
}

static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
    return lhs != rhs &&  // no comparing an object to itself (checked using object identity)!
           (lhs->tile != ReadEnds::NO_VALUE) &&
           (rhs->tile != ReadEnds::NO_VALUE) &&  // no comparing objects without locations
           lhs->tile == rhs->tile &&             // and the same tile
           abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}

static inline bool closeEnoughShort(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
    return lhs != rhs && abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}

/**
 * Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
 * one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
 * optical duplication and the rest are flagged as optical duplicates.  The set of reads that are considered
 * optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
 * read appeared in the input list of physical locations.
 *
 * @param list   a list of reads that are determined to be duplicates of one another
 * @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
 *               annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
 *               contained within the list! (always not be null!)
 * @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
 */
static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
                           vector<bool> *pOpticalDuplicateFlags) {
    const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
    const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;

    vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
    // opticalDuplicateFlags.push_back(true); // for test
    int len = readEndsArr.size();
    // If there is only one or zero reads passed in (so there are obviously no optical duplicates),
    // or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
    // then just return an array of all false
    if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
        return;
    }

    if (len >= 4) {
        /**
         * Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
         * optical duplicates
         * getOpticalDuplicatesFlagWithGraph
         */
        // Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
        // we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
        Graph<int> opticalDistanceRelationGraph;
        unordered_map<int, vector<int>> tileRGmap;
        int keeperIndex = -1;
        for (int i = 0; i < readEndsArr.size(); ++i) {
            const ReadEnds *currentLoc = readEndsArr[i];
            if (currentLoc == pBestRe)
                keeperIndex = i;
            if (currentLoc->tile != ReadEnds::NO_VALUE) {
                int key = currentLoc->tile; // 只处理一个样本，所以只有一个read group
                tileRGmap[key].push_back(i);
            }
            opticalDistanceRelationGraph.addNode(i);
        }
        // Since because finding adjacent optical duplicates is an O(n^2) operation, we can subdivide the input into its
        // readgroups in order to reduce the amount of redundant checks across readgroups between reads.
        // fillGraphFromAGroup
        for (auto &entry : tileRGmap) {
            auto &groupList = entry.second;
            if (groupList.size() > 1) {
                for (int i = 0; i < groupList.size(); ++i) {
                    int iIndex = groupList[i];
                    const ReadEnds *currentLoc = readEndsArr[iIndex];
                    for (int j = i + 1; j < groupList.size(); ++j) {
                        int jIndex = groupList[j];
                        const ReadEnds *other = readEndsArr[jIndex];
                        if (closeEnoughShort(currentLoc, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE))
                            opticalDistanceRelationGraph.addEdge(iIndex, jIndex);
                    }
                }
            }
        }
        // Keep a map of the reads and their cluster assignments
        unordered_map<int, int> opticalDuplicateClusterMap;
        opticalDistanceRelationGraph.cluster(&opticalDuplicateClusterMap);
        unordered_map<int, int> clusterToRepresentativeRead;
        int keeperCluster = -1;
        // Specially mark the keeper as specifically not a duplicate if it exists
        if (keeperIndex >= 0) {
            keeperCluster = opticalDuplicateClusterMap[keeperIndex];
            clusterToRepresentativeRead[keeperCluster] = keeperIndex;
        }

        for (auto &entry: opticalDuplicateClusterMap) {
            int recordIndex = entry.first;
            int recordAssignedCluster = entry.second;
            // If its not the first read we've seen for this cluster, mark it as an optical duplicate
            auto repReadItr = clusterToRepresentativeRead.find(recordAssignedCluster);
            if (repReadItr != clusterToRepresentativeRead.end() && recordIndex != keeperIndex) {
                const ReadEnds *representativeLoc = readEndsArr[repReadItr->second];
                const ReadEnds *currentRecordLoc = readEndsArr[recordIndex];
                // If not in the keeper cluster, then keep the minX -> minY valued duplicate (note the tile must be
                // equal for reads to cluster together)
                if (!(keeperIndex >= 0 && recordAssignedCluster == keeperCluster) && 
                        (currentRecordLoc->x < representativeLoc->x || 
                        (currentRecordLoc->x == representativeLoc->x && currentRecordLoc->y < representativeLoc->y))) {
                    // Mark the old min as an optical duplicate, and save the new min
                    opticalDuplicateFlags[repReadItr->second] = true;
                    clusterToRepresentativeRead[recordAssignedCluster] = recordIndex;
                } else {
                    // If a smaller read has already been visited, mark the test read as an optical duplicate
                    opticalDuplicateFlags[recordIndex] = true;
                }
            } else {
                clusterToRepresentativeRead[recordAssignedCluster] = recordIndex;
            }
        }
    } else {
        /**
         * Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
         * distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
         * readgroup or 3x with the keeper present
         * getOpticalDuplicatesFlagFast
         */
        // First go through and compare all the reads to the keeper
        for (int i = 0; i < len; ++i) {
            const ReadEnds *other = readEndsArr[i];
            opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
        }
        // Now go through and do each pairwise comparison not involving the actualKeeper
        for (int i = 0; i < len; ++i) {
            const ReadEnds *lhs = readEndsArr[i];
            if (lhs == pBestRe)  // no comparisons to actualKeeper since those are all handled above
                continue;
            for (int j = i + 1; j < len; ++j) {
                const ReadEnds *rhs = readEndsArr[j];
                if (rhs == pBestRe)  // no comparisons to actualKeeper since those are all handled above
                    continue;
                if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
                    continue;  // both already marked, no need to check
                if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
                    // At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
                    // as a duplicate of the keeper (but not both - that's checked above), so be careful about which
                    // one to now mark as a duplicate.
                    int index = opticalDuplicateFlags[j] ? i : j;
                    opticalDuplicateFlags[index] = true;
                }
            }
        }
    }
}

/**
 * Looks through the set of reads and identifies how many of the duplicates are
 * in fact optical duplicates, and stores the data in the instance level histogram.
 *
 * We expect only reads with FR or RF orientations, not a mixture of both.
 *
 * In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
 * of the mate.  In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
 * when we order orientation by the first mate sequenced (read #1 of the pair).
 */
static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
    vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
    // find OpticalDuplicates
    findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
    int opticalDuplicates = 0;
    for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
        ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
        if (opticalDuplicateFlags[i]) {
            ++opticalDuplicates;
            // if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
            //     cout << "val: " << pRe->isOpticalDuplicate << endl;
            // }
            pRe->isOpticalDuplicate = true;
            zzhopticalSet.insert(pRe->read1IndexInFile);
            zzhopticalSet.insert(pRe->read2IndexInFile);
            zzhopticalArr.push_back(pRe->read1IndexInFile);
            zzhopticalArr.push_back(pRe->read2IndexInFile);
        } else {
            pRe->isOpticalDuplicate = false;
            zzhopticalSet.erase(pRe->read1IndexInFile);
            zzhopticalSet.erase(pRe->read2IndexInFile);
        }
    }
    return opticalDuplicates;
}

/**
 * 记录光学原因造成的冗余
 */
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
    bool hasFR = false, hasRF = false;
    int prevOpticalDupNum = 0;
    // Check to see if we have a mixture of FR/RF
    for (auto pRe : readEndsArr) {
        if (pRe->isOpticalDuplicate)
            ++prevOpticalDupNum;
        if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
            hasFR = true;
        else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
            hasRF = true;
    }

    // Check if we need to partition since the orientations could have changed
    int nOpticalDup;
    if (hasFR && hasRF) { // need to track them independently
        vector<const ReadEnds *> trackOpticalDuplicatesF;
        vector<const ReadEnds *> trackOpticalDuplicatesR;
        // Split into two lists: first of pairs and second of pairs, 
        // since they must have orientation and same starting end
        for (auto pRe: readEndsArr) {
            if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
                trackOpticalDuplicatesF.push_back(pRe);
            else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
                trackOpticalDuplicatesR.push_back(pRe);
            else
                cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
        }
        // track the duplicates
        int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
        int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
        nOpticalDup = nOpticalDupF + nOpticalDupR;
    } else {  // No need to partition
        nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
    }

    // trackDuplicateCounts
    gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
    if (readEndsArr.size() > nOpticalDup)
        gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
    if (nOpticalDup)
        gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;


    gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
    //gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;

    // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
    // cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
}