picard_cpp/src/sam/markdups/md_funcs.cpp

533 lines
24 KiB
C++
Raw Normal View History

#include "md_funcs.h"
#include <common/hts/bam_buf.h>
#include <common/utils/debug.h>
#include <common/utils/murmur3.h>
#include <common/utils/profiling.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <sam/utils/read_ends.h>
#include <stdint.h>
#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <math.h>
#include <unordered_map>
#include "dup_metrics.h"
#include "markdups_arg.h"
#include "shared_args.h"
using std::cerr;
using std::endl;
using std::map;
using std::set;
using std::unordered_map;
using std::vector;
/* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx[key].clear(); // 清除该位点的冗余结果
}
/* 删除key位置的数据 */
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx.erase(key);
}
/*
* read
*/
int16_t computeDuplicateScore(BamWrap &bw) {
int16_t score = 0;
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
case ns_md::SUM_OF_BASE_QUALITIES:
// two (very) long reads worth of high-quality bases can go over
// Short.MAX_VALUE/2 and risk overflow.
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
break;
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
if (!bw.GetReadUnmappedFlag())
// no need to remember the score since this scoring mechanism is
// symmetric
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
break;
case ns_md::RANDOM:
// The RANDOM score gives the same score to both reads so that they get
// filtered together. it's not critical do use the readName since the
// scores from both ends get added, but it seem to be clearer this way.
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
// subtract Short.MIN_VALUE/4 from it to end up with a number between
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case
// the read is not passing filters. We need to stay far from overflow so
// that when we add the two scores from the two read mates we do not
// overflow since that could cause us to chose a failing read-pair
// instead of a passing one.
score -= INT16_MIN / 4;
default:
break;
}
// make sure that filter-failing records are heavily discounted. (the
// discount can happen twice, once for each mate, so need to make sure we do
// not subtract more than Short.MIN_VALUE overall.)
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
return score;
}
/*
* Builds a read ends object that represents a single read.
* read
*/
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
auto &k = *pKey;
auto &bc = bw.b->core;
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
k.read1ReferenceIndex = bc.tid;
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
k.read1IndexInFile = index;
k.score = computeDuplicateScore(bw);
// Doing this lets the ends object know that it's part of a pair
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
k.read2ReferenceIndex = bc.mtid;
}
// Fill in the location information for optical duplicates
rnParser.AddLocationInformation(bw.query_name(), pKey);
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
// 计算位置key
k.posKey =
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
* representative read based on quality score. For all members of the duplicate
* set, add the read1 index-in-file of the representative read to the records of
* the first and second in a pair. This value becomes is used for the 'DI' tag.
*/
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
/* 处理一组pairend的readends标记冗余 */
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
if (vpRe.size() < 2) {
if (vpRe.size() == 1) {
// addSingletonToCount(libraryIdGenerator);
}
return;
}
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
auto &vDupIdx = dupIdx->AtPos(posKey);
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) // 找分数最高的readend
{
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
{
// trackOpticalDuplicates
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
}
}
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
addRepresentativeReadIndex(vpRe);
}
}
/* 处理一组非paired的readends标记冗余 */
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
DupContainer<int64_t> *dupIdx) {
auto &vDupIdx = dupIdx->AtPos(posKey);
if (containsPairs) {
for (auto pe : vpRe) {
if (!pe->IsPaired()) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
} else {
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe) {
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
for (auto pe : vpRe) {
if (pe != pBest) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
}
}
/* 处理位于某个坐标的pairend reads */
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
DupContainer<int64_t> *opticalDupIdx) {
if (readEnds.size() > 1) { // 有潜在的冗余
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else {
markDuplicatePairs(posKey, vpCache, dupIdx,
opticalDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
}
}
/* 处理位于某个坐标的 reads */
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
if (readEnds.size() > 1) // 有潜在的冗余
{
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
bool containsPairs = false;
bool containsFrags = false;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
} else {
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
}
}
/* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds;
2024-08-28 12:00:23 +08:00
int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then
// the second end for the strands. NB: must do this before updating the
// orientation later.
if (fragEnd.read1FirstOfPair) {
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
} else {
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
}
// If the other read is actually later, simply add the other read's data as
// read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex &&
matesCoordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
fragEnd.IsNegativeStrand());
// if the two read ends are in the same position, pointing in opposite
// directions, the orientation is undefined and the procedure above will
// depend on the order of the reads in the file. To avoid this, we set
// it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds::RF) {
pairedEnds.orientation = ReadEnds::FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
pairedEnds.posKey = fragEnd.posKey;
}
pairedEnds.score += fragEnd.score;
}
static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
return lhs != rhs && // no comparing an object to itself (checked using object identity)!
(lhs->tile != ReadEnds::NO_VALUE) &&
(rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations
lhs->tile == rhs->tile && // and the same tile
abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}
static inline bool closeEnoughShort(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
return lhs != rhs && abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}
/**
* Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
* one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
* optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered
* optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
* read appeared in the input list of physical locations.
*
* @param list a list of reads that are determined to be duplicates of one another
* @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
* annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
* contained within the list! (always not be null!)
* @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
*/
static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
vector<bool> *pOpticalDuplicateFlags) {
const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;
vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
// opticalDuplicateFlags.push_back(true); // for test
int len = readEndsArr.size();
// If there is only one or zero reads passed in (so there are obviously no optical duplicates),
// or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
// then just return an array of all false
if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
return;
}
if (len >= 4) {
/**
* Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
* optical duplicates
* getOpticalDuplicatesFlagWithGraph
*/
// Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
// we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
Graph<int> opticalDistanceRelationGraph;
unordered_map<int, vector<int>> tileRGmap;
int keeperIndex = -1;
for (int i = 0; i < readEndsArr.size(); ++i) {
const ReadEnds *currentLoc = readEndsArr[i];
if (currentLoc == pBestRe)
keeperIndex = i;
if (currentLoc->tile != ReadEnds::NO_VALUE) {
int key = currentLoc->tile; // 只处理一个样本所以只有一个read group
tileRGmap[key].push_back(i);
}
opticalDistanceRelationGraph.addNode(i);
}
// Since because finding adjacent optical duplicates is an O(n^2) operation, we can subdivide the input into its
// readgroups in order to reduce the amount of redundant checks across readgroups between reads.
// fillGraphFromAGroup
for (auto &entry : tileRGmap) {
auto &groupList = entry.second;
if (groupList.size() > 1) {
for (int i = 0; i < groupList.size(); ++i) {
int iIndex = groupList[i];
const ReadEnds *currentLoc = readEndsArr[iIndex];
for (int j = i + 1; j < groupList.size(); ++j) {
int jIndex = groupList[j];
const ReadEnds *other = readEndsArr[jIndex];
if (closeEnoughShort(currentLoc, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE))
opticalDistanceRelationGraph.addEdge(iIndex, jIndex);
}
}
}
}
// Keep a map of the reads and their cluster assignments
unordered_map<int, int> opticalDuplicateClusterMap;
opticalDistanceRelationGraph.cluster(&opticalDuplicateClusterMap);
unordered_map<int, int> clusterToRepresentativeRead;
int keeperCluster = -1;
// Specially mark the keeper as specifically not a duplicate if it exists
if (keeperIndex >= 0) {
keeperCluster = opticalDuplicateClusterMap[keeperIndex];
clusterToRepresentativeRead[keeperCluster] = keeperIndex;
}
for (auto &entry: opticalDuplicateClusterMap) {
int recordIndex = entry.first;
int recordAssignedCluster = entry.second;
// If its not the first read we've seen for this cluster, mark it as an optical duplicate
auto repReadItr = clusterToRepresentativeRead.find(recordAssignedCluster);
if (repReadItr != clusterToRepresentativeRead.end() && recordIndex != keeperIndex) {
const ReadEnds *representativeLoc = readEndsArr[repReadItr->second];
const ReadEnds *currentRecordLoc = readEndsArr[recordIndex];
// If not in the keeper cluster, then keep the minX -> minY valued duplicate (note the tile must be
// equal for reads to cluster together)
if (!(keeperIndex >= 0 && recordAssignedCluster == keeperCluster) &&
(currentRecordLoc->x < representativeLoc->x ||
(currentRecordLoc->x == representativeLoc->x && currentRecordLoc->y < representativeLoc->y))) {
// Mark the old min as an optical duplicate, and save the new min
opticalDuplicateFlags[repReadItr->second] = true;
clusterToRepresentativeRead[recordAssignedCluster] = recordIndex;
} else {
// If a smaller read has already been visited, mark the test read as an optical duplicate
opticalDuplicateFlags[recordIndex] = true;
}
} else {
clusterToRepresentativeRead[recordAssignedCluster] = recordIndex;
}
}
} else {
/**
* Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
* distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
* readgroup or 3x with the keeper present
* getOpticalDuplicatesFlagFast
*/
// First go through and compare all the reads to the keeper
for (int i = 0; i < len; ++i) {
const ReadEnds *other = readEndsArr[i];
opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
}
// Now go through and do each pairwise comparison not involving the actualKeeper
for (int i = 0; i < len; ++i) {
const ReadEnds *lhs = readEndsArr[i];
if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
continue;
for (int j = i + 1; j < len; ++j) {
const ReadEnds *rhs = readEndsArr[j];
if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
continue;
if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
continue; // both already marked, no need to check
if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
// At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
// as a duplicate of the keeper (but not both - that's checked above), so be careful about which
// one to now mark as a duplicate.
int index = opticalDuplicateFlags[j] ? i : j;
opticalDuplicateFlags[index] = true;
}
}
}
}
}
/**
* Looks through the set of reads and identifies how many of the duplicates are
* in fact optical duplicates, and stores the data in the instance level histogram.
*
* We expect only reads with FR or RF orientations, not a mixture of both.
*
* In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
* of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
* when we order orientation by the first mate sequenced (read #1 of the pair).
*/
static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
// find OpticalDuplicates
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
int opticalDuplicates = 0;
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
2024-08-28 12:00:23 +08:00
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
if (opticalDuplicateFlags[i]) {
++opticalDuplicates;
2024-08-28 12:00:23 +08:00
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
// cout << "val: " << pRe->isOpticalDuplicate << endl;
// }
pRe->isOpticalDuplicate = true;
2024-08-28 12:00:23 +08:00
zzhopticalSet.insert(pRe->read1IndexInFile);
zzhopticalSet.insert(pRe->read2IndexInFile);
zzhopticalArr.push_back(pRe->read1IndexInFile);
zzhopticalArr.push_back(pRe->read2IndexInFile);
} else {
pRe->isOpticalDuplicate = false;
zzhopticalSet.erase(pRe->read1IndexInFile);
zzhopticalSet.erase(pRe->read2IndexInFile);
}
}
return opticalDuplicates;
}
/**
*
*/
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false;
2024-08-28 12:00:23 +08:00
int prevOpticalDupNum = 0;
// Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) {
2024-08-28 12:00:23 +08:00
if (pRe->isOpticalDuplicate)
++prevOpticalDupNum;
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
hasRF = true;
}
// Check if we need to partition since the orientations could have changed
int nOpticalDup;
if (hasFR && hasRF) { // need to track them independently
vector<const ReadEnds *> trackOpticalDuplicatesF;
vector<const ReadEnds *> trackOpticalDuplicatesR;
// Split into two lists: first of pairs and second of pairs,
// since they must have orientation and same starting end
for (auto pRe: readEndsArr) {
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
trackOpticalDuplicatesF.push_back(pRe);
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
trackOpticalDuplicatesR.push_back(pRe);
else
cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
}
// track the duplicates
int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
nOpticalDup = nOpticalDupF + nOpticalDupR;
} else { // No need to partition
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
}
// trackDuplicateCounts
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
if (readEndsArr.size() > nOpticalDup)
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
2024-08-28 12:00:23 +08:00
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
2024-08-28 12:00:23 +08:00
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
}