picard_cpp/src/sam/markdups/md_funcs.cpp

533 lines
24 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include "md_funcs.h"
#include <common/hts/bam_buf.h>
#include <common/utils/debug.h>
#include <common/utils/murmur3.h>
#include <common/utils/profiling.h>
#include <common/utils/timer.h>
#include <common/utils/util.h>
#include <sam/utils/read_ends.h>
#include <stdint.h>
#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <math.h>
#include <unordered_map>
#include "dup_metrics.h"
#include "markdups_arg.h"
#include "shared_args.h"
using std::cerr;
using std::endl;
using std::map;
using std::set;
using std::unordered_map;
using std::vector;
/* 清除key位置的数据 */
void clearIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx[key].clear(); // 清除该位点的冗余结果
}
/* 删除key位置的数据 */
void delIdxAtPos(int64_t key, map<int64_t, set<int64_t>> *pmsIdx) {
auto &msIdx = *pmsIdx;
if (msIdx.find(key) != msIdx.end())
msIdx.erase(key);
}
/*
* 计算read的分数
*/
int16_t computeDuplicateScore(BamWrap &bw) {
int16_t score = 0;
switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) {
case ns_md::SUM_OF_BASE_QUALITIES:
// two (very) long reads worth of high-quality bases can go over
// Short.MAX_VALUE/2 and risk overflow.
score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2);
break;
case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH:
if (!bw.GetReadUnmappedFlag())
// no need to remember the score since this scoring mechanism is
// symmetric
score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2);
break;
case ns_md::RANDOM:
// The RANDOM score gives the same score to both reads so that they get
// filtered together. it's not critical do use the readName since the
// scores from both ends get added, but it seem to be clearer this way.
score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111);
// subtract Short.MIN_VALUE/4 from it to end up with a number between
// 0 and Short.MAX_VALUE/2. This number can be then discounted in case
// the read is not passing filters. We need to stay far from overflow so
// that when we add the two scores from the two read mates we do not
// overflow since that could cause us to chose a failing read-pair
// instead of a passing one.
score -= INT16_MIN / 4;
default:
break;
}
// make sure that filter-failing records are heavily discounted. (the
// discount can happen twice, once for each mate, so need to make sure we do
// not subtract more than Short.MIN_VALUE overall.)
score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0;
return score;
}
/*
* Builds a read ends object that represents a single read.
* 用来表示一个read的特征结构
*/
void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) {
auto &k = *pKey;
auto &bc = bw.b->core;
k.read1FirstOfPair = bw.GetFirstOfPairFlag();
k.read1ReferenceIndex = bc.tid;
k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart();
k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F;
k.read1IndexInFile = index;
k.score = computeDuplicateScore(bw);
// Doing this lets the ends object know that it's part of a pair
if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) {
k.read2ReferenceIndex = bc.mtid;
}
// Fill in the location information for optical duplicates
rnParser.AddLocationInformation(bw.query_name(), pKey);
// cout << k.tile << ' ' << k.x << ' ' << k.y << endl;
// 计算位置key
k.posKey =
BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation;
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and identify the
* representative read based on quality score. For all members of the duplicate
* set, add the read1 index-in-file of the representative read to the records of
* the first and second in a pair. This value becomes is used for the 'DI' tag.
*/
void addRepresentativeReadIndex(vector<const ReadEnds *> &vpRe) {}
/* 处理一组pairend的readends标记冗余 */
void markDuplicatePairs(int64_t posKey, vector<const ReadEnds *> &vpRe,
DupContainer<int64_t> *dupIdx, DupContainer<int64_t> *opticalDupIdx) {
if (vpRe.size() < 2) {
if (vpRe.size() == 1) {
// addSingletonToCount(libraryIdGenerator);
}
return;
}
// cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl;
auto &vDupIdx = dupIdx->AtPos(posKey);
auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey);
int maxScore = 0;
const ReadEnds *pBest = nullptr;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (auto pe : vpRe) // 找分数最高的readend
{
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余
{
// trackOpticalDuplicates
}
for (auto pe : vpRe) // 对非best read标记冗余
{
if (pe != pBest) // 非best
{
vDupIdx.push_back(pe->read1IndexInFile); // 添加read1
if (pe->read2IndexInFile != pe->read1IndexInFile)
vDupIdx.push_back(pe->read2IndexInFile); // 添加read2
}
}
if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) {
addRepresentativeReadIndex(vpRe);
}
}
/* 处理一组非paired的readends标记冗余 */
void markDuplicateFragments(int64_t posKey, vector<const ReadEnds *> &vpRe, bool containsPairs,
DupContainer<int64_t> *dupIdx) {
auto &vDupIdx = dupIdx->AtPos(posKey);
if (containsPairs) {
for (auto pe : vpRe) {
if (!pe->IsPaired()) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
} else {
int maxScore = 0;
const ReadEnds *pBest = nullptr;
for (auto pe : vpRe) {
if (pe->score > maxScore || pBest == nullptr) {
maxScore = pe->score;
pBest = pe;
}
}
for (auto pe : vpRe) {
if (pe != pBest) {
vDupIdx.push_back(pe->read1IndexInFile);
}
}
}
}
/* 处理位于某个坐标的pairend reads */
void handlePairs(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx,
DupContainer<int64_t> *opticalDupIdx) {
if (readEnds.size() > 1) { // 有潜在的冗余
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样
vpCache.push_back(&re); // 处理一个潜在的冗余组
else {
markDuplicatePairs(posKey, vpCache, dupIdx,
opticalDupIdx); // 不一样
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
}
}
markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx);
}
}
/* 处理位于某个坐标的 reads */
void handleFrags(int64_t posKey, vector<ReadEnds> &readEnds,
vector<const ReadEnds *> &vpCache, DupContainer<int64_t> *dupIdx) {
if (readEnds.size() > 1) // 有潜在的冗余
{
vpCache.clear();
// std::sort(readEnds.begin(), readEnds.end());
const ReadEnds *pReadEnd = nullptr;
bool containsPairs = false;
bool containsFrags = false;
for (auto &re : readEnds) {
if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) {
vpCache.push_back(&re);
containsPairs = containsPairs || re.IsPaired();
containsFrags = containsFrags || !re.IsPaired();
} else {
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
vpCache.clear();
vpCache.push_back(&re);
pReadEnd = &re;
containsPairs = re.IsPaired();
containsFrags = !re.IsPaired();
}
}
if (vpCache.size() > 1 && containsFrags) {
markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx);
}
}
}
/* 对找到的pairend read end添加一些信息 */
void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) {
auto &pairedEnds = *pPairedEnds;
int64_t bamIdx = fragEnd.read1IndexInFile;
const int matesRefIndex = fragEnd.read1ReferenceIndex;
const int matesCoordinate = fragEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then
// the second end for the strands. NB: must do this before updating the
// orientation later.
if (fragEnd.read1FirstOfPair) {
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
} else {
pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte(
pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand());
}
// If the other read is actually later, simply add the other read's data as
// read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex &&
matesCoordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R,
fragEnd.IsNegativeStrand());
// if the two read ends are in the same position, pointing in opposite
// directions, the orientation is undefined and the procedure above will
// depend on the order of the reads in the file. To avoid this, we set
// it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds::RF) {
pairedEnds.orientation = ReadEnds::FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = bamIdx;
pairedEnds.orientation = ReadEnds::GetOrientationByte(
fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R);
pairedEnds.posKey = fragEnd.posKey;
}
pairedEnds.score += fragEnd.score;
}
static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
return lhs != rhs && // no comparing an object to itself (checked using object identity)!
(lhs->tile != ReadEnds::NO_VALUE) &&
(rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations
lhs->tile == rhs->tile && // and the same tile
abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}
static inline bool closeEnoughShort(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) {
return lhs != rhs && abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance;
}
/**
* Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
* one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
* optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered
* optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
* read appeared in the input list of physical locations.
*
* @param list a list of reads that are determined to be duplicates of one another
* @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
* annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
* contained within the list! (always not be null!)
* @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
*/
static void findOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe,
vector<bool> *pOpticalDuplicateFlags) {
const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000;
vector<bool> &opticalDuplicateFlags = *pOpticalDuplicateFlags;
// opticalDuplicateFlags.push_back(true); // for test
int len = readEndsArr.size();
// If there is only one or zero reads passed in (so there are obviously no optical duplicates),
// or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm),
// then just return an array of all false
if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) {
return;
}
if (len >= 4) {
/**
* Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive
* optical duplicates
* getOpticalDuplicatesFlagWithGraph
*/
// Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other,
// we will then use the union-find algorithm to cluster the graph and find optical duplicate groups
Graph<int> opticalDistanceRelationGraph;
unordered_map<int, vector<int>> tileRGmap;
int keeperIndex = -1;
for (int i = 0; i < readEndsArr.size(); ++i) {
const ReadEnds *currentLoc = readEndsArr[i];
if (currentLoc == pBestRe)
keeperIndex = i;
if (currentLoc->tile != ReadEnds::NO_VALUE) {
int key = currentLoc->tile; // 只处理一个样本所以只有一个read group
tileRGmap[key].push_back(i);
}
opticalDistanceRelationGraph.addNode(i);
}
// Since because finding adjacent optical duplicates is an O(n^2) operation, we can subdivide the input into its
// readgroups in order to reduce the amount of redundant checks across readgroups between reads.
// fillGraphFromAGroup
for (auto &entry : tileRGmap) {
auto &groupList = entry.second;
if (groupList.size() > 1) {
for (int i = 0; i < groupList.size(); ++i) {
int iIndex = groupList[i];
const ReadEnds *currentLoc = readEndsArr[iIndex];
for (int j = i + 1; j < groupList.size(); ++j) {
int jIndex = groupList[j];
const ReadEnds *other = readEndsArr[jIndex];
if (closeEnoughShort(currentLoc, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE))
opticalDistanceRelationGraph.addEdge(iIndex, jIndex);
}
}
}
}
// Keep a map of the reads and their cluster assignments
unordered_map<int, int> opticalDuplicateClusterMap;
opticalDistanceRelationGraph.cluster(&opticalDuplicateClusterMap);
unordered_map<int, int> clusterToRepresentativeRead;
int keeperCluster = -1;
// Specially mark the keeper as specifically not a duplicate if it exists
if (keeperIndex >= 0) {
keeperCluster = opticalDuplicateClusterMap[keeperIndex];
clusterToRepresentativeRead[keeperCluster] = keeperIndex;
}
for (auto &entry: opticalDuplicateClusterMap) {
int recordIndex = entry.first;
int recordAssignedCluster = entry.second;
// If its not the first read we've seen for this cluster, mark it as an optical duplicate
auto repReadItr = clusterToRepresentativeRead.find(recordAssignedCluster);
if (repReadItr != clusterToRepresentativeRead.end() && recordIndex != keeperIndex) {
const ReadEnds *representativeLoc = readEndsArr[repReadItr->second];
const ReadEnds *currentRecordLoc = readEndsArr[recordIndex];
// If not in the keeper cluster, then keep the minX -> minY valued duplicate (note the tile must be
// equal for reads to cluster together)
if (!(keeperIndex >= 0 && recordAssignedCluster == keeperCluster) &&
(currentRecordLoc->x < representativeLoc->x ||
(currentRecordLoc->x == representativeLoc->x && currentRecordLoc->y < representativeLoc->y))) {
// Mark the old min as an optical duplicate, and save the new min
opticalDuplicateFlags[repReadItr->second] = true;
clusterToRepresentativeRead[recordAssignedCluster] = recordIndex;
} else {
// If a smaller read has already been visited, mark the test read as an optical duplicate
opticalDuplicateFlags[recordIndex] = true;
}
} else {
clusterToRepresentativeRead[recordAssignedCluster] = recordIndex;
}
}
} else {
/**
* Compute optical duplicates quickly in the standard case where we know that there won't be any transitive
* distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a
* readgroup or 3x with the keeper present
* getOpticalDuplicatesFlagFast
*/
// First go through and compare all the reads to the keeper
for (int i = 0; i < len; ++i) {
const ReadEnds *other = readEndsArr[i];
opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE);
}
// Now go through and do each pairwise comparison not involving the actualKeeper
for (int i = 0; i < len; ++i) {
const ReadEnds *lhs = readEndsArr[i];
if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
continue;
for (int j = i + 1; j < len; ++j) {
const ReadEnds *rhs = readEndsArr[j];
if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above
continue;
if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j])
continue; // both already marked, no need to check
if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) {
// At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
// as a duplicate of the keeper (but not both - that's checked above), so be careful about which
// one to now mark as a duplicate.
int index = opticalDuplicateFlags[j] ? i : j;
opticalDuplicateFlags[index] = true;
}
}
}
}
}
/**
* Looks through the set of reads and identifies how many of the duplicates are
* in fact optical duplicates, and stores the data in the instance level histogram.
*
* We expect only reads with FR or RF orientations, not a mixture of both.
*
* In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end
* of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF
* when we order orientation by the first mate sequenced (read #1 of the pair).
*/
static int checkOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
vector<bool> opticalDuplicateFlags(readEndsArr.size(), false);
// find OpticalDuplicates
findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags);
int opticalDuplicates = 0;
for (int i = 0; i < opticalDuplicateFlags.size(); ++i) {
ReadEnds *pRe = const_cast<ReadEnds *>(readEndsArr[i]);
if (opticalDuplicateFlags[i]) {
++opticalDuplicates;
// if (zzhopticalSet.find(pRe->read1IndexInFile) != zzhopticalSet.end()) {
// cout << "val: " << pRe->isOpticalDuplicate << endl;
// }
pRe->isOpticalDuplicate = true;
zzhopticalSet.insert(pRe->read1IndexInFile);
zzhopticalSet.insert(pRe->read2IndexInFile);
zzhopticalArr.push_back(pRe->read1IndexInFile);
zzhopticalArr.push_back(pRe->read2IndexInFile);
} else {
pRe->isOpticalDuplicate = false;
zzhopticalSet.erase(pRe->read1IndexInFile);
zzhopticalSet.erase(pRe->read2IndexInFile);
}
}
return opticalDuplicates;
}
/**
* 记录光学原因造成的冗余
*/
void trackOpticalDuplicates(vector<const ReadEnds *> &readEndsArr, const ReadEnds *pBestRe) {
bool hasFR = false, hasRF = false;
int prevOpticalDupNum = 0;
// Check to see if we have a mixture of FR/RF
for (auto pRe : readEndsArr) {
if (pRe->isOpticalDuplicate)
++prevOpticalDupNum;
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
hasFR = true;
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
hasRF = true;
}
// Check if we need to partition since the orientations could have changed
int nOpticalDup;
if (hasFR && hasRF) { // need to track them independently
vector<const ReadEnds *> trackOpticalDuplicatesF;
vector<const ReadEnds *> trackOpticalDuplicatesR;
// Split into two lists: first of pairs and second of pairs,
// since they must have orientation and same starting end
for (auto pRe: readEndsArr) {
if (ReadEnds::FR == pRe->orientationForOpticalDuplicates)
trackOpticalDuplicatesF.push_back(pRe);
else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates)
trackOpticalDuplicatesR.push_back(pRe);
else
cerr << "Found an unexpected orientation: " << pRe->orientation << endl;
}
// track the duplicates
int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe);
int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe);
nOpticalDup = nOpticalDupF + nOpticalDupR;
} else { // No need to partition
nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe);
}
// trackDuplicateCounts
gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup;
if (readEndsArr.size() > nOpticalDup)
gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup;
if (nOpticalDup)
gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1;
gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup - prevOpticalDupNum;
//gMetrics.OpticalDuplicatesByLibraryId += nOpticalDup;
// cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl;
// cerr << (zzhtestnum++) << " " << readEndsArr.size() << ":" << nOpticalDup << endl;
}