#include "md_funcs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dup_metrics.h" #include "markdups_arg.h" #include "shared_args.h" using std::cerr; using std::endl; using std::map; using std::set; using std::unordered_map; using std::vector; static int zzhtestnum = 0; /* 清除key位置的数据 */ void clearIdxAtPos(int64_t key, map> *pmsIdx) { auto &msIdx = *pmsIdx; if (msIdx.find(key) != msIdx.end()) msIdx[key].clear(); // 清除该位点的冗余结果 } /* 删除key位置的数据 */ void delIdxAtPos(int64_t key, map> *pmsIdx) { auto &msIdx = *pmsIdx; if (msIdx.find(key) != msIdx.end()) msIdx.erase(key); } /* * 计算read的分数 */ int16_t computeDuplicateScore(BamWrap &bw) { int16_t score = 0; switch (g_mdArg.DUPLICATE_SCORING_STRATEGY) { case ns_md::SUM_OF_BASE_QUALITIES: // two (very) long reads worth of high-quality bases can go over // Short.MAX_VALUE/2 and risk overflow. score += (int16_t)min(bw.GetSumOfBaseQualities(), INT16_MAX / 2); break; case ns_md::TOTAL_MAPPED_REFERENCE_LENGTH: if (!bw.GetReadUnmappedFlag()) // no need to remember the score since this scoring mechanism is // symmetric score = (int16_t)min(bw.GetReferenceLength(), INT16_MAX / 2); break; case ns_md::RANDOM: // The RANDOM score gives the same score to both reads so that they get // filtered together. it's not critical do use the readName since the // scores from both ends get added, but it seem to be clearer this way. score += (short)(Murmur3::Instance().HashUnencodedChars(bw.query_name()) & 0b11111111111111); // subtract Short.MIN_VALUE/4 from it to end up with a number between // 0 and Short.MAX_VALUE/2. This number can be then discounted in case // the read is not passing filters. We need to stay far from overflow so // that when we add the two scores from the two read mates we do not // overflow since that could cause us to chose a failing read-pair // instead of a passing one. score -= INT16_MIN / 4; default: break; } // make sure that filter-failing records are heavily discounted. (the // discount can happen twice, once for each mate, so need to make sure we do // not subtract more than Short.MIN_VALUE overall.) score += bw.GetReadFailsVendorQualityCheckFlag() ? (int16_t)(INT16_MIN / 2) : 0; return score; } /* * Builds a read ends object that represents a single read. * 用来表示一个read的特征结构 */ void buildReadEnds(BamWrap &bw, int64_t index, ReadNameParser &rnParser, ReadEnds *pKey) { auto &k = *pKey; auto &bc = bw.b->core; k.read1FirstOfPair = bw.GetFirstOfPairFlag(); k.read1ReferenceIndex = bc.tid; k.read1Coordinate = (bc.flag & BAM_FREVERSE) ? bw.GetUnclippedEnd() : bw.GetUnclippedStart(); k.orientation = (bc.flag & BAM_FREVERSE) ? ReadEnds::R : ReadEnds::F; k.read1IndexInFile = index; k.score = computeDuplicateScore(bw); // Doing this lets the ends object know that it's part of a pair if (bw.GetReadPairedFlag() && !bw.GetMateUnmappedFlag()) { k.read2ReferenceIndex = bc.mtid; } // Fill in the location information for optical duplicates rnParser.AddLocationInformation(bw.query_name(), pKey); // cout << k.tile << ' ' << k.x << ' ' << k.y << endl; // 计算位置key k.posKey = BamWrap::bam_global_pos(k.read1ReferenceIndex, k.read1Coordinate); // << 1 | k.orientation; } /** * Takes a list of ReadEndsForMarkDuplicates objects and identify the * representative read based on quality score. For all members of the duplicate * set, add the read1 index-in-file of the representative read to the records of * the first and second in a pair. This value becomes is used for the 'DI' tag. */ void addRepresentativeReadIndex(vector &vpRe) {} /* 处理一组pairend的readends，标记冗余 */ void markDuplicatePairs(int64_t posKey, vector &vpRe, DupContainer *dupIdx, DupContainer *opticalDupIdx) { if (vpRe.size() < 2) { if (vpRe.size() == 1) { // addSingletonToCount(libraryIdGenerator); } return; } // cout << "pos:" << posKey + 1 << ";size:" << vpRe.size() << endl; auto &vDupIdx = dupIdx->AtPos(posKey); auto &vOpticalDupIdx = opticalDupIdx->AtPos(posKey); int maxScore = 0; const ReadEnds *pBest = nullptr; /** All read ends should have orientation FF, FR, RF, or RR **/ for (auto pe : vpRe) // 找分数最高的readend { if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; } } if (!g_mdArg.READ_NAME_REGEX.empty()) // 检查光学冗余 { // trackOpticalDuplicates } for (auto pe : vpRe) // 对非best read标记冗余 { if (pe != pBest) // 非best { vDupIdx.push_back(pe->read1IndexInFile); // 添加read1 if (pe->read2IndexInFile != pe->read1IndexInFile) vDupIdx.push_back(pe->read2IndexInFile); // 添加read2 } } if (g_mdArg.TAG_DUPLICATE_SET_MEMBERS) { addRepresentativeReadIndex(vpRe); } } /* 处理一组非paired的readends，标记冗余 */ void markDuplicateFragments(int64_t posKey, vector &vpRe, bool containsPairs, DupContainer *dupIdx) { auto &vDupIdx = dupIdx->AtPos(posKey); if (containsPairs) { for (auto pe : vpRe) { if (!pe->IsPaired()) { vDupIdx.push_back(pe->read1IndexInFile); } } } else { int maxScore = 0; const ReadEnds *pBest = nullptr; for (auto pe : vpRe) { if (pe->score > maxScore || pBest == nullptr) { maxScore = pe->score; pBest = pe; } } for (auto pe : vpRe) { if (pe != pBest) { vDupIdx.push_back(pe->read1IndexInFile); } } } } /* 处理位于某个坐标的pairend reads */ void handlePairs(int64_t posKey, vector &readEnds, vector &vpCache, DupContainer *dupIdx, DupContainer *opticalDupIdx) { if (readEnds.size() > 1) { // 有潜在的冗余 vpCache.clear(); // std::sort(readEnds.begin(), readEnds.end()); const ReadEnds *pReadEnd = nullptr; for (auto &re : readEnds) { if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, true)) // 跟前一个一样 vpCache.push_back(&re); // 处理一个潜在的冗余组 else { markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); // 不一样 vpCache.clear(); vpCache.push_back(&re); pReadEnd = &re; } } markDuplicatePairs(posKey, vpCache, dupIdx, opticalDupIdx); } } /* 处理位于某个坐标的 reads */ void handleFrags(int64_t posKey, vector &readEnds, vector &vpCache, DupContainer *dupIdx) { if (readEnds.size() > 1) // 有潜在的冗余 { vpCache.clear(); // std::sort(readEnds.begin(), readEnds.end()); const ReadEnds *pReadEnd = nullptr; bool containsPairs = false; bool containsFrags = false; for (auto &re : readEnds) { if (pReadEnd != nullptr && ReadEnds::AreComparableForDuplicates(*pReadEnd, re, false)) { vpCache.push_back(&re); containsPairs = containsPairs || re.IsPaired(); containsFrags = containsFrags || !re.IsPaired(); } else { if (vpCache.size() > 1 && containsFrags) { markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx); } vpCache.clear(); vpCache.push_back(&re); pReadEnd = &re; containsPairs = re.IsPaired(); containsFrags = !re.IsPaired(); } } if (vpCache.size() > 1 && containsFrags) { markDuplicateFragments(posKey, vpCache, containsPairs, dupIdx); } } } /* 对找到的pairend read end添加一些信息 */ void modifyPairedEnds(const ReadEnds &fragEnd, ReadEnds *pPairedEnds) { auto &pairedEnds = *pPairedEnds; int64_t bamIdx = fragEnd.read1IndexInFile; const int matesRefIndex = fragEnd.read1ReferenceIndex; const int matesCoordinate = fragEnd.read1Coordinate; // Set orientationForOpticalDuplicates, which always goes by the first then // the second end for the strands. NB: must do this before updating the // orientation later. if (fragEnd.read1FirstOfPair) { pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte( fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); } else { pairedEnds.orientationForOpticalDuplicates = ReadEnds::GetOrientationByte( pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand()); } // If the other read is actually later, simply add the other read's data as // read2, else flip the reads if (matesRefIndex > pairedEnds.read1ReferenceIndex || (matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate)) { pairedEnds.read2ReferenceIndex = matesRefIndex; pairedEnds.read2Coordinate = matesCoordinate; pairedEnds.read2IndexInFile = bamIdx; pairedEnds.orientation = ReadEnds::GetOrientationByte(pairedEnds.orientation == ReadEnds::R, fragEnd.IsNegativeStrand()); // if the two read ends are in the same position, pointing in opposite // directions, the orientation is undefined and the procedure above will // depend on the order of the reads in the file. To avoid this, we set // it explicitly (to FR): if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex && pairedEnds.read2Coordinate == pairedEnds.read1Coordinate && pairedEnds.orientation == ReadEnds::RF) { pairedEnds.orientation = ReadEnds::FR; } } else { pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex; pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; pairedEnds.read1ReferenceIndex = matesRefIndex; pairedEnds.read1Coordinate = matesCoordinate; pairedEnds.read1IndexInFile = bamIdx; pairedEnds.orientation = ReadEnds::GetOrientationByte( fragEnd.IsNegativeStrand(), pairedEnds.orientation == ReadEnds::R); pairedEnds.posKey = fragEnd.posKey; } pairedEnds.score += fragEnd.score; } static inline bool closeEnough(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) { return lhs != rhs && // no comparing an object to itself (checked using object identity)! (lhs->tile != ReadEnds::NO_VALUE) && (rhs->tile != ReadEnds::NO_VALUE) && // no comparing objects without locations lhs->tile == rhs->tile && // and the same tile abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance; } static inline bool closeEnoughShort(const ReadEnds *lhs, const ReadEnds *rhs, const int distance) { return lhs != rhs && abs(lhs->x - rhs->x) <= distance && abs(lhs->y - rhs->y) <= distance; } /** * Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of * one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for * optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered * optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the * read appeared in the input list of physical locations. * * @param list a list of reads that are determined to be duplicates of one another * @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be * annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not * contained within the list! (always not be null!) * @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates */ static void findOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe, vector *pOpticalDuplicateFlags) { const int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100; const int DEFAULT_MAX_DUPLICATE_SET_SIZE = 300000; vector &opticalDuplicateFlags = *pOpticalDuplicateFlags; // opticalDuplicateFlags.push_back(true); // for test int len = readEndsArr.size(); // If there is only one or zero reads passed in (so there are obviously no optical duplicates), // or if there are too many reads (so we don't want to try to run this expensive n^2 algorithm), // then just return an array of all false if (len < 2 || len > DEFAULT_MAX_DUPLICATE_SET_SIZE) { return; } if (len >= 4) { /** * Compute the optical duplicates correctly in the case where the duplicate group could end up with transitive * optical duplicates * getOpticalDuplicatesFlagWithGraph */ // Make a graph where the edges are reads that lie within the optical duplicate pixel distance from each other, // we will then use the union-find algorithm to cluster the graph and find optical duplicate groups Graph opticalDistanceRelationGraph; unordered_map> tileRGmap; int keeperIndex = -1; for (int i = 0; i < readEndsArr.size(); ++i) { const ReadEnds *currentLoc = readEndsArr[i]; if (currentLoc == pBestRe) keeperIndex = i; if (currentLoc->tile != ReadEnds::NO_VALUE) { int key = currentLoc->tile; // 只处理一个样本，所以只有一个read group tileRGmap[key].push_back(i); } opticalDistanceRelationGraph.addNode(i); } // Since because finding adjacent optical duplicates is an O(n^2) operation, we can subdivide the input into its // readgroups in order to reduce the amount of redundant checks across readgroups between reads. // fillGraphFromAGroup for (auto &entry : tileRGmap) { auto &groupList = entry.second; if (groupList.size() > 1) { for (int i = 0; i < groupList.size(); ++i) { int iIndex = groupList[i]; const ReadEnds *currentLoc = readEndsArr[iIndex]; for (int j = i + 1; j < groupList.size(); ++j) { int jIndex = groupList[j]; const ReadEnds *other = readEndsArr[jIndex]; if (closeEnoughShort(currentLoc, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) opticalDistanceRelationGraph.addEdge(iIndex, jIndex); } } } } // Keep a map of the reads and their cluster assignments unordered_map opticalDuplicateClusterMap; opticalDistanceRelationGraph.cluster(&opticalDuplicateClusterMap); unordered_map clusterToRepresentativeRead; int keeperCluster = -1; // Specially mark the keeper as specifically not a duplicate if it exists if (keeperIndex >= 0) { keeperCluster = opticalDuplicateClusterMap[keeperIndex]; clusterToRepresentativeRead[keeperCluster] = keeperIndex; } for (auto &entry: opticalDuplicateClusterMap) { int recordIndex = entry.first; int recordAssignedCluster = entry.second; // If its not the first read we've seen for this cluster, mark it as an optical duplicate auto repReadItr = clusterToRepresentativeRead.find(recordAssignedCluster); if (repReadItr != clusterToRepresentativeRead.end() && recordIndex != keeperIndex) { const ReadEnds *representativeLoc = readEndsArr[repReadItr->second]; const ReadEnds *currentRecordLoc = readEndsArr[recordIndex]; // If not in the keeper cluster, then keep the minX -> minY valued duplicate (note the tile must be // equal for reads to cluster together) if (!(keeperIndex >= 0 && recordAssignedCluster == keeperCluster) && (currentRecordLoc->x < representativeLoc->x || (currentRecordLoc->x == representativeLoc->x && currentRecordLoc->y < representativeLoc->y))) { // Mark the old min as an optical duplicate, and save the new min opticalDuplicateFlags[repReadItr->second] = true; clusterToRepresentativeRead[recordAssignedCluster] = recordIndex; } else { // If a smaller read has already been visited, mark the test read as an optical duplicate opticalDuplicateFlags[recordIndex] = true; } } else { clusterToRepresentativeRead[recordAssignedCluster] = recordIndex; } } } else { /** * Compute optical duplicates quickly in the standard case where we know that there won't be any transitive * distances to worry about. Note, this is guaranteed to be correct when there are at most 2x reads from a * readgroup or 3x with the keeper present * getOpticalDuplicatesFlagFast */ // First go through and compare all the reads to the keeper for (int i = 0; i < len; ++i) { const ReadEnds *other = readEndsArr[i]; opticalDuplicateFlags[i] = closeEnough(pBestRe, other, DEFAULT_OPTICAL_DUPLICATE_DISTANCE); } // Now go through and do each pairwise comparison not involving the actualKeeper for (int i = 0; i < len; ++i) { const ReadEnds *lhs = readEndsArr[i]; if (lhs == pBestRe) // no comparisons to actualKeeper since those are all handled above continue; for (int j = i + 1; j < len; ++j) { const ReadEnds *rhs = readEndsArr[j]; if (rhs == pBestRe) // no comparisons to actualKeeper since those are all handled above continue; if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j]) continue; // both already marked, no need to check if (closeEnough(lhs, rhs, DEFAULT_OPTICAL_DUPLICATE_DISTANCE)) { // At this point we want to mark either lhs or rhs as duplicate. Either could have been marked // as a duplicate of the keeper (but not both - that's checked above), so be careful about which // one to now mark as a duplicate. int index = opticalDuplicateFlags[j] ? i : j; opticalDuplicateFlags[index] = true; } } } } } /** * Looks through the set of reads and identifies how many of the duplicates are * in fact optical duplicates, and stores the data in the instance level histogram. * * We expect only reads with FR or RF orientations, not a mixture of both. * * In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end * of the mate. In optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF * when we order orientation by the first mate sequenced (read #1 of the pair). */ static int checkOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { vector opticalDuplicateFlags(readEndsArr.size(), false); // find OpticalDuplicates findOpticalDuplicates(readEndsArr, pBestRe, &opticalDuplicateFlags); int opticalDuplicates = 0; for (int i = 0; i < opticalDuplicateFlags.size(); ++i) { if (opticalDuplicateFlags[i]) { ++opticalDuplicates; ReadEnds *pRe = const_cast(readEndsArr[i]); pRe->isOpticalDuplicate = true; } } if (opticalDuplicates > 0) gMetrics.OpticalDuplicatesByLibraryId += opticalDuplicates; return opticalDuplicates; } /** * 记录光学原因造成的冗余 */ void trackOpticalDuplicates(vector &readEndsArr, const ReadEnds *pBestRe) { bool hasFR = false, hasRF = false; // Check to see if we have a mixture of FR/RF for (auto pRe : readEndsArr) { if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) hasFR = true; else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) hasRF = true; } // Check if we need to partition since the orientations could have changed int nOpticalDup; if (hasFR && hasRF) { // need to track them independently vector trackOpticalDuplicatesF; vector trackOpticalDuplicatesR; // Split into two lists: first of pairs and second of pairs, // since they must have orientation and same starting end for (auto pRe: readEndsArr) { if (ReadEnds::FR == pRe->orientationForOpticalDuplicates) trackOpticalDuplicatesF.push_back(pRe); else if (ReadEnds::RF == pRe->orientationForOpticalDuplicates) trackOpticalDuplicatesR.push_back(pRe); else cerr << "Found an unexpected orientation: " << pRe->orientation << endl; } // track the duplicates int nOpticalDupF = checkOpticalDuplicates(trackOpticalDuplicatesF, pBestRe); int nOpticalDupR = checkOpticalDuplicates(trackOpticalDuplicatesR, pBestRe); nOpticalDup = nOpticalDupF + nOpticalDupR; } else { // No need to partition nOpticalDup = checkOpticalDuplicates(readEndsArr, pBestRe); } // trackDuplicateCounts gMetrics.DuplicateCountHist += readEndsArr.size() - nOpticalDup; if (readEndsArr.size() > nOpticalDup) gMetrics.NonOpticalDuplicateCountHist += readEndsArr.size() - nOpticalDup; if (nOpticalDup) gMetrics.OpticalDuplicatesCountHist += nOpticalDup + 1; // cout << "zzh optical:" << (++zzhtestnum) << "\t" << readEndsArr.size() << "\t" << nOpticalDup << endl; }