2024-12-15 03:20:35 +08:00
|
|
|
|
/*
Description: The read-ends structs are mainly used for marking duplicates
             (redundant reads); they also carry some physical information
             about the sequencing process of each read.

Copyright : All rights reserved by ICT

Author : Zhang Zhonghai
Date : 2023/11/3
*/
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdint.h>
#include <stdlib.h>

#include <algorithm>
#include <cstddef>
#include <functional>
#include <tuple>
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Physical location information about a cluster. Every field defaults to
 * NO_VALUE (-1) when the information is unavailable.
 *
 * According to the original notes, tile should only hold non-zero positive
 * integers once set.
 *
 * NOTE(review): the original comments disagreed on whether x and y may be
 * negative, and described x/y as ints that cannot overflow within a HiSeqX
 * tile. The fields are in fact int16_t (the int32_t declarations were
 * commented out), so that guarantee does not hold here - confirm that all
 * coordinates fit into 16 bits.
 */
struct PhysicalLocation {
    /** Sentinel meaning "value not available". constexpr so that odr-use from
     *  a header-only definition is well-formed under C++17. */
    static constexpr int NO_VALUE = -1;

    int16_t tile = NO_VALUE;
    // int32_t x = -1;
    // int32_t y = -1;
    int16_t x = NO_VALUE;
    int16_t y = NO_VALUE;
};

/**
 * Holds all read-end information for a fragment or a read pair, like Picard's
 * ReadEndsForMarkDuplicates (per the original author's note).
 */
struct ReadEnds : PhysicalLocation {
    /** Orientation codes: single-end (F, R) and paired-end (FF, FR, RR, RF). */
    static constexpr int8_t F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5;

    /* Data retained from the BAM record */
    bool read1FirstOfPair = true;

    /* ---- Members taken from ReadEnds ----
     * Little struct-like class to hold read pair (and fragment) end data for
     * duplicate marking. */
    // int16_t libraryId; // unused: multiple samples are not considered
    int8_t orientation = -1;
    int32_t read1ReferenceIndex = -1;
    int32_t read1Coordinate = -1;
    int32_t read2ReferenceIndex = -1;
    // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported)
    int32_t read2Coordinate = -1;

    /* Additional information used to detect optical dupes */
    // int16_t readGroup = -1;  // a BAM file produced by alignment usually has
    //                          // a single read group (normal or tumor)
    /** For optical duplicate detection the orientation matters with regard to
     *  the 1st or 2nd end of a mate. */
    int8_t orientationForOpticalDuplicates = -1;
    /** A *transient* flag marking this read end as being an optical duplicate. */
    bool isOpticalDuplicate = false;

    /* ---- Members taken from ReadEndsForMarkDuplicates ----
     * Little struct-like class to hold read pair (and fragment) end data for
     * MarkDuplicatesWithMateCigar. */
    int16_t score = 0;
    int64_t read1IndexInFile = -1;
    int64_t read2IndexInFile = -1;
    int64_t duplicateSetSize = -1;

    /* ---- Members of ReadEndsForMarkDuplicatesWithBarcodes (apparently unused) ---- */
    // int32_t barcode = 0;         // primary barcode for this read (and pair)
    // int32_t readOneBarcode = 0;  // read one barcode, 0 if not present
    // int32_t readTwoBarcode = 0;  // read two barcode, 0 if not present or not paired

    /* ---- Members added by zzh ---- */
    /** Key generated from the position information, i.e.
     *  (int64_t)tid << MAX_CONTIG_LEN_SHIFT | (int64_t)pos. */
    int64_t posKey = -1;

    /** Operation counter used for some checks, because a read end may be
     *  processed more than once (e.g. when tasks overlap). */
    int oprateTime = 0;

    /**
     * Derives the overall pair orientation from the strands of both reads.
     *
     * @param read1NegativeStrand true if read 1 maps to the negative strand
     * @param read2NegativeStrand true if read 2 maps to the negative strand
     * @return one of FF, FR, RF, RR
     */
    static int8_t GetOrientationByte(bool read1NegativeStrand, bool read2NegativeStrand) {
        if (read1NegativeStrand) {
            if (read2NegativeStrand)
                return RR;
            return RF;
        }
        if (read2NegativeStrand)
            return FR;
        return FF;
    }

    /**
     * Tells whether two read ends share the same duplicate key.
     *
     * Read 1 reference index, read 1 coordinate and orientation are always
     * compared; read 2 reference index and coordinate only when compareRead2
     * is true.
     */
    static bool AreComparableForDuplicates(const ReadEnds &lhs, const ReadEnds &rhs, bool compareRead2) {
        const bool sameRead1 = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
                               lhs.read1Coordinate == rhs.read1Coordinate && lhs.orientation == rhs.orientation;
        if (!sameRead1)
            return false;
        if (!compareRead2)
            return true;
        return lhs.read2ReferenceIndex == rhs.read2ReferenceIndex && lhs.read2Coordinate == rhs.read2Coordinate;
    }

    /** True when the orientation is the single-end forward code F. */
    bool IsPositiveStrand() const { return orientation == F; }

    /** True when a read 2 reference index has been set (i.e. it is not -1). */
    bool IsPaired() const { return read2ReferenceIndex != -1; }

    /** True when the orientation is the single-end reverse code R. */
    bool IsNegativeStrand() const { return orientation == R; }

    /**
     * Ordering that matches AreComparableForDuplicates: returns true when a
     * sorts before b by (read1ReferenceIndex, read1Coordinate, orientation)
     * and, only if compareRead2 is set, by (read2ReferenceIndex,
     * read2Coordinate) as tie-breakers.
     *
     * Fields are compared directly rather than by subtraction so that extreme
     * values cannot overflow and flip the result.
     */
    static inline bool ReadLittleThan(const ReadEnds &a, const ReadEnds &b, bool compareRead2 = false) {
        const auto read1KeyA = std::tie(a.read1ReferenceIndex, a.read1Coordinate, a.orientation);
        const auto read1KeyB = std::tie(b.read1ReferenceIndex, b.read1Coordinate, b.orientation);
        if (!compareRead2 || read1KeyA != read1KeyB)
            return read1KeyA < read1KeyB;
        return std::tie(a.read2ReferenceIndex, a.read2Coordinate) <
               std::tie(b.read2ReferenceIndex, b.read2Coordinate);
    }

    /** Pair ordering (read 2 included); needed when looking up all read ends
     *  at a given position. */
    static bool PairsLittleThan(const ReadEnds &lhs, const ReadEnds &rhs) { return ReadLittleThan(lhs, rhs, true); }

    /**
     * Full ordering: lexicographic over read 1 reference/coordinate,
     * orientation, read 2 reference/coordinate, tile, x, y, and finally the
     * file indexes of read 1 and read 2.
     *
     * The 64-bit file indexes are compared at full width; narrowing their
     * difference to int could yield zero or the wrong sign.
     */
    bool operator<(const ReadEnds &o) const {
        return std::tie(read1ReferenceIndex, read1Coordinate, orientation, read2ReferenceIndex, read2Coordinate,
                        tile, x, y, read1IndexInFile, read2IndexInFile) <
               std::tie(o.read1ReferenceIndex, o.read1Coordinate, o.orientation, o.read2ReferenceIndex,
                        o.read2Coordinate, o.tile, o.x, o.y, o.read1IndexInFile, o.read2IndexInFile);
    }
};
|
|
|
|
|
|
|
|
|
|
|
|
struct ReadEndsHash {
|
|
|
|
|
|
std::size_t operator()(const ReadEnds &o) const { return std::hash<int64_t>()(o.read1IndexInFile); }
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct ReadEndsEqual {
|
|
|
|
|
|
bool operator()(const ReadEnds &o1, const ReadEnds &o2) const { return o1.read1IndexInFile == o2.read1IndexInFile; }
|
|
|
|
|
|
};
|