2023-11-09 21:07:58 +08:00
|
|
|
|
/*
|
|
|
|
|
|
Description: Parses the information embedded in a read name, such as tile, x and y.
|
|
|
|
|
|
|
|
|
|
|
|
Copyright : All right reserved by ICT
|
|
|
|
|
|
|
|
|
|
|
|
Author : Zhang Zhonghai
|
|
|
|
|
|
Date : 2023/11/6
|
|
|
|
|
|
*/
|
|
|
|
|
|
#ifndef READ_NAME_PARSER_H_
|
|
|
|
|
|
#define READ_NAME_PARSER_H_
|
|
|
|
|
|
|
|
|
|
|
|
#include <common/utils/util.h>
#include <stdint.h>

#include <stdexcept>
#include <string>

#include "read_ends.h"
// #include <regex>
#include <boost/regex.hpp>
|
|
|
|
|
|
|
|
|
|
|
|
// using std::regex;
|
|
|
|
|
|
using boost::cmatch;
|
|
|
|
|
|
using boost::regex;
|
|
|
|
|
|
using std::string;
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Provides access to the physical location information about a cluster.
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile
|
|
|
|
|
|
* should only allow non-zero positive integers, x and y coordinates may be
|
|
|
|
|
|
* negative. 非线程安全
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*/
|
2024-08-22 02:28:36 +08:00
|
|
|
|
struct ReadNameParser {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
/**
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* The read name regular expression (regex) is used to extract three pieces
|
|
|
|
|
|
* of information from the read name: tile, x location, and y location. Any
|
|
|
|
|
|
* read name regex should parse the read name to produce these and only
|
|
|
|
|
|
* these values. An example regex is:
|
2023-11-09 21:07:58 +08:00
|
|
|
|
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* which assumes that fields in the read name are delimited by ':' and the
|
|
|
|
|
|
* last three fields correspond to the tile, x and y locations, ignoring any
|
|
|
|
|
|
* trailing non-digit characters.
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* The default regex is optimized for fast parsing (see {@link
|
|
|
|
|
|
* #getLastThreeFields(String, char, int[])}) by searching for the last
|
|
|
|
|
|
* three fields, ignoring any trailing non-digit characters, assuming the
|
|
|
|
|
|
* delimiter ':'. This should consider correctly read names where we have 5
|
|
|
|
|
|
* or 7 field with the last three fields being tile/x/y, as is the case for
|
|
|
|
|
|
* the majority of read names produced by Illumina technology.
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*/
|
2024-08-22 02:28:36 +08:00
|
|
|
|
const string DEFAULT_READ_NAME_REGEX =
|
|
|
|
|
|
"(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
2023-11-09 21:07:58 +08:00
|
|
|
|
|
|
|
|
|
|
string readNameStored = "";
|
|
|
|
|
|
PhysicalLocation physicalLocationStored;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
int tmpLocationFields[3]; // for optimization of addLocationInformation
|
|
|
|
|
|
bool useOptimizedDefaultParsing = true; // was the regex default?
|
2023-11-09 21:07:58 +08:00
|
|
|
|
string readNameRegex = DEFAULT_READ_NAME_REGEX;
|
|
|
|
|
|
regex readNamePattern;
|
|
|
|
|
|
bool warnedAboutRegexNotMatching = true;
|
|
|
|
|
|
|
|
|
|
|
|
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
2024-08-22 02:28:36 +08:00
|
|
|
|
ReadNameParser(const string &strReadNameRegex)
|
|
|
|
|
|
: ReadNameParser(strReadNameRegex, true) {}
|
|
|
|
|
|
ReadNameParser(const string &strReadNameRegex, bool isWarn) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
readNameRegex = strReadNameRegex;
|
|
|
|
|
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
|
|
|
|
|
useOptimizedDefaultParsing = true;
|
|
|
|
|
|
else
|
|
|
|
|
|
useOptimizedDefaultParsing = false;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
readNamePattern =
|
|
|
|
|
|
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
2023-11-09 21:07:58 +08:00
|
|
|
|
warnedAboutRegexNotMatching = isWarn;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 重新设置readNameRegex */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
void SetReadNameRegex(const string &strReadNameRegex) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
readNameRegex = strReadNameRegex;
|
|
|
|
|
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
|
|
|
|
|
useOptimizedDefaultParsing = true;
|
|
|
|
|
|
else
|
|
|
|
|
|
useOptimizedDefaultParsing = false;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
readNamePattern =
|
|
|
|
|
|
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// readNamePattern = strReadNameRegex;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* 添加测序时候的tile x y 信息 */
|
2024-08-22 02:28:36 +08:00
|
|
|
|
bool AddLocationInformation(const string &readName, PhysicalLocation *loc) {
|
|
|
|
|
|
if (!(readName == readNameStored)) {
|
|
|
|
|
|
if (ReadLocationInformation(readName, loc)) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
readNameStored = readName;
|
|
|
|
|
|
physicalLocationStored = *loc;
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
// return false if read name cannot be parsed
|
|
|
|
|
|
return false;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
} else {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*loc = physicalLocationStored;
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* Method used to extract tile/x/y from the read name and add it to the
|
|
|
|
|
|
* PhysicalLocationShort so that it can be used later to determine optical
|
|
|
|
|
|
* duplication
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*
|
|
|
|
|
|
* @param readName the name of the read/cluster
|
|
|
|
|
|
* @param loc the object to add tile/x/y to
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* @return true if the read name contained the information in parsable form,
|
|
|
|
|
|
* false otherwise
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*/
|
2024-08-22 02:28:36 +08:00
|
|
|
|
bool ReadLocationInformation(const string &readName,
|
|
|
|
|
|
PhysicalLocation *loc) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
try {
|
2024-08-25 00:08:38 +08:00
|
|
|
|
// Optimized version if using the default read name regex (== used on purpose):
|
2024-08-22 02:28:36 +08:00
|
|
|
|
if (useOptimizedDefaultParsing) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
const int fields = getLastThreeFields(readName, ':');
|
2024-08-22 02:28:36 +08:00
|
|
|
|
if (!(fields == 5 || fields == 7)) {
|
|
|
|
|
|
if (warnedAboutRegexNotMatching) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
Warn(
|
2024-08-22 02:28:36 +08:00
|
|
|
|
"Default READ_NAME_REGEX '%s' did not match read "
|
|
|
|
|
|
"name '%s'."
|
|
|
|
|
|
"You may need to specify a READ_NAME_REGEX in "
|
|
|
|
|
|
"order to correctly identify optical duplicates. "
|
|
|
|
|
|
"Note that this message will not be emitted again "
|
|
|
|
|
|
"even if other read names do not match the regex.",
|
2023-11-09 21:07:58 +08:00
|
|
|
|
readNameRegex.c_str(), readName.c_str());
|
|
|
|
|
|
warnedAboutRegexNotMatching = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
loc->tile = (int16_t)tmpLocationFields[0];
|
|
|
|
|
|
loc->x = tmpLocationFields[1];
|
|
|
|
|
|
loc->y = tmpLocationFields[2];
|
|
|
|
|
|
return true;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
} else if (readNameRegex.empty()) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
return false;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
} else {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
// Standard version that will use the regex
|
|
|
|
|
|
cmatch m;
|
|
|
|
|
|
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
|
|
|
|
|
loc->tile = std::stoi(m[1].str());
|
|
|
|
|
|
loc->x = std::stoi(m[2].str());
|
|
|
|
|
|
loc->y = std::stoi(m[3].str());
|
|
|
|
|
|
return true;
|
2024-08-22 02:28:36 +08:00
|
|
|
|
} else {
|
|
|
|
|
|
if (warnedAboutRegexNotMatching) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
Warn(
|
|
|
|
|
|
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
|
|
|
|
|
"Your regex may not be correct. "
|
2024-08-22 02:28:36 +08:00
|
|
|
|
"Note that this message will not be emitted again "
|
|
|
|
|
|
"even if other read names do not match the regex.",
|
2023-11-09 21:07:58 +08:00
|
|
|
|
readNameRegex.c_str(), readName.c_str());
|
|
|
|
|
|
warnedAboutRegexNotMatching = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2024-08-22 02:28:36 +08:00
|
|
|
|
} catch (const std::runtime_error &e) {
|
|
|
|
|
|
if (warnedAboutRegexNotMatching) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
Warn(
|
2024-08-22 02:28:36 +08:00
|
|
|
|
"A field parsed out of a read name was expected to contain "
|
|
|
|
|
|
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
|
|
|
|
|
"%s; Error Msg: %s",
|
2023-11-09 21:07:58 +08:00
|
|
|
|
readNameRegex.c_str(), readName.c_str(), e.what());
|
|
|
|
|
|
warnedAboutRegexNotMatching = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* Given a string, splits the string by the delimiter, and returns the the
|
|
|
|
|
|
* last three fields parsed as integers. Parsing a field considers only a
|
|
|
|
|
|
* sequence of digits up until the first non-digit character. The three
|
|
|
|
|
|
* values are stored in the passed-in array.
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*
|
2024-08-22 02:28:36 +08:00
|
|
|
|
* @throws NumberFormatException if any of the tokens that should contain
|
|
|
|
|
|
* numbers do not start with parsable numbers
|
2023-11-09 21:07:58 +08:00
|
|
|
|
*/
|
2024-08-22 02:28:36 +08:00
|
|
|
|
int getLastThreeFields(const string &readName, char delim) {
|
|
|
|
|
|
int tokensIdx = 2; // start at the last token
|
2023-11-09 21:07:58 +08:00
|
|
|
|
int numFields = 0;
|
|
|
|
|
|
int i, endIdx;
|
|
|
|
|
|
endIdx = readName.size();
|
|
|
|
|
|
// find the last three tokens only
|
2024-08-22 02:28:36 +08:00
|
|
|
|
for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) {
|
|
|
|
|
|
if (readName.at(i) == delim || 0 == i) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
numFields++;
|
|
|
|
|
|
const int startIdx = (0 == i) ? 0 : (i + 1);
|
2024-08-22 02:28:36 +08:00
|
|
|
|
tmpLocationFields[tokensIdx] =
|
|
|
|
|
|
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
2023-11-09 21:07:58 +08:00
|
|
|
|
tokensIdx--;
|
|
|
|
|
|
endIdx = i;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// continue to find the # of fields
|
2024-08-22 02:28:36 +08:00
|
|
|
|
while (0 <= i) {
|
2023-11-09 21:07:58 +08:00
|
|
|
|
if (readName.at(i) == delim || 0 == i)
|
|
|
|
|
|
numFields++;
|
|
|
|
|
|
i--;
|
|
|
|
|
|
}
|
2024-08-22 02:28:36 +08:00
|
|
|
|
if (numFields < 3) {
|
|
|
|
|
|
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] =
|
|
|
|
|
|
-1;
|
2023-11-09 21:07:58 +08:00
|
|
|
|
numFields = -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return numFields;
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|