226 lines
8.9 KiB
C++
226 lines
8.9 KiB
C++
/*
|
||
Description: 解析read的name中的信息,比如tile, x, y等
|
||
|
||
Copyright : All rights reserved by ICT
|
||
|
||
Author : Zhang Zhonghai
|
||
Date : 2023/11/6
|
||
*/
|
||
#ifndef READ_NAME_PARSER_H_
|
||
#define READ_NAME_PARSER_H_
|
||
|
||
#include <common/utils/util.h>
|
||
#include <stdint.h>
|
||
|
||
#include <string>
|
||
|
||
#include "read_ends.h"
|
||
// #include <regex>
|
||
#include <boost/regex.hpp>
|
||
|
||
// using std::regex;
|
||
using boost::cmatch;
|
||
using boost::regex;
|
||
using std::string;
|
||
|
||
/**
|
||
* Provides access to the physical location information about a cluster.
|
||
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile
|
||
* should only allow non-zero positive integers, x and y coordinates may be
|
||
* negative. 非线程安全
|
||
*/
|
||
struct ReadNameParser {
|
||
/**
|
||
* The read name regular expression (regex) is used to extract three pieces
|
||
* of information from the read name: tile, x location, and y location. Any
|
||
* read name regex should parse the read name to produce these and only
|
||
* these values. An example regex is:
|
||
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
||
* which assumes that fields in the read name are delimited by ':' and the
|
||
* last three fields correspond to the tile, x and y locations, ignoring any
|
||
* trailing non-digit characters.
|
||
*
|
||
* The default regex is optimized for fast parsing (see {@link
|
||
* #getLastThreeFields(String, char, int[])}) by searching for the last
|
||
* three fields, ignoring any trailing non-digit characters, assuming the
|
||
* delimiter ':'. This should consider correctly read names where we have 5
|
||
* or 7 field with the last three fields being tile/x/y, as is the case for
|
||
* the majority of read names produced by Illumina technology.
|
||
*/
|
||
const string DEFAULT_READ_NAME_REGEX =
|
||
"(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
||
|
||
string readNameStored = "";
|
||
PhysicalLocation physicalLocationStored;
|
||
int tmpLocationFields[3]; // for optimization of addLocationInformation
|
||
bool useOptimizedDefaultParsing = true; // was the regex default?
|
||
string readNameRegex = DEFAULT_READ_NAME_REGEX;
|
||
regex readNamePattern;
|
||
bool warnedAboutRegexNotMatching = true;
|
||
|
||
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
||
ReadNameParser(const string &strReadNameRegex)
|
||
: ReadNameParser(strReadNameRegex, true) {}
|
||
ReadNameParser(const string &strReadNameRegex, bool isWarn) {
|
||
readNameRegex = strReadNameRegex;
|
||
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||
useOptimizedDefaultParsing = true;
|
||
else
|
||
useOptimizedDefaultParsing = false;
|
||
readNamePattern =
|
||
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||
warnedAboutRegexNotMatching = isWarn;
|
||
}
|
||
|
||
/* 重新设置readNameRegex */
|
||
void SetReadNameRegex(const string &strReadNameRegex) {
|
||
readNameRegex = strReadNameRegex;
|
||
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
||
useOptimizedDefaultParsing = true;
|
||
else
|
||
useOptimizedDefaultParsing = false;
|
||
readNamePattern =
|
||
boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
||
// readNamePattern = strReadNameRegex;
|
||
}
|
||
|
||
/* 添加测序时候的tile x y 信息 */
|
||
bool AddLocationInformation(const string &readName, PhysicalLocation *loc) {
|
||
if (!(readName == readNameStored)) {
|
||
if (ReadLocationInformation(readName, loc)) {
|
||
readNameStored = readName;
|
||
physicalLocationStored = *loc;
|
||
return true;
|
||
}
|
||
// return false if read name cannot be parsed
|
||
return false;
|
||
} else {
|
||
*loc = physicalLocationStored;
|
||
return true;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Method used to extract tile/x/y from the read name and add it to the
|
||
* PhysicalLocationShort so that it can be used later to determine optical
|
||
* duplication
|
||
*
|
||
* @param readName the name of the read/cluster
|
||
* @param loc the object to add tile/x/y to
|
||
* @return true if the read name contained the information in parsable form,
|
||
* false otherwise
|
||
*/
|
||
bool ReadLocationInformation(const string &readName,
|
||
PhysicalLocation *loc) {
|
||
try {
|
||
// Optimized version if using the default read name regex (== used on purpose):
|
||
if (useOptimizedDefaultParsing) {
|
||
const int fields = getLastThreeFields(readName, ':');
|
||
if (!(fields == 5 || fields == 7)) {
|
||
if (warnedAboutRegexNotMatching) {
|
||
Warn(
|
||
"Default READ_NAME_REGEX '%s' did not match read "
|
||
"name '%s'."
|
||
"You may need to specify a READ_NAME_REGEX in "
|
||
"order to correctly identify optical duplicates. "
|
||
"Note that this message will not be emitted again "
|
||
"even if other read names do not match the regex.",
|
||
readNameRegex.c_str(), readName.c_str());
|
||
warnedAboutRegexNotMatching = false;
|
||
}
|
||
return false;
|
||
}
|
||
loc->tile = (int16_t)tmpLocationFields[0];
|
||
loc->x = tmpLocationFields[1];
|
||
loc->y = tmpLocationFields[2];
|
||
return true;
|
||
} else if (readNameRegex.empty()) {
|
||
return false;
|
||
} else {
|
||
// Standard version that will use the regex
|
||
cmatch m;
|
||
// cout << "here1" << endl;
|
||
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
||
loc->tile = std::stoi(m[1].str());
|
||
loc->x = std::stoi(m[2].str());
|
||
loc->y = std::stoi(m[3].str());
|
||
return true;
|
||
} else {
|
||
if (warnedAboutRegexNotMatching) {
|
||
Warn(
|
||
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
||
"Your regex may not be correct. "
|
||
"Note that this message will not be emitted again "
|
||
"even if other read names do not match the regex.",
|
||
readNameRegex.c_str(), readName.c_str());
|
||
warnedAboutRegexNotMatching = false;
|
||
}
|
||
return false;
|
||
}
|
||
}
|
||
} catch (const std::runtime_error &e) {
|
||
if (warnedAboutRegexNotMatching) {
|
||
Warn(
|
||
"A field parsed out of a read name was expected to contain "
|
||
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
||
"%s; Error Msg: %s",
|
||
readNameRegex.c_str(), readName.c_str(), e.what());
|
||
warnedAboutRegexNotMatching = false;
|
||
}
|
||
} catch (...) {
|
||
if (warnedAboutRegexNotMatching) {
|
||
Warn(
|
||
"A field parsed out of a read name was expected to contain "
|
||
"an integer and did not. READ_NAME_REGEX: %s; Read name: "
|
||
"%s",
|
||
readNameRegex.c_str(), readName.c_str());
|
||
warnedAboutRegexNotMatching = false;
|
||
}
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
/**
|
||
* Given a string, splits the string by the delimiter, and returns the the
|
||
* last three fields parsed as integers. Parsing a field considers only a
|
||
* sequence of digits up until the first non-digit character. The three
|
||
* values are stored in the passed-in array.
|
||
*
|
||
* @throws NumberFormatException if any of the tokens that should contain
|
||
* numbers do not start with parsable numbers
|
||
*/
|
||
int getLastThreeFields(const string &readName, char delim) {
|
||
int tokensIdx = 2; // start at the last token
|
||
int numFields = 0;
|
||
int i, endIdx;
|
||
endIdx = readName.size();
|
||
// find the last three tokens only
|
||
for (i = (int)readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--) {
|
||
if (readName.at(i) == delim || 0 == i) {
|
||
numFields++;
|
||
const int startIdx = (0 == i) ? 0 : (i + 1);
|
||
// cout << readName << endl;
|
||
tmpLocationFields[tokensIdx] =
|
||
std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
||
tokensIdx--;
|
||
endIdx = i;
|
||
}
|
||
}
|
||
// continue to find the # of fields
|
||
while (0 <= i) {
|
||
if (readName.at(i) == delim || 0 == i)
|
||
numFields++;
|
||
i--;
|
||
}
|
||
if (numFields < 3) {
|
||
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] =
|
||
-1;
|
||
numFields = -1;
|
||
}
|
||
|
||
return numFields;
|
||
}
|
||
};
|
||
|
||
#endif |