218 lines
8.3 KiB
C
218 lines
8.3 KiB
C
|
|
/*
|
|||
|
|
Description: 解析read的name中的信息,比如tile, x, y等
|
|||
|
|
|
|||
|
|
Copyright : All right reserved by ICT
|
|||
|
|
|
|||
|
|
Author : Zhang Zhonghai
|
|||
|
|
Date : 2023/11/6
|
|||
|
|
*/
|
|||
|
|
#ifndef READ_NAME_PARSER_H_
|
|||
|
|
#define READ_NAME_PARSER_H_
|
|||
|
|
|
|||
|
|
#include "read_ends.h"
#include <common/utils/util.h>

#include <stdint.h>
#include <stdexcept>
#include <string>
// #include <regex>
#include <boost/regex.hpp>
|
|||
|
|
|
|||
|
|
// using std::regex;
|
|||
|
|
using boost::cmatch;
|
|||
|
|
using boost::regex;
|
|||
|
|
using std::string;
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Provides access to the physical location information about a cluster.
|
|||
|
|
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
|
|||
|
|
* non-zero positive integers, x and y coordinates may be negative.
|
|||
|
|
* 非线程安全
|
|||
|
|
*/
|
|||
|
|
struct ReadNameParser
|
|||
|
|
{
|
|||
|
|
/**
|
|||
|
|
* The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
|
|||
|
|
* and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
|
|||
|
|
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
|
|||
|
|
* which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
|
|||
|
|
* ignoring any trailing non-digit characters.
|
|||
|
|
*
|
|||
|
|
* The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
|
|||
|
|
* three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
|
|||
|
|
* where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
|
|||
|
|
* Illumina technology.
|
|||
|
|
*/
|
|||
|
|
const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
|
|||
|
|
|
|||
|
|
string readNameStored = "";
|
|||
|
|
PhysicalLocation physicalLocationStored;
|
|||
|
|
int tmpLocationFields[3]; // for optimization of addLocationInformation
|
|||
|
|
bool useOptimizedDefaultParsing = true; // was the regex default?
|
|||
|
|
string readNameRegex = DEFAULT_READ_NAME_REGEX;
|
|||
|
|
regex readNamePattern;
|
|||
|
|
bool warnedAboutRegexNotMatching = true;
|
|||
|
|
|
|||
|
|
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
|
|||
|
|
ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
|
|||
|
|
ReadNameParser(const string &strReadNameRegex, bool isWarn)
|
|||
|
|
{
|
|||
|
|
readNameRegex = strReadNameRegex;
|
|||
|
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
|||
|
|
useOptimizedDefaultParsing = true;
|
|||
|
|
else
|
|||
|
|
useOptimizedDefaultParsing = false;
|
|||
|
|
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
|||
|
|
warnedAboutRegexNotMatching = isWarn;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/* 重新设置readNameRegex */
|
|||
|
|
void SetReadNameRegex(const string &strReadNameRegex)
|
|||
|
|
{
|
|||
|
|
readNameRegex = strReadNameRegex;
|
|||
|
|
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
|
|||
|
|
useOptimizedDefaultParsing = true;
|
|||
|
|
else
|
|||
|
|
useOptimizedDefaultParsing = false;
|
|||
|
|
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
|
|||
|
|
// readNamePattern = strReadNameRegex;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/* 添加测序时候的tile x y 信息 */
|
|||
|
|
bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
|
|||
|
|
{
|
|||
|
|
if (!(readName == readNameStored))
|
|||
|
|
{
|
|||
|
|
if (ReadLocationInformation(readName, loc))
|
|||
|
|
{
|
|||
|
|
readNameStored = readName;
|
|||
|
|
physicalLocationStored = *loc;
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
// return false if read name cannot be parsed
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
*loc = physicalLocationStored;
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
|
|||
|
|
* can be used later to determine optical duplication
|
|||
|
|
*
|
|||
|
|
* @param readName the name of the read/cluster
|
|||
|
|
* @param loc the object to add tile/x/y to
|
|||
|
|
* @return true if the read name contained the information in parsable form, false otherwise
|
|||
|
|
*/
|
|||
|
|
bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
|
|||
|
|
{
|
|||
|
|
try {
|
|||
|
|
// Optimized version if using the default read name regex (== used on purpose):
|
|||
|
|
if (useOptimizedDefaultParsing)
|
|||
|
|
{
|
|||
|
|
const int fields = getLastThreeFields(readName, ':');
|
|||
|
|
if (!(fields == 5 || fields == 7))
|
|||
|
|
{
|
|||
|
|
if (warnedAboutRegexNotMatching)
|
|||
|
|
{
|
|||
|
|
Warn(
|
|||
|
|
"Default READ_NAME_REGEX '%s' did not match read name '%s'."
|
|||
|
|
"You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
|
|||
|
|
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
|||
|
|
readNameRegex.c_str(), readName.c_str());
|
|||
|
|
warnedAboutRegexNotMatching = false;
|
|||
|
|
}
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
loc->tile = (int16_t)tmpLocationFields[0];
|
|||
|
|
loc->x = tmpLocationFields[1];
|
|||
|
|
loc->y = tmpLocationFields[2];
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
else if (readNameRegex.empty())
|
|||
|
|
{
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
// Standard version that will use the regex
|
|||
|
|
cmatch m;
|
|||
|
|
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
|
|||
|
|
loc->tile = std::stoi(m[1].str());
|
|||
|
|
loc->x = std::stoi(m[2].str());
|
|||
|
|
loc->y = std::stoi(m[3].str());
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
if (warnedAboutRegexNotMatching)
|
|||
|
|
{
|
|||
|
|
Warn(
|
|||
|
|
"READ_NAME_REGEX '%s' did not match read name '%s'."
|
|||
|
|
"Your regex may not be correct. "
|
|||
|
|
"Note that this message will not be emitted again even if other read names do not match the regex.",
|
|||
|
|
readNameRegex.c_str(), readName.c_str());
|
|||
|
|
warnedAboutRegexNotMatching = false;
|
|||
|
|
}
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
catch (const std::runtime_error &e)
|
|||
|
|
{
|
|||
|
|
if (warnedAboutRegexNotMatching)
|
|||
|
|
{
|
|||
|
|
Warn(
|
|||
|
|
"A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
|
|||
|
|
readNameRegex.c_str(), readName.c_str(), e.what());
|
|||
|
|
warnedAboutRegexNotMatching = false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
|
|||
|
|
* considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
|
|||
|
|
*
|
|||
|
|
* @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
|
|||
|
|
*/
|
|||
|
|
int getLastThreeFields(const string &readName, char delim)
|
|||
|
|
{
|
|||
|
|
int tokensIdx = 2; // start at the last token
|
|||
|
|
int numFields = 0;
|
|||
|
|
int i, endIdx;
|
|||
|
|
endIdx = readName.size();
|
|||
|
|
// find the last three tokens only
|
|||
|
|
for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
|
|||
|
|
{
|
|||
|
|
if (readName.at(i) == delim || 0 == i)
|
|||
|
|
{
|
|||
|
|
numFields++;
|
|||
|
|
const int startIdx = (0 == i) ? 0 : (i + 1);
|
|||
|
|
tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
|
|||
|
|
tokensIdx--;
|
|||
|
|
endIdx = i;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// continue to find the # of fields
|
|||
|
|
while (0 <= i)
|
|||
|
|
{
|
|||
|
|
if (readName.at(i) == delim || 0 == i)
|
|||
|
|
numFields++;
|
|||
|
|
i--;
|
|||
|
|
}
|
|||
|
|
if (numFields < 3)
|
|||
|
|
{
|
|||
|
|
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
|
|||
|
|
numFields = -1;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return numFields;
|
|||
|
|
}
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
#endif
|