picard_cpp/src/sam/utils/read_name_parser.h

218 lines
8.3 KiB
C
Raw Normal View History

/*
Description: Parses the tile, x and y location information out of a read name
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2023/11/6
*/
#ifndef READ_NAME_PARSER_H_
#define READ_NAME_PARSER_H_
#include "read_ends.h"
#include <common/utils/util.h>
#include <stdint.h>
#include <stdexcept>
#include <string>
// #include <regex>
#include <boost/regex.hpp>
// using std::regex;
using boost::cmatch;
using boost::regex;
using std::string;
/**
* Provides access to the physical location information about a cluster.
* All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow
* non-zero positive integers, x and y coordinates may be negative.
* Not thread-safe: an instance caches the last parsed read name and reuses an internal scratch buffer.
*/
struct ReadNameParser
{
/**
* The read name regular expression (regex) is used to extract three pieces of information from the read name: tile, x location,
* and y location. Any read name regex should parse the read name to produce these and only these values. An example regex is:
* (?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$
* which assumes that fields in the read name are delimited by ':' and the last three fields correspond to the tile, x and y locations,
* ignoring any trailing non-digit characters.
*
* The default regex is optimized for fast parsing (see {@link #getLastThreeFields(String, char, int[])}) by searching for the last
* three fields, ignoring any trailing non-digit characters, assuming the delimiter ':'. This should consider correctly read names
* where we have 5 or 7 field with the last three fields being tile/x/y, as is the case for the majority of read names produced by
* Illumina technology.
*/
const string DEFAULT_READ_NAME_REGEX = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
string readNameStored = "";
PhysicalLocation physicalLocationStored;
int tmpLocationFields[3]; // for optimization of addLocationInformation
bool useOptimizedDefaultParsing = true; // was the regex default?
string readNameRegex = DEFAULT_READ_NAME_REGEX;
regex readNamePattern;
bool warnedAboutRegexNotMatching = true;
ReadNameParser() : ReadNameParser(DEFAULT_READ_NAME_REGEX) {}
ReadNameParser(const string &strReadNameRegex) : ReadNameParser(strReadNameRegex, true) {}
ReadNameParser(const string &strReadNameRegex, bool isWarn)
{
readNameRegex = strReadNameRegex;
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
useOptimizedDefaultParsing = true;
else
useOptimizedDefaultParsing = false;
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
warnedAboutRegexNotMatching = isWarn;
}
/* 重新设置readNameRegex */
void SetReadNameRegex(const string &strReadNameRegex)
{
readNameRegex = strReadNameRegex;
if (strReadNameRegex == DEFAULT_READ_NAME_REGEX)
useOptimizedDefaultParsing = true;
else
useOptimizedDefaultParsing = false;
readNamePattern = boost::regex(strReadNameRegex, boost::regex_constants::optimize);
// readNamePattern = strReadNameRegex;
}
/* 添加测序时候的tile x y 信息 */
bool AddLocationInformation(const string &readName, PhysicalLocation *loc)
{
if (!(readName == readNameStored))
{
if (ReadLocationInformation(readName, loc))
{
readNameStored = readName;
physicalLocationStored = *loc;
return true;
}
// return false if read name cannot be parsed
return false;
}
else
{
*loc = physicalLocationStored;
return true;
}
}
/**
* Method used to extract tile/x/y from the read name and add it to the PhysicalLocationShort so that it
* can be used later to determine optical duplication
*
* @param readName the name of the read/cluster
* @param loc the object to add tile/x/y to
* @return true if the read name contained the information in parsable form, false otherwise
*/
bool ReadLocationInformation(const string &readName, PhysicalLocation *loc)
{
try {
// Optimized version if using the default read name regex (== used on purpose):
if (useOptimizedDefaultParsing)
{
const int fields = getLastThreeFields(readName, ':');
if (!(fields == 5 || fields == 7))
{
if (warnedAboutRegexNotMatching)
{
Warn(
"Default READ_NAME_REGEX '%s' did not match read name '%s'."
"You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. "
"Note that this message will not be emitted again even if other read names do not match the regex.",
readNameRegex.c_str(), readName.c_str());
warnedAboutRegexNotMatching = false;
}
return false;
}
loc->tile = (int16_t)tmpLocationFields[0];
loc->x = tmpLocationFields[1];
loc->y = tmpLocationFields[2];
return true;
}
else if (readNameRegex.empty())
{
return false;
}
else
{
// Standard version that will use the regex
cmatch m;
if (boost::regex_match(readName.c_str(), m, readNamePattern)) {
loc->tile = std::stoi(m[1].str());
loc->x = std::stoi(m[2].str());
loc->y = std::stoi(m[3].str());
return true;
}
else
{
if (warnedAboutRegexNotMatching)
{
Warn(
"READ_NAME_REGEX '%s' did not match read name '%s'."
"Your regex may not be correct. "
"Note that this message will not be emitted again even if other read names do not match the regex.",
readNameRegex.c_str(), readName.c_str());
warnedAboutRegexNotMatching = false;
}
return false;
}
}
}
catch (const std::runtime_error &e)
{
if (warnedAboutRegexNotMatching)
{
Warn(
"A field parsed out of a read name was expected to contain an integer and did not. READ_NAME_REGEX: %s; Read name: %s; Error Msg: %s",
readNameRegex.c_str(), readName.c_str(), e.what());
warnedAboutRegexNotMatching = false;
}
}
return true;
}
/**
* Given a string, splits the string by the delimiter, and returns the the last three fields parsed as integers. Parsing a field
* considers only a sequence of digits up until the first non-digit character. The three values are stored in the passed-in array.
*
* @throws NumberFormatException if any of the tokens that should contain numbers do not start with parsable numbers
*/
int getLastThreeFields(const string &readName, char delim)
{
int tokensIdx = 2; // start at the last token
int numFields = 0;
int i, endIdx;
endIdx = readName.size();
// find the last three tokens only
for (i = readName.size() - 1; 0 <= i && 0 <= tokensIdx; i--)
{
if (readName.at(i) == delim || 0 == i)
{
numFields++;
const int startIdx = (0 == i) ? 0 : (i + 1);
tmpLocationFields[tokensIdx] = std::stoi(readName.substr(startIdx, endIdx - startIdx));
tokensIdx--;
endIdx = i;
}
}
// continue to find the # of fields
while (0 <= i)
{
if (readName.at(i) == delim || 0 == i)
numFields++;
i--;
}
if (numFields < 3)
{
tmpLocationFields[0] = tmpLocationFields[1] = tmpLocationFields[2] = -1;
numFields = -1;
}
return numFields;
}
};
#endif