twirls/MexFunc/WordSplit.cpp

101 lines
3.4 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
#include <unordered_set>
#include <ctime>
// Stream names used for the diagnostic messages printed by this file.
using std::cout;
using std::endl;
// NOTE(review): this directive already makes the two using-declarations above
// redundant; it is what lets the rest of the file write vector/string unqualified.
using namespace std;
// Size, in bytes, of the temporary character buffer that GetAbstract passes to
// mxGetString as the destination capacity for each cell's text.
#define STRING_BUF_SIZE 204800
/* 读取abs */
void GetAbstract(const mxArray* pMxAbs, vector<string>& vAbs) {
int rowNum = (int)mxGetM(pMxAbs);
int colNum = (int)mxGetN(pMxAbs);
char *strBuf = new char[STRING_BUF_SIZE];
vAbs.resize(rowNum * colNum);
for (int i = 0; i < rowNum; ++i) {
for (int j = 0; j < colNum; ++j) {
mxArray* pCell = mxGetCell(pMxAbs, j * rowNum + i);
if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) {
cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
delete[]strBuf;
return;
}
vAbs[i * colNum + j] = strBuf;
}
}
delete[]strBuf;
}
/*
nlhs输出参数数目(Number Left - hand side),等号左边
plhs指向输出参数的指针(Point Left - hand side),等号左边
nrhs输入参数数目(Number Right - hand side),等号右边
prhs指向输入参数的指针(Point Right - hand side)等号右边。要注意prhs是const的指针数组即不能改变其指向内容。
*/
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
//cout << "WordSplit" << endl;
//cout << nlhs << '\t' << nrhs << endl;
if (nrhs != 1) {
cout << "1 arguments should be given for this function!" << endl;
return;
}
/* 读取参数中的摘要信息 */
vector<string> vAbstract; // 读取abs1, 然后分割成一个一个的单词
GetAbstract(prhs[0], vAbstract);
/* 将摘要信息分割成一个一个的词汇 */
clock_t begin, finish;
begin = clock();
unordered_set<char> usWordChars; // 能组成单词的字符要不要考虑数字原版matlab是提取了数字的
for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
for (int i = 0; i < vAbstract.size(); i++) {
auto& strAbs = vAbstract[i];
// 遍历摘要字符串的每一个字符,取出每一个单词
vector<string>& vWord = vvWordMtx[i];
if (strAbs.size() == 0) continue; // 摘要信息为空,跳过(一般不会出现这个情况)
int wordStartPos = 0;
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
wordStartPos++;
for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符
vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
wordStartPos = curPos + 1; // 找下一个词语起始位置
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
wordStartPos++;
curPos = wordStartPos; // 循环会自动加1
}
}
}
/* 将结果写入返回参数 */
/* 将结果写入返回值 */
if (nlhs > 0) {
mxArray* pCellMtx = mxCreateCellMatrix(1, vvWordMtx.size());
for (int i = 0; i < vvWordMtx.size(); ++i) {
mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size());
for (int j = 0; j < vvWordMtx[i].size(); ++j) {
mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str());
mxSetCell(pChildCellMtx, j, mxStr);
}
mxSetCell(pCellMtx, i, pChildCellMtx);
}
plhs[0] = pCellMtx; // 赋值给返回值
}
finish = clock();
//cout << "split abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}