101 lines
3.4 KiB
C++
101 lines
3.4 KiB
C++
#include <mex.h>
|
||
#include <mat.h>
|
||
#include <iostream>
|
||
#include <algorithm>
|
||
#include <vector>
|
||
#include <string>
|
||
#include <unordered_set>
|
||
#include <ctime>
|
||
using std::cout;
|
||
using std::endl;
|
||
using namespace std;
|
||
|
||
#define STRING_BUF_SIZE 204800
|
||
|
||
/* 读取abs */
|
||
void GetAbstract(const mxArray* pMxAbs, vector<string>& vAbs) {
|
||
int rowNum = (int)mxGetM(pMxAbs);
|
||
int colNum = (int)mxGetN(pMxAbs);
|
||
char *strBuf = new char[STRING_BUF_SIZE];
|
||
|
||
vAbs.resize(rowNum * colNum);
|
||
for (int i = 0; i < rowNum; ++i) {
|
||
for (int j = 0; j < colNum; ++j) {
|
||
mxArray* pCell = mxGetCell(pMxAbs, j * rowNum + i);
|
||
if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) {
|
||
cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
|
||
delete[]strBuf;
|
||
return;
|
||
}
|
||
vAbs[i * colNum + j] = strBuf;
|
||
}
|
||
}
|
||
delete[]strBuf;
|
||
}
|
||
|
||
/*
|
||
nlhs:输出参数数目(Number Left - hand side),等号左边
|
||
plhs:指向输出参数的指针(Point Left - hand side),等号左边
|
||
nrhs:输入参数数目(Number Right - hand side),等号右边
|
||
prhs:指向输入参数的指针(Point Right - hand side),等号右边。要注意prhs是const的指针数组,即不能改变其指向内容。
|
||
*/
|
||
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
|
||
//cout << "WordSplit" << endl;
|
||
//cout << nlhs << '\t' << nrhs << endl;
|
||
if (nrhs != 1) {
|
||
cout << "1 arguments should be given for this function!" << endl;
|
||
return;
|
||
}
|
||
|
||
/* 读取参数中的摘要信息 */
|
||
vector<string> vAbstract; // 读取abs1, 然后分割成一个一个的单词
|
||
GetAbstract(prhs[0], vAbstract);
|
||
|
||
/* 将摘要信息分割成一个一个的词汇 */
|
||
clock_t begin, finish;
|
||
begin = clock();
|
||
unordered_set<char> usWordChars; // 能组成单词的字符,要不要考虑数字?原版matlab是提取了数字的
|
||
for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
|
||
for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
|
||
for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
|
||
usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
|
||
vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
|
||
for (int i = 0; i < vAbstract.size(); i++) {
|
||
auto& strAbs = vAbstract[i];
|
||
// 遍历摘要字符串的每一个字符,取出每一个单词
|
||
vector<string>& vWord = vvWordMtx[i];
|
||
if (strAbs.size() == 0) continue; // 摘要信息为空,跳过(一般不会出现这个情况)
|
||
int wordStartPos = 0;
|
||
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
|
||
wordStartPos++;
|
||
for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
|
||
if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符
|
||
vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
|
||
wordStartPos = curPos + 1; // 找下一个词语起始位置
|
||
while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
|
||
wordStartPos++;
|
||
curPos = wordStartPos; // 循环会自动加1
|
||
}
|
||
}
|
||
}
|
||
|
||
/* 将结果写入返回参数 */
|
||
/* 将结果写入返回值 */
|
||
if (nlhs > 0) {
|
||
mxArray* pCellMtx = mxCreateCellMatrix(1, vvWordMtx.size());
|
||
for (int i = 0; i < vvWordMtx.size(); ++i) {
|
||
mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size());
|
||
for (int j = 0; j < vvWordMtx[i].size(); ++j) {
|
||
mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str());
|
||
mxSetCell(pChildCellMtx, j, mxStr);
|
||
}
|
||
mxSetCell(pCellMtx, i, pChildCellMtx);
|
||
}
|
||
plhs[0] = pCellMtx; // 赋值给返回值
|
||
}
|
||
|
||
|
||
finish = clock();
|
||
//cout << "split abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||
|
||
} |