#include #include #include #include #include #include #include #include using std::cout; using std::endl; using namespace std; #define STRING_BUF_SIZE 204800 /* 读取abs */ void GetAbstract(const mxArray* pMxAbs, vector& vAbs) { int rowNum = (int)mxGetM(pMxAbs); int colNum = (int)mxGetN(pMxAbs); char *strBuf = new char[STRING_BUF_SIZE]; vAbs.resize(rowNum * colNum); for (int i = 0; i < rowNum; ++i) { for (int j = 0; j < colNum; ++j) { mxArray* pCell = mxGetCell(pMxAbs, j * rowNum + i); if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) { cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; delete[]strBuf; return; } vAbs[i * colNum + j] = strBuf; } } delete[]strBuf; } /* nlhs:输出参数数目(Number Left - hand side),等号左边 plhs:指向输出参数的指针(Point Left - hand side),等号左边 nrhs:输入参数数目(Number Right - hand side),等号右边 prhs:指向输入参数的指针(Point Right - hand side),等号右边。要注意prhs是const的指针数组,即不能改变其指向内容。 */ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { //cout << "WordSplit" << endl; //cout << nlhs << '\t' << nrhs << endl; if (nrhs != 1) { cout << "1 arguments should be given for this function!" << endl; return; } /* 读取参数中的摘要信息 */ vector vAbstract; // 读取abs1, 然后分割成一个一个的单词 GetAbstract(prhs[0], vAbstract); /* 将摘要信息分割成一个一个的词汇 */ clock_t begin, finish; begin = clock(); unordered_set usWordChars; // 能组成单词的字符,要不要考虑数字?原版matlab是提取了数字的 for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9 usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-'); vector > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数 for (int i = 0; i < vAbstract.size(); i++) { auto& strAbs = vAbstract[i]; // 遍历摘要字符串的每一个字符,取出每一个单词 vector& vWord = vvWordMtx[i]; if (strAbs.size() == 0) continue; // 摘要信息为空,跳过(一般不会出现这个情况) int wordStartPos = 0; while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end()) wordStartPos++; for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) { if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符 vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos)); wordStartPos = curPos + 1; // 找下一个词语起始位置 while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end()) wordStartPos++; curPos = wordStartPos; // 循环会自动加1 } } } /* 将结果写入返回参数 */ /* 将结果写入返回值 */ if (nlhs > 0) { mxArray* pCellMtx = mxCreateCellMatrix(1, vvWordMtx.size()); for (int i = 0; i < vvWordMtx.size(); ++i) { mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size()); for (int j = 0; j < vvWordMtx[i].size(); ++j) { mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str()); mxSetCell(pChildCellMtx, j, mxStr); } mxSetCell(pCellMtx, i, pChildCellMtx); } plhs[0] = pCellMtx; // 赋值给返回值 } finish = clock(); //cout << "split abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; }