// twirls/MexFunc/WordSplit.cpp

#include <mex.h>
#include <mat.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <string>
#include <unordered_set>
#include <ctime>
using std::cout;
using std::endl;
using namespace std;

#define STRING_BUF_SIZE 204800

/* 读取abs */
void GetAbstract(const mxArray* pMxAbs, vector<string>& vAbs) {
	int rowNum = (int)mxGetM(pMxAbs);
	int colNum = (int)mxGetN(pMxAbs);
	char *strBuf = new char[STRING_BUF_SIZE];

	vAbs.resize(rowNum * colNum);
	for (int i = 0; i < rowNum; ++i) {
		for (int j = 0; j < colNum; ++j) {
			mxArray* pCell = mxGetCell(pMxAbs, j * rowNum + i);
			if (mxGetString(pCell, strBuf, STRING_BUF_SIZE) != 0) {
				cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl;
				delete[]strBuf;
				return;
			}
			vAbs[i * colNum + j] = strBuf;
		}
	}
	delete[]strBuf;
}

/*
nlhs：输出参数数目(Number Left - hand side)，等号左边
plhs：指向输出参数的指针(Point Left - hand side)，等号左边
nrhs：输入参数数目(Number Right - hand side)，等号右边
prhs：指向输入参数的指针(Point Right - hand side)，等号右边。要注意prhs是const的指针数组，即不能改变其指向内容。
*/
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
	//cout << "WordSplit" << endl;
	//cout << nlhs << '\t' << nrhs << endl;
	if (nrhs != 1) {
		cout << "1 arguments should be given for this function!" << endl;
		return;
	}

	/* 读取参数中的摘要信息 */
	vector<string> vAbstract; // 读取abs1, 然后分割成一个一个的单词
	GetAbstract(prhs[0], vAbstract);

	/* 将摘要信息分割成一个一个的词汇 */
	clock_t begin, finish;
	begin = clock();
	unordered_set<char> usWordChars; // 能组成单词的字符，要不要考虑数字？原版matlab是提取了数字的
	for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
	for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
	for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
	usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
	vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
	for (int i = 0; i < vAbstract.size(); i++) {
		auto& strAbs = vAbstract[i];
		// 遍历摘要字符串的每一个字符，取出每一个单词
		vector<string>& vWord = vvWordMtx[i];
		if (strAbs.size() == 0) continue; // 摘要信息为空，跳过（一般不会出现这个情况）
		int wordStartPos = 0;
		while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
			wordStartPos++;
		for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) {
			if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符
				vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos));
				wordStartPos = curPos + 1; // 找下一个词语起始位置
				while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end())
					wordStartPos++;
				curPos = wordStartPos; // 循环会自动加1
			}
		}
	}

	/* 将结果写入返回参数 */
	/* 将结果写入返回值 */
	if (nlhs > 0) {
		mxArray* pCellMtx = mxCreateCellMatrix(1, vvWordMtx.size());
		for (int i = 0; i < vvWordMtx.size(); ++i) {
			mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size());
			for (int j = 0; j < vvWordMtx[i].size(); ++j) {
				mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str());
				mxSetCell(pChildCellMtx, j, mxStr);
			}
			mxSetCell(pCellMtx, i, pChildCellMtx);
		}
		plhs[0] = pCellMtx; // 赋值给返回值
	}


	finish = clock();
	//cout << "split abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

}