diff --git a/MexFunc/IsWordInDic.cpp b/MexFunc/IsWordInDic.cpp index 385e671..7c1cfe3 100644 --- a/MexFunc/IsWordInDic.cpp +++ b/MexFunc/IsWordInDic.cpp @@ -91,7 +91,7 @@ mxArray* writeToMatDouble(const double *data, int rowNum, int colNum) { 输入: 1. wd 文献摘要中的单词,二维cell 2. dic 字典,按字母序排序的,一维cell -3. threshold 保留超过阈值的列 +[3]. flagPrint 输出: x 一维int(double)类型,表示在wd的每一行单词中,dic中是否有单词匹配上(匹配后,对应的坐标为设为1,否则为0),所有匹配的个数(统计每一行的匹配个数) dic每一个单词在wd所有行中出现的次数 @@ -108,63 +108,56 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { Read2DWord(prhs[0], vvWd); Read1DWord(prhs[1], vDic); int rowNum = vvWd.size(); + int colNum = vDic.size(); - int threshold = 5; + int flagPrint = 0; if (nrhs > 2) { - double* pThreshold = (double*)mxGetData(prhs[2]); - threshold = (int)pThreshold[0]; - if (threshold < 5) threshold = 5; + double* pData = (double*)mxGetData(prhs[2]); + flagPrint = (int)pData[0]; } finish = clock(); - cout << "Load data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + if (flagPrint == 2) cout << "Load data time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; - vector vXSum(vDic.size()); + vector vX(rowNum * colNum); // 一维表示二维 /* 统计dicr字典中,每个单词在wd中出现的次数 */ mid = clock(); - unordered_map umWordPos; - for (int i = 0; i < vDic.size(); ++i) umWordPos[vDic[i]] = i; // 记录单词位置 + unordered_map> umWordPos; + for (int i = 0; i < vDic.size(); ++i) umWordPos[vDic[i]].push_back(i); // dic中可能存在重复,记录单词位置 unordered_set usPos; // 多次出现在wd中的单词,只统计一次,这是原matlab代码的功能,是否需要修改? 
vector> vusX(rowNum); // 保存每一行中非零元的坐标 int row = 0; + // vector vSum(colNum); for (auto& vWd : vvWd) { - auto& usPos = vusX[row++]; + auto& usPos = vusX[row]; for (auto& word : vWd) { auto itr = umWordPos.find(word); if (itr != umWordPos.end()) { - usPos.insert(itr->second); + for (auto pos : itr->second) + usPos.insert(pos); } } for (auto idx : usPos) { - vXSum[idx] += 1; + vX[idx * rowNum + row] = 1; // matlab 列优先存储模式 + // vSum[idx] += 1; } + ++row; } finish = clock(); - cout << "Calc word occurrence time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + if (flagPrint == 2) cout << "Calc word occurrence time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; - /* 计算xs */ - mid = clock(); - int colNum = 0; - vector vColIdx; - for (int i = 0; i < vXSum.size(); ++i) { - if (vXSum[i] >= threshold) { - vColIdx.push_back(i); - } - } - colNum = vColIdx.size(); + // for (auto& w : vDic) cout << umWordPos[w] << endl; - vector vXsData(rowNum * colNum); - - for (int i = 0; i < rowNum; ++i) { - for (int j = 0; j < colNum; ++j) { - if (vusX[i].find(vColIdx[j]) != vusX[i].end()) { - vXsData[j * rowNum + i] = 1; - } - } - } - finish = clock(); - cout << "Calc xs time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + // for (auto& w : vvWd[260]) cout << w << '\t' << w.size() << endl; + //for (auto& w : vDic) cout << w << '\t' << w.size() << endl; + // vector vSum(colNum); + // for (int i = 0; i < rowNum; ++i) { + // for (int j = 0; j < colNum; ++j) { + // vSum[j] += vX[j * rowNum + i]; + // } + // } + // for (auto val : vSum) cout << val << endl; // 测试输出 // cout << rowNum << '\t' << colNum << endl; // ofstream ofs1("d:\\result_xsum.txt"); @@ -186,16 +179,14 @@ void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) { /* 写入结果 */ mid = clock(); - if (nlhs > 0) { - plhs[0] = writeToMatDouble(vXSum.data(), 1, vXSum.size()); + if (nlhs > 0) { // vvX + plhs[0] = writeToMatDouble(vX.data(), rowNum, colNum); } - if (nlhs > 1) 
{ // xs - plhs[1] = writeToMatDouble(vXsData.data(), rowNum, colNum); - } - finish = clock(); - cout << "Write result time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; - cout << "Calc word occurrence in Dic Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + finish = clock(); + if (flagPrint == 2) cout << "Write result time: " << (double)(finish - mid) / CLOCKS_PER_SEC << " s" << endl; + + if (flagPrint) cout << "Calc word occurrence in Dic Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; } // 供c++调试用 diff --git a/MexFunc/main.cpp b/MexFunc/main.cpp index 876c07e..a355ccf 100644 --- a/MexFunc/main.cpp +++ b/MexFunc/main.cpp @@ -13,12 +13,12 @@ int main(int argc, const char** argv) const mxArray* prhs[argReserveNum]; /* SortDedup */ - int nlhs = 1, nrhs = 2; - MATFile* pwdMat = matOpen("D:\\tmp\\wd_small.mat", "r"); - prhs[0] = matGetVariable(pwdMat, "wd"); - prhs[1] = mxCreateString("D:\\Twirls\\runtime\\output_1.dat"); - prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(prhs[2]) = 2; + // int nlhs = 1, nrhs = 2; + // MATFile* pwdMat = matOpen("D:\\tmp\\wd_small.mat", "r"); + // prhs[0] = matGetVariable(pwdMat, "wd"); + // prhs[1] = mxCreateString("D:\\Twirls\\runtime\\output_1.dat"); + // prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL); + // *mxGetPr(prhs[2]) = 2; /* CalcEntropy */ // int nlhs = 2, nrhs = 4; @@ -32,12 +32,14 @@ int main(int argc, const char** argv) // *mxGetPr(prhs[3]) = 2; /* IsWordInDic */ - // MATFile* pwdMat, * pdicMat; - // int nlhs = 2, nrhs = 2; - // pwdMat = matOpen("D:\\tmp\\wd_large.mat", "r"); - // pdicMat = matOpen("D:\\tmp\\G_dc_large.mat", "r"); - // prhs[0] = matGetVariable(pwdMat, "wd"); //鑾峰彇.mat鏂囦欢閲岄潰鍚嶄负matrixName鐨勭煩闃 - // prhs[1] = matGetVariable(pdicMat, "dc"); + MATFile* pwdMat, * pdicMat; + int nlhs = 1, nrhs = 3; + pwdMat = matOpen("D:\\tmp\\ws_small.mat", "r"); + pdicMat = matOpen("D:\\tmp\\x_small.mat", "r"); + prhs[0] = matGetVariable(pwdMat,
"ws"); //获取.mat文件里面名为matrixName的矩阵 + prhs[1] = matGetVariable(pdicMat, "x"); + prhs[2] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(prhs[2]) = 2; /* ClusterRandSim */ // int nlhs = 2, nrhs = 4;