From 31f0382cc1768098a72ad81a6b5b0341d03ab89c Mon Sep 17 00:00:00 2001 From: zzh Date: Fri, 22 Sep 2023 00:51:34 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BA=86=E4=BF=A1=E6=81=AF?= =?UTF-8?q?=E7=86=B5=E8=AE=A1=E7=AE=97=E7=9A=84=E7=A8=8B=E5=BA=8F=EF=BC=8C?= =?UTF-8?q?=E8=A7=A3=E5=86=B3=E4=BA=86bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + CommonLib/{readme.txt => readme.md} | 0 CppRun/calc_entropy.cpp | 102 ++++++++++++++++++++-------- CppRun/{readme.txt => readme.md} | 0 GMM/{readme.txt => readme.md} | 0 RandSim/{readme.txt => readme.md} | 0 6 files changed, 75 insertions(+), 29 deletions(-) rename CommonLib/{readme.txt => readme.md} (100%) rename CppRun/{readme.txt => readme.md} (100%) rename GMM/{readme.txt => readme.md} (100%) rename RandSim/{readme.txt => readme.md} (100%) diff --git a/.gitignore b/.gitignore index 33068a9..8ea8d51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +*.mat +*.txt x64/ eigen-3.4.0/ *.debug diff --git a/CommonLib/readme.txt b/CommonLib/readme.md similarity index 100% rename from CommonLib/readme.txt rename to CommonLib/readme.md diff --git a/CppRun/calc_entropy.cpp b/CppRun/calc_entropy.cpp index 404c3e9..ea85df5 100644 --- a/CppRun/calc_entropy.cpp +++ b/CppRun/calc_entropy.cpp @@ -133,24 +133,25 @@ bool ReadInfoFromMat(const string & filePath, vector >&vvDs, vect return true; } +// 将二维索引转成一维的索引 +inline int Get1DIndex(int colNum, int row, int col) { + return row * colNum + col; +} + /* 处理一个知识颗粒 */ -struct EntropyResult { // 存放每个文献对应的结果 - vector > vvEntropy; // 信息熵 - vector > vvTransEntropy; // 转置的信息熵 -}; struct ThreadParam { // 线程参数 fs::path matFilePath; + fs::path outFilePath; vector >* pvusWord; - EntropyResult* pRes; }; void ThreadProcessData(const ThreadParam& param) { const fs::path& matFilePath = param.matFilePath; - EntropyResult& res = *param.pRes; + const fs::path& outFilePath = param.outFilePath; vector >& vusWord = *param.pvusWord; - // 存放结果 - auto& hs = res.vvEntropy; - auto& hr = res.vvTransEntropy; + // 存放结果,用一维数组存放二维数据 + vector hs; + vector hr; vector > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵) vector > vvFr; // 词汇对应的频率 @@ -162,11 +163,10 @@ void ThreadProcessData(const ThreadParam& param) { // cout << vvDs.size() << '\t' << vvDs[0].size() << endl; const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量 const int numGroup = vvDs.size(); // ds包含的组数 - hs.resize(numGroup); - hr.resize(numLiterature); - for (int i = 0; i < numGroup; ++i) hs[i].resize(numLiterature); // resize会自动初始化 - for (int i = 0; i < numLiterature; ++i) hr[i].resize(numGroup); - for (int groupIdx = 0; groupIdx < vvDs.size(); ++groupIdx) { // 遍历知识颗粒中的每一组 + hs.resize(numGroup * numLiterature); + hr.resize(numLiterature * numGroup); + + for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组 vector& vDs = vvDs[groupIdx]; // 这一组ds vector& vFr = vvFr[groupIdx]; // frequency const int numWord = vDs.size(); // 这一组数据中包含的单词数量 @@ -176,27 +176,53 @@ void ThreadProcessData(const ThreadParam& param) { for (int j = 0; j < numWord; ++j) { if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过 vX[i][j] = 1; - // 对每个知识颗粒每一组数据,计算信息熵 - hs[groupIdx][i] -= vFr[j] * log2(vFr[j]); + if (groupIdx == 1 && i == 2) { + // cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl; + } } } } - // cout << vX[0][0] << endl; - - + // 找词汇的最高频率 + double maxFr = *max_element(vFr.begin(), vFr.end()); + // 将fr的数值规范化到(0,0.368)之间 + const double normalMax = 0.368; + for (auto& frVal : vFr) frVal = frVal * normalMax / maxFr; + maxFr = normalMax; + // 对每个知识颗粒每一组数据,计算信息熵 for (int i = 0; i < numLiterature; ++i) { - - if (vX[groupIdx][i] == 1) { - + for (int j = 0; j < numWord; ++j) { + if (vX[i][j] == 1) { + hs[Get1DIndex(numLiterature, groupIdx, i)] -= vFr[j] * log2(vFr[j]); + } } } // 找最高频词汇所在的索引位置 + vector vMaxPos; + int idx = 0; + for_each(vFr.begin(), vFr.end(), [&idx, maxFr, &vMaxPos](double val) { + if (val == maxFr) vMaxPos.push_back(idx); + idx++; + }); + + for (int i = 0; i < numLiterature; ++i) { + int cumulateX = 0; // 计算在最高频词汇处,x值的累加结果 + for (int j = 0; j < vMaxPos.size(); ++j) cumulateX += vX[i][vMaxPos[j]]; + if (cumulateX == vMaxPos.size()) { // 如果频率最高的词汇都出现在了文献中 + hr[Get1DIndex(numGroup,i, groupIdx)] = 1; // 应该是表示知识颗粒的这一组数据跟这篇文献相关性比较高 + } + } } + /* 将结果(hs和hr)写入每个知识颗粒的目录内 */ + MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w"); + SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature); + SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup); + matClose(pMatFile); } /* 程序入口 */ +// 运行例子:CalcEntropy d:\Twirls\runtime\ALS_test abs2class.mat d:\Twirls\runtime\pubmed_files\pubmed-multiplesc-set.mat hx_info.mat 12 word.mat void CalcEntropy(int argc, const char** argv) { // argv // 1. 知识颗粒的父目录名称 @@ -205,12 +231,13 @@ void CalcEntropy(int argc, const char** argv) { // 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件) // 5. 线程数量(可选) if (argc < 5) { - cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number])!" << endl; + cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl; return; } clock_t begin, finish; string parrentDir(argv[1]); // 知识颗粒的父目录名称 string wordMatSuffix(argv[2]); // 高频词汇矩阵对应的mat文件的后缀名(可以是全文件名,可以是文件名后缀,必须保证唯一) + fs::path outFileName(argv[4]); int numThread = 1; if (argc >= 5) numThread = atoi(argv[5]); if (numThread < 1) numThread = 1; @@ -224,11 +251,13 @@ void CalcEntropy(int argc, const char** argv) { cout << "PubMed Abstract info is null!" << endl; return; } - // 将摘要信息分割成一个一个的词汇 + /* 将摘要信息分割成一个一个的词汇 */ begin = clock(); unordered_set usWordChars; // 能组成单词的字符,要不要考虑数字?原版matlab是提取了数字的 for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z + for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9 + usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-'); vector > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数 vector > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表 for (int i = 0; i < vAbstract.size(); i++) { @@ -256,6 +285,22 @@ void CalcEntropy(int argc, const char** argv) { vusAbsWord[i].insert(upWord); } } + /* 将分割结果写入mat文件 */ + if (argc >= 6) { + MATFile* pMatFile = matOpen(argv[6], "w"); + mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size()); + for (int i = 0; i < vvWordMtx.size(); ++i) { + mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size()); + for (int j = 0; j < vvWordMtx[i].size(); ++j) { + mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str()); + mxSetCell(pChildCellMtx, j, mxStr); + } + mxSetCell(pCellMtx, i, pChildCellMtx); + } + matPutVariable(pMatFile, "wd1", pCellMtx); + matClose(pMatFile); + mxDestroyArray(pCellMtx); + } finish = clock(); cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; //auto & vTest = vvWordMtx[0]; @@ -265,20 +310,19 @@ void CalcEntropy(int argc, const char** argv) { /* 遍历所有的知识颗粒目录,逐一进行处理 */ begin = clock(); - // ThreadPool thPool(numThread); - ThreadPool thPool(24); + ThreadPool thPool(numThread); + // ThreadPool thPool(24); // 查看知识颗粒数量 int numKnowledgeParticle = 0; FOREACH_PARTICLE_START numKnowledgeParticle++; FOREACH_PARTICLE_END - vector vEntropyResult(numKnowledgeParticle); // 存放所有结果 // 遍历每个知识颗粒,逐一进行处理 for (int round = 0; round < 1; ++round) { // 测试用 int i = 0; FOREACH_PARTICLE_START - ThreadParam tParam = { file, &vusAbsWord, &vEntropyResult[i] }; + ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord }; thPool.enqueue(ThreadProcessData, tParam); i++; FOREACH_PARTICLE_END @@ -289,7 +333,7 @@ void CalcEntropy(int argc, const char** argv) { finish = clock(); cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; - /* 合并处理结果 */ + //ofstream ofs("test_out.txt"); //for (auto& item : vEntropyResult) { // auto& vvEntropy = item.vvEntropy; diff --git a/CppRun/readme.txt b/CppRun/readme.md similarity index 100% rename from CppRun/readme.txt rename to CppRun/readme.md diff --git a/GMM/readme.txt b/GMM/readme.md similarity index 100% rename from GMM/readme.txt rename to GMM/readme.md diff --git a/RandSim/readme.txt b/RandSim/readme.md similarity index 100% rename from RandSim/readme.txt rename to RandSim/readme.md