/********************************************************************************************* Description: 检查每个文献的摘要部分是否包含设定的高频词汇，并用这些高频词汇计算摘要的信息熵 Copyright : All right reserved by ZheYuan.BJ Author : Zhang Zhonghai Date : 2023/09/20 ***********************************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #include #define F_OK 0 #else #include #endif #include #include "common.h" #include "CommonLib/thread_pool.h" #include "CommonLib/matlab_io.h" using namespace std; using std::cout; using std::vector; namespace fs = std::filesystem; #include "common.h" #include "CommonLib/matlab_io.h" using namespace std; // 遍历知识颗粒，循环处理 #define FOREACH_PARTICLE_START \ for (auto &childDir : fs::directory_iterator(parrentDir)) { \ for (auto &file : fs::directory_iterator(childDir)) { \ const string &fileName = file.path().filename().string(); \ auto rPos = fileName.rfind(wordMatSuffix); \ if (rPos != string::npos && fileName.size() - rPos == wordMatSuffix.size()) { #define FOREACH_PARTICLE_END \ } \ } \ } /* 读取二层cell包裹的字符串,和数值，ds,fr */ #define OUTER_FOR_BEGIN \ rowNum = (int)mxGetM(pMxArray); \ colNum = (int)mxGetN(pMxArray); \ for (int i = 0; i < rowNum; ++i) { \ for (int j = 0; j < colNum; ++j) { \ pCell = mxGetCell(pMxArray, j * rowNum + i); \ int childRowNum = (int)mxGetM(pCell); \ int childColNum = (int)mxGetN(pCell); #define OUTER_FOR_END \ } \ } \ mxDestroyArray(pMxArray); #define INNTER_FOR_BEGIN \ for (int ii = 0; ii < childRowNum; ii++) { \ for (int jj = 0; jj < childColNum; jj++) { \ mxArray *pChildCell = mxGetCell(pCell, jj * childRowNum + ii); #define INNTER_FOR_END \ } \ } // 将matlab存储方式转换成c存储方式 #define TRANS_ROW_COL(dst, src, rowNum, colNum) \ for (int rowI = 0; rowI < rowNum; ++rowI) { \ for (int colJ = 0; colJ < colNum; ++colJ) { \ dst[rowI * colNum + colJ] = src[colJ * rowNum + rowI]; \ } \ } // 读取ds和fr信息 bool ReadInfoFromMat(const string & filePath, vector >&vvDs, vector >&vvFr) { MATFile* pMatFile = nullptr; mxArray* pMxArray = nullptr; mxArray* pCell = nullptr; int rowNum, colNum; char strBuf[STRING_BUF_SIZE]; const string& parrentName = "G"; const string& firstChildName = "ds"; const string& secondChildName = "fr"; pMatFile = matOpen(filePath.c_str(), "r"); //打开.mat文件 if (pMatFile == nullptr) { cout << "filePath is error!" << endl; return false; } mxArray* pMxG = matGetVariable(pMatFile, parrentName.c_str()); //获取G变量 // 读取ds字符串 pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds OUTER_FOR_BEGIN // cout << childRowNum << '\t' << childColNum << endl; vvDs.push_back(vector()); vvDs.back().resize(childRowNum * childColNum); INNTER_FOR_BEGIN if (mxGetString(pChildCell, strBuf, STRING_BUF_SIZE) != 0) { cout << "String is too large to fit in the buffer! " << i + 1 << '\t' << j + 1 << endl; return false; } vvDs.back()[ii * childColNum + jj] = strBuf; auto& lastStr = vvDs.back()[ii * childColNum + jj]; transform(lastStr.begin(), lastStr.end(), lastStr.begin(), ::toupper); // 转成大写 INNTER_FOR_END OUTER_FOR_END // 读取fr数值 pMxArray = mxGetField(pMxG, 0, secondChildName.c_str()); // fr OUTER_FOR_BEGIN vvFr.push_back(vector()); vvFr.back().resize(childRowNum * childColNum); double *pVal = (double*)mxGetData(pCell); //获取指针 TRANS_ROW_COL(vvFr.back(), pVal, childRowNum, childColNum); // 行列存储方式转换 OUTER_FOR_END // 没考虑完全哪些数据需要mxDestroyArray，可能会有内存泄漏 return true; } /* 处理一个知识颗粒 */ struct EntropyResult { // 存放每个文献对应的结果 vector > vvEntropy; // 信息熵 vector > vvTransEntropy; // 转置的信息熵 }; struct ThreadParam { // 线程参数 fs::path matFilePath; vector >* pvusWord; EntropyResult* pRes; }; void ThreadProcessData(const ThreadParam& param) { const fs::path& matFilePath = param.matFilePath; EntropyResult& res = *param.pRes; vector >& vusWord = *param.pvusWord; // 存放结果 auto& hs = res.vvEntropy; auto& hr = res.vvTransEntropy; vector > vvDs; // 每个知识颗粒的ds矩阵（词汇矩阵） vector > vvFr; // 词汇对应的频率 // cout << matFilePath.string() << endl; // 读取G结构体中的ds和fr信息 ReadInfoFromMat(matFilePath.string(), vvDs, vvFr); // res.vvEntropy.push_back(vvFr[0]); // cout << vvDs.size() << '\t' << vvDs[0].size() << endl; const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量 const int numGroup = vvDs.size(); // ds包含的组数 hs.resize(numGroup); hr.resize(numLiterature); for (int i = 0; i < numGroup; ++i) hs[i].resize(numLiterature); // resize会自动初始化 for (int i = 0; i < numLiterature; ++i) hr[i].resize(numGroup); for (int groupIdx = 0; groupIdx < vvDs.size(); ++groupIdx) { // 遍历知识颗粒中的每一组 vector& vDs = vvDs[groupIdx]; // 这一组ds vector& vFr = vvFr[groupIdx]; // frequency const int numWord = vDs.size(); // 这一组数据中包含的单词数量 vector > vX(numLiterature, vector(numWord, 0)); // 检查知识颗粒中的词语是否出现在pubmed摘要的词语中 for (int i= 0; i < numLiterature; ++i) { for (int j = 0; j < numWord; ++j) { if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过 vX[i][j] = 1; // 对每个知识颗粒每一组数据，计算信息熵 hs[groupIdx][i] -= vFr[j] * log2(vFr[j]); } } } // cout << vX[0][0] << endl; for (int i = 0; i < numLiterature; ++i) { if (vX[groupIdx][i] == 1) { } } // 找最高频词汇所在的索引位置 } } /* 程序入口 */ void CalcEntropy(int argc, const char** argv) { // argv // 1. 知识颗粒的父目录名称 // 2. 包含高频词汇信息的mat文件的后缀 // 3. 包含处理后的pubmed文献信息的mat文件路径 // 4. 存放输出结果的mat文件的后缀（每个知识颗粒目录中生成一个结果文件） // 5. 线程数量(可选) if (argc < 5) { cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number])!" << endl; return; } clock_t begin, finish; string parrentDir(argv[1]); // 知识颗粒的父目录名称 string wordMatSuffix(argv[2]); // 高频词汇矩阵对应的mat文件的后缀名（可以是全文件名，可以是文件名后缀，必须保证唯一） int numThread = 1; if (argc >= 5) numThread = atoi(argv[5]); if (numThread < 1) numThread = 1; // cout << "thread num: " << numThread << endl; /* 读入处理后的pubmed文献信息的mat文件，只读入摘要信息，即变量abs1 */ vector vAbstract; int rowNum, colNum; ReadMtxString(argv[3], "abs1", vAbstract, &rowNum, &colNum); if (vAbstract.size() == 0) { // 摘要信息为空，出错 cout << "PubMed Abstract info is null!" << endl; return; } // 将摘要信息分割成一个一个的词汇 begin = clock(); unordered_set usWordChars; // 能组成单词的字符，要不要考虑数字？原版matlab是提取了数字的 for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z vector > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数 vector > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表 for (int i = 0; i < vAbstract.size(); i++) { auto& strAbs = vAbstract[i]; // 遍历摘要字符串的每一个字符，取出每一个单词 vector& vWord = vvWordMtx[i]; if (strAbs.size() == 0) continue; // 摘要信息为空，跳过（一般不会出现这个情况） int wordStartPos = 0; while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end()) wordStartPos++; for (int curPos = wordStartPos + 1; curPos < strAbs.size(); ++curPos) { if (usWordChars.find(strAbs[curPos]) == usWordChars.end()) { // 找到了分割符 vWord.push_back(strAbs.substr(wordStartPos, curPos - wordStartPos)); wordStartPos = curPos + 1; // 找下一个词语起始位置 while (wordStartPos < strAbs.size() && usWordChars.find(strAbs[wordStartPos]) == usWordChars.end()) wordStartPos++; curPos = wordStartPos; // 循环会自动加1 } } // 将处理摘要之后的每个词语放入hash表 for (auto& word : vWord) { string upWord(word); transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper); // cout << upWord << endl; vusAbsWord[i].insert(upWord); } } finish = clock(); cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; //auto & vTest = vvWordMtx[0]; //cout << vTest.size() << endl; //for (auto& str : vTest) cout << str << endl; /* 遍历所有的知识颗粒目录，逐一进行处理 */ begin = clock(); // ThreadPool thPool(numThread); ThreadPool thPool(24); // 查看知识颗粒数量 int numKnowledgeParticle = 0; FOREACH_PARTICLE_START numKnowledgeParticle++; FOREACH_PARTICLE_END vector vEntropyResult(numKnowledgeParticle); // 存放所有结果 // 遍历每个知识颗粒，逐一进行处理 for (int round = 0; round < 1; ++round) { // 测试用 int i = 0; FOREACH_PARTICLE_START ThreadParam tParam = { file, &vusAbsWord, &vEntropyResult[i] }; thPool.enqueue(ThreadProcessData, tParam); i++; FOREACH_PARTICLE_END } // synchronize thPool.~ThreadPool(); finish = clock(); cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 合并处理结果 */ //ofstream ofs("test_out.txt"); //for (auto& item : vEntropyResult) { // auto& vvEntropy = item.vvEntropy; // auto& vVal = vvEntropy[0]; // for (auto& val : vVal) ofs << val << ' '; // ofs << endl; //} //ofs.close(); }