完成了信息熵计算的程序,解决了bug
This commit is contained in:
parent
1a02c97c66
commit
31f0382cc1
|
|
@ -1,3 +1,5 @@
|
|||
*.mat
|
||||
*.txt
|
||||
x64/
|
||||
eigen-3.4.0/
|
||||
*.debug
|
||||
|
|
|
|||
|
|
@ -133,24 +133,25 @@ bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vect
|
|||
return true;
|
||||
}
|
||||
|
||||
// Flatten a 2D (row, col) position into a row-major 1D index.
// colNum is the number of columns in each row of the 2D layout.
inline int Get1DIndex(int colNum, int row, int col) {
    const int flatIndex = col + colNum * row;
    return flatIndex;
}
|
||||
|
||||
/* 处理一个知识颗粒 */
|
||||
struct EntropyResult { // 存放每个文献对应的结果
|
||||
vector<vector<double> > vvEntropy; // 信息熵
|
||||
vector<vector<double> > vvTransEntropy; // 转置的信息熵
|
||||
};
|
||||
struct ThreadParam { // 线程参数
|
||||
fs::path matFilePath;
|
||||
fs::path outFilePath;
|
||||
vector<unordered_set<string> >* pvusWord;
|
||||
EntropyResult* pRes;
|
||||
};
|
||||
void ThreadProcessData(const ThreadParam& param) {
|
||||
const fs::path& matFilePath = param.matFilePath;
|
||||
EntropyResult& res = *param.pRes;
|
||||
const fs::path& outFilePath = param.outFilePath;
|
||||
vector <unordered_set<string> >& vusWord = *param.pvusWord;
|
||||
|
||||
// 存放结果
|
||||
auto& hs = res.vvEntropy;
|
||||
auto& hr = res.vvTransEntropy;
|
||||
// 存放结果,用一维数组存放二维数据
|
||||
vector<double> hs;
|
||||
vector<double> hr;
|
||||
|
||||
vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵)
|
||||
vector<vector<double> > vvFr; // 词汇对应的频率
|
||||
|
|
@ -162,11 +163,10 @@ void ThreadProcessData(const ThreadParam& param) {
|
|||
// cout << vvDs.size() << '\t' << vvDs[0].size() << endl;
|
||||
const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量
|
||||
const int numGroup = vvDs.size(); // ds包含的组数
|
||||
hs.resize(numGroup);
|
||||
hr.resize(numLiterature);
|
||||
for (int i = 0; i < numGroup; ++i) hs[i].resize(numLiterature); // resize会自动初始化
|
||||
for (int i = 0; i < numLiterature; ++i) hr[i].resize(numGroup);
|
||||
for (int groupIdx = 0; groupIdx < vvDs.size(); ++groupIdx) { // 遍历知识颗粒中的每一组
|
||||
hs.resize(numGroup * numLiterature);
|
||||
hr.resize(numLiterature * numGroup);
|
||||
|
||||
for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) { // 遍历知识颗粒中的每一组
|
||||
vector<string>& vDs = vvDs[groupIdx]; // 这一组ds
|
||||
vector<double>& vFr = vvFr[groupIdx]; // frequency
|
||||
const int numWord = vDs.size(); // 这一组数据中包含的单词数量
|
||||
|
|
@ -176,27 +176,53 @@ void ThreadProcessData(const ThreadParam& param) {
|
|||
for (int j = 0; j < numWord; ++j) {
|
||||
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
|
||||
vX[i][j] = 1;
|
||||
// 对每个知识颗粒每一组数据,计算信息熵
|
||||
hs[groupIdx][i] -= vFr[j] * log2(vFr[j]);
|
||||
if (groupIdx == 1 && i == 2) {
|
||||
// cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// cout << vX[0][0] << endl;
|
||||
|
||||
|
||||
|
||||
// 找词汇的最高频率
|
||||
double maxFr = *max_element(vFr.begin(), vFr.end());
|
||||
// 将fr的数值规范化到(0,0.368)之间
|
||||
const double normalMax = 0.368;
|
||||
for (auto& frVal : vFr) frVal = frVal * normalMax / maxFr;
|
||||
maxFr = normalMax;
|
||||
// 对每个知识颗粒每一组数据,计算信息熵
|
||||
for (int i = 0; i < numLiterature; ++i) {
|
||||
|
||||
if (vX[groupIdx][i] == 1) {
|
||||
|
||||
for (int j = 0; j < numWord; ++j) {
|
||||
if (vX[i][j] == 1) {
|
||||
hs[Get1DIndex(numLiterature, groupIdx, i)] -= vFr[j] * log2(vFr[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 找最高频词汇所在的索引位置
|
||||
vector<int> vMaxPos;
|
||||
int idx = 0;
|
||||
for_each(vFr.begin(), vFr.end(), [&idx, maxFr, &vMaxPos](double val) {
|
||||
if (val == maxFr) vMaxPos.push_back(idx);
|
||||
idx++;
|
||||
});
|
||||
|
||||
for (int i = 0; i < numLiterature; ++i) {
|
||||
int cumulateX = 0; // 计算在最高频词汇处,x值的累加结果
|
||||
for (int j = 0; j < vMaxPos.size(); ++j) cumulateX += vX[i][vMaxPos[j]];
|
||||
if (cumulateX == vMaxPos.size()) { // 如果频率最高的词汇都出现在了文献中
|
||||
hr[Get1DIndex(numGroup,i, groupIdx)] = 1; // 应该是表示知识颗粒的这一组数据跟这篇文献相关性比较高
|
||||
}
|
||||
}
|
||||
}
|
||||
/* 将结果(hs和hr)写入每个知识颗粒的目录内 */
|
||||
MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
|
||||
SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature);
|
||||
SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup);
|
||||
matClose(pMatFile);
|
||||
}
|
||||
|
||||
/* 程序入口 */
|
||||
// 运行例子:CalcEntropy d:\Twirls\runtime\ALS_test abs2class.mat d:\Twirls\runtime\pubmed_files\pubmed-multiplesc-set.mat hx_info.mat 12 word.mat
|
||||
void CalcEntropy(int argc, const char** argv) {
|
||||
// argv
|
||||
// 1. 知识颗粒的父目录名称
|
||||
|
|
@ -205,12 +231,13 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
// 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件)
|
||||
// 5. 线程数量(可选)
|
||||
if (argc < 5) {
|
||||
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number])!" << endl;
|
||||
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
|
||||
return;
|
||||
}
|
||||
clock_t begin, finish;
|
||||
string parrentDir(argv[1]); // 知识颗粒的父目录名称
|
||||
string wordMatSuffix(argv[2]); // 高频词汇矩阵对应的mat文件的后缀名(可以是全文件名,可以是文件名后缀,必须保证唯一)
|
||||
fs::path outFileName(argv[4]);
|
||||
int numThread = 1;
|
||||
if (argc >= 5) numThread = atoi(argv[5]);
|
||||
if (numThread < 1) numThread = 1;
|
||||
|
|
@ -224,11 +251,13 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
cout << "PubMed Abstract info is null!" << endl;
|
||||
return;
|
||||
}
|
||||
// 将摘要信息分割成一个一个的词汇
|
||||
/* 将摘要信息分割成一个一个的词汇 */
|
||||
begin = clock();
|
||||
unordered_set<char> usWordChars; // 能组成单词的字符,要不要考虑数字?原版matlab是提取了数字的
|
||||
for (int i = 65; i <= 90; i++) usWordChars.insert(char(i)); // A - Z
|
||||
for (int i = 97; i <= 122; i++) usWordChars.insert(char(i)); // a - z
|
||||
for (int i = 48; i <= 57; i++) usWordChars.insert(char(i)); // 0 - 9
|
||||
usWordChars.insert('/'); usWordChars.insert('+'); usWordChars.insert('-');
|
||||
vector<vector<string> > vvWordMtx(vAbstract.size()); // 初始大小为文章的个数
|
||||
vector<unordered_set<string> > vusAbsWord(vAbstract.size()); // 将每篇文章摘要的单词放入hash表
|
||||
for (int i = 0; i < vAbstract.size(); i++) {
|
||||
|
|
@ -256,6 +285,22 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
vusAbsWord[i].insert(upWord);
|
||||
}
|
||||
}
|
||||
/* 将分割结果写入mat文件 */
|
||||
if (argc >= 6) {
|
||||
MATFile* pMatFile = matOpen(argv[6], "w");
|
||||
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
|
||||
for (int i = 0; i < vvWordMtx.size(); ++i) {
|
||||
mxArray* pChildCellMtx = mxCreateCellMatrix(1, vvWordMtx[i].size());
|
||||
for (int j = 0; j < vvWordMtx[i].size(); ++j) {
|
||||
mxArray* mxStr = mxCreateString(vvWordMtx[i][j].c_str());
|
||||
mxSetCell(pChildCellMtx, j, mxStr);
|
||||
}
|
||||
mxSetCell(pCellMtx, i, pChildCellMtx);
|
||||
}
|
||||
matPutVariable(pMatFile, "wd1", pCellMtx);
|
||||
matClose(pMatFile);
|
||||
mxDestroyArray(pCellMtx);
|
||||
}
|
||||
finish = clock();
|
||||
cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
//auto & vTest = vvWordMtx[0];
|
||||
|
|
@ -265,20 +310,19 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
|
||||
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
||||
begin = clock();
|
||||
// ThreadPool thPool(numThread);
|
||||
ThreadPool thPool(24);
|
||||
ThreadPool thPool(numThread);
|
||||
// ThreadPool thPool(24);
|
||||
// 查看知识颗粒数量
|
||||
int numKnowledgeParticle = 0;
|
||||
FOREACH_PARTICLE_START
|
||||
numKnowledgeParticle++;
|
||||
FOREACH_PARTICLE_END
|
||||
|
||||
vector<EntropyResult> vEntropyResult(numKnowledgeParticle); // 存放所有结果
|
||||
// 遍历每个知识颗粒,逐一进行处理
|
||||
for (int round = 0; round < 1; ++round) { // 测试用
|
||||
int i = 0;
|
||||
FOREACH_PARTICLE_START
|
||||
ThreadParam tParam = { file, &vusAbsWord, &vEntropyResult[i] };
|
||||
ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
|
||||
thPool.enqueue(ThreadProcessData, tParam);
|
||||
i++;
|
||||
FOREACH_PARTICLE_END
|
||||
|
|
@ -289,7 +333,7 @@ void CalcEntropy(int argc, const char** argv) {
|
|||
finish = clock();
|
||||
|
||||
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||
/* 合并处理结果 */
|
||||
|
||||
//ofstream ofs("test_out.txt");
|
||||
//for (auto& item : vEntropyResult) {
|
||||
// auto& vvEntropy = item.vvEntropy;
|
||||
|
|
|
|||
Loading…
Reference in New Issue