diff --git a/CommonLib/CommonLib.vcxproj b/CommonLib/CommonLib.vcxproj index 321cf13..8836c8c 100644 --- a/CommonLib/CommonLib.vcxproj +++ b/CommonLib/CommonLib.vcxproj @@ -112,6 +112,7 @@ + diff --git a/CommonLib/CommonLib.vcxproj.filters b/CommonLib/CommonLib.vcxproj.filters index 20334ba..7d4df5e 100644 --- a/CommonLib/CommonLib.vcxproj.filters +++ b/CommonLib/CommonLib.vcxproj.filters @@ -28,6 +28,9 @@ Header Files + + Header Files + diff --git a/CommonLib/kthread.h b/CommonLib/kthread.h new file mode 100644 index 0000000..52abb5d --- /dev/null +++ b/CommonLib/kthread.h @@ -0,0 +1,145 @@ +#ifndef KTHREAD_H +#define KTHREAD_H +#include +#include +#include +#include +#include + +using std::atomic; +using std::thread; +using std::vector; + +/************ + * kt_for() * + ************/ + +template +using FuncType3Arg = void (*)(vector&, long, int); + +template +using FuncType1Arg = void (*)(T&); + +template +struct kt_for_t; + +template +struct ktf_worker_t +{ + kt_for_t* t; + atomic i; +}; + +template +struct kt_for_t +{ + int n_threads; + long n; + + ktf_worker_t* w; + FuncType1Arg func1Arg; + FuncType3Arg func3Arg; + vector* data; +}; + +template +static inline long steal_work(kt_for_t* t) +{ + int i, min_i = -1; + long k, min = LONG_MAX; + for (i = 0; i < t->n_threads; ++i) + if (min > t->w[i].i) + min = t->w[i].i, min_i = i; + k = t->w[min_i].i.fetch_add(t->n_threads); + return k >= t->n ? -1 : k; +} + +template +static void ktf_worker_1_arg(void* data) +{ + ktf_worker_t* w = (ktf_worker_t *)data; + long i; + + for (;;) + { + i = w->i.fetch_add(w->t->n_threads); + if (i >= w->t->n) + break; + w->t->func1Arg(( * w->t->data)[i]); + } + while ((i = steal_work(w->t)) >= 0) + w->t->func1Arg((*w->t->data)[i]); +} + +template +static void ktf_worker_3_arg(void* data) +{ + ktf_worker_t* w = (ktf_worker_t *)data; + long i; + + for (;;) + { + i = w->i.fetch_add(w->t->n_threads); + if (i >= w->t->n) + break; + w->t->func3Arg(*w->t->data, i, w - w->t->w); + } + while ((i = steal_work(w->t)) >= 0) + w->t->func3Arg(*w->t->data, i, w - w->t->w); +} + +template +void kt_for(int n_threads, FuncType3Arg func, vector& vData) +{ + const long n = (long)vData.size(); + if (n_threads > 1) + { + int i; + kt_for_t t; + t.func3Arg = func, t.data = &vData, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t *)alloca(n_threads * sizeof(ktf_worker_t)); + vector vThread; + + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i.store(i); + for (i = 0; i < n_threads; ++i) + vThread.push_back(thread(ktf_worker_3_arg, &t.w[i])); + for (i = 0; i < n_threads; ++i) + vThread[i].join(); + } + else + { + long j; + for (j = 0; j < n; ++j) + func(vData, j, 0); + } +} + +template +void kt_for(int n_threads, FuncType1Arg func, vector& vData) +{ + const long n = (long)vData.size(); + if (n_threads > 1) + { + int i; + kt_for_t t; + t.func1Arg = func, t.data = &vData, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t *)alloca(n_threads * sizeof(ktf_worker_t)); + vector vThread; + + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i.store(i); + for (i = 0; i < n_threads; ++i) + vThread.push_back(thread(ktf_worker_1_arg, &t.w[i])); + for (i = 0; i < n_threads; ++i) + vThread[i].join(); + } + else + { + long j; + for (j = 0; j < n; ++j) + func(vData[j]); + } +} + +#endif diff --git a/CppRun/calc_entropy.cpp b/CppRun/calc_entropy.cpp index ea85df5..4e507c6 100644 --- a/CppRun/calc_entropy.cpp +++ b/CppRun/calc_entropy.cpp @@ -33,6 +33,7 @@ #include "common.h" #include "CommonLib/thread_pool.h" #include "CommonLib/matlab_io.h" +#include "CommonLib/kthread.h" using namespace std; using std::cout; using std::vector; @@ -106,7 +107,6 @@ bool ReadInfoFromMat(const string & filePath, vector >&vvDs, vect // 读取ds字符串 pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds OUTER_FOR_BEGIN - // cout << childRowNum << '\t' << childColNum << endl; vvDs.push_back(vector()); vvDs.back().resize(childRowNum * childColNum); INNTER_FOR_BEGIN @@ -144,7 +144,7 @@ struct ThreadParam { // fs::path outFilePath; vector >* pvusWord; }; -void ThreadProcessData(const ThreadParam& param) { +void ThreadProcessData(ThreadParam& param) { const fs::path& matFilePath = param.matFilePath; const fs::path& outFilePath = param.outFilePath; vector >& vusWord = *param.pvusWord; @@ -156,11 +156,8 @@ void ThreadProcessData(const ThreadParam& param) { vector > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵) vector > vvFr; // 词汇对应的频率 - // cout << matFilePath.string() << endl; // 读取G结构体中的ds和fr信息 ReadInfoFromMat(matFilePath.string(), vvDs, vvFr); - // res.vvEntropy.push_back(vvFr[0]); - // cout << vvDs.size() << '\t' << vvDs[0].size() << endl; const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量 const int numGroup = vvDs.size(); // ds包含的组数 hs.resize(numGroup * numLiterature); @@ -176,9 +173,6 @@ void ThreadProcessData(const ThreadParam& param) { for (int j = 0; j < numWord; ++j) { if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过 vX[i][j] = 1; - if (groupIdx == 1 && i == 2) { - // cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl; - } } } } @@ -216,8 +210,8 @@ void ThreadProcessData(const ThreadParam& param) { } /* 将结果(hs和hr)写入每个知识颗粒的目录内 */ MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w"); - SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature); - SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup); + SaveMtxDouble(hs.data(), pMatFile, "hs", numGroup, numLiterature); + SaveMtxDouble(hr.data(), pMatFile, "hr", numLiterature, numGroup); matClose(pMatFile); } @@ -228,7 +222,7 @@ void CalcEntropy(int argc, const char** argv) { // 1. 知识颗粒的父目录名称 // 2. 包含高频词汇信息的mat文件的后缀 // 3. 包含处理后的pubmed文献信息的mat文件路径 - // 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件) + // 4. 存放输出结果的mat文件名(每个知识颗粒目录中生成一个结果文件) // 5. 线程数量(可选) if (argc < 5) { cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl; @@ -241,7 +235,6 @@ void CalcEntropy(int argc, const char** argv) { int numThread = 1; if (argc >= 5) numThread = atoi(argv[5]); if (numThread < 1) numThread = 1; - // cout << "thread num: " << numThread << endl; /* 读入处理后的pubmed文献信息的mat文件,只读入摘要信息,即变量abs1 */ vector vAbstract; @@ -281,11 +274,13 @@ void CalcEntropy(int argc, const char** argv) { for (auto& word : vWord) { string upWord(word); transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper); - // cout << upWord << endl; vusAbsWord[i].insert(upWord); } } + finish = clock(); + cout << "read abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 将分割结果写入mat文件 */ + begin = clock(); if (argc >= 6) { MATFile* pMatFile = matOpen(argv[6], "w"); mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size()); @@ -297,21 +292,16 @@ void CalcEntropy(int argc, const char** argv) { } mxSetCell(pCellMtx, i, pChildCellMtx); } - matPutVariable(pMatFile, "wd1", pCellMtx); + matPutVariable(pMatFile, "wd", pCellMtx); matClose(pMatFile); mxDestroyArray(pCellMtx); } finish = clock(); - cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; - //auto & vTest = vvWordMtx[0]; - //cout << vTest.size() << endl; - //for (auto& str : vTest) cout << str << endl; - + cout << "write abstract word time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 遍历所有的知识颗粒目录,逐一进行处理 */ begin = clock(); - ThreadPool thPool(numThread); - // ThreadPool thPool(24); + //ThreadPool thPool(numThread); // 查看知识颗粒数量 int numKnowledgeParticle = 0; FOREACH_PARTICLE_START @@ -319,27 +309,21 @@ void CalcEntropy(int argc, const char** argv) { FOREACH_PARTICLE_END // 遍历每个知识颗粒,逐一进行处理 + vector vTP; for (int round = 0; round < 1; ++round) { // 测试用 int i = 0; FOREACH_PARTICLE_START - ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord }; - thPool.enqueue(ThreadProcessData, tParam); + //ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord }; + //thPool.enqueue(ThreadProcessData, tParam); + vTP.push_back({ file, childDir / outFileName, &vusAbsWord }); i++; FOREACH_PARTICLE_END } + kt_for(numThread, ThreadProcessData, vTP); // synchronize - thPool.~ThreadPool(); + //thPool.~ThreadPool(); finish = clock(); cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; - - //ofstream ofs("test_out.txt"); - //for (auto& item : vEntropyResult) { - // auto& vvEntropy = item.vvEntropy; - // auto& vVal = vvEntropy[0]; - // for (auto& val : vVal) ofs << val << ' '; - // ofs << endl; - //} - //ofs.close(); } \ No newline at end of file diff --git a/CppRun/main.cpp b/CppRun/main.cpp index c4eeece..d34d699 100644 --- a/CppRun/main.cpp +++ b/CppRun/main.cpp @@ -27,6 +27,7 @@ int main(int argc, const char** argv) { } else if (string(argv[1]) == "CalcEntropy") { /* 璁$畻淇℃伅鐔 */ + cout << "CalcEntropy" << endl; CalcEntropy(argc - 1, argv + 1); } finish = clock(); diff --git a/CppRun/process_pubmed_txt.cpp b/CppRun/process_pubmed_txt.cpp index 8eb630c..2f62c14 100644 --- a/CppRun/process_pubmed_txt.cpp +++ b/CppRun/process_pubmed_txt.cpp @@ -14,7 +14,9 @@ #include #include #include "common.h" +#include "CommonLib/thread_pool.h" #include "CommonLib/matlab_io.h" +#include "CommonLib/kthread.h" using namespace std; /* 灏嗙粨鏋滃啓鍏at鏂囦欢 */ @@ -35,7 +37,7 @@ bool SavePubmed(const string& matPath, } // 鍒涘缓缁撴瀯浣撴暟鎹 - mxArray* mxStruct = mxCreateStructMatrix(1, 1, vTgName.size(), vTgChars.data()); + mxArray* mxStruct = mxCreateStructMatrix(1, 1, (int)vTgName.size(), vTgChars.data()); // 鍒涘缓cell matrix unordered_map ummxCellMtx; for (auto & tgName : vTgName) { @@ -68,9 +70,45 @@ bool SavePubmed(const string& matPath, return true; } +/* 澶勭悊涓绡囨枃绔 */ +struct ThreadParam { // 绾跨▼鍙傛暟 + unordered_map *pumTagContent; + vector* pvLineTag; + vector* pvTgName; + int paperStartIdx; + int paperEndIdx; + unordered_map* pumFullTagToTag; + vector* pvStrPubmedTxt; +}; + +//void ThreadProcessArticle(vector& vTP, long idx, int tid) { +void ThreadProcessArticle(ThreadParam& param) { + //ThreadParam& param = vTP[idx]; + unordered_map& umTagContent = *param.pumTagContent; + vector& vLineTag = *param.pvLineTag; + vector& vTgName = *param.pvTgName; + unordered_map& umFullTagToTag = *param.pumFullTagToTag; + vector& vStrPubmedTxt = *param.pvStrPubmedTxt; + + int startIdx = param.paperStartIdx; + int endIdx = param.paperEndIdx; + + for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) { + umTagContent[vTgName[tgIdx]] = ""; // 瀵规瘡涓涓猼ag锛岃缃竴涓柊鐨剆tring + } + for (int idx = startIdx; idx < endIdx; ++idx) { // 閬嶅巻褰撳墠鏂囩珷鐨勬瘡涓涓猼ag鍐呭 + string& fullTag = vLineTag[idx]; + auto tagItr = umFullTagToTag.find(fullTag); + if (tagItr != umFullTagToTag.end()) { // 鎵惧埌tag浜 + const string& tag = tagItr->second; + string& tagContent = umTagContent[tag]; + tagContent.append(vStrPubmedTxt[idx]); + } + } +} // 鍛戒护琛屽弬鏁扮ず渚 -// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat +// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12 /* pubmed txt鏂囦欢涓寘鍚涓枃绔犵殑鎽樿淇℃伅锛屾瘡涓俊鎭渶鍓嶈竟鏈変竴涓猼ag锛屾瘡涓猼ag瀵瑰簲鐨勪俊鎭彲鑳芥湁涓琛岋紝涔熷彲鑳藉琛岋紝姣忎釜鏂囩珷涓棿鐢变竴涓┖琛岄殧寮 1. 璇诲彇棰勫厛鎻愬彇鐨刾ubmed tags, 骞跺皢tags涓殑'-'鍜' '瀛楃鍘绘帀锛屽彧鐣欎笅绾瓧绗︿覆鍋歵ag @@ -81,11 +119,11 @@ bool SavePubmed(const string& matPath, void ProcessPubmedTxt(int argc, const char** argv) { // argv 1.pubmed tag.mat鏂囦欢; 2.pubmed article.txt鏂囦欢; 3.pubmed out.mat杈撳嚭鏂囦欢 // - if (argc != 4) { - cout << "This program should take 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat)!" << endl; + if (argc < 4) { + cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl; return; } - + clock_t begin, finish; int rowNum, colNum; vector vTg; vector vTgName; @@ -94,6 +132,7 @@ void ProcessPubmedTxt(int argc, const char** argv) { /* 璇诲彇pubmed tags */ ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum); /* 1. 鍘绘帀tags閲岀殑'-'鍜' '瀛楃锛屽緱鍒扮函鍑鐨則ag */ + begin = clock(); vTgName = vTg; for (int i = 0; i < vTg.size(); ++i) { int pos = 0; @@ -105,6 +144,8 @@ void ProcessPubmedTxt(int argc, const char** argv) { vTgName[i].resize(pos); umFullTagToTag[vTg[i]] = vTgName[i]; } + finish = clock(); + cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 2. 璇诲彇pubmed txt鏂囦欢锛屽厛璇诲叆鍚庡鐞 */ ifstream ifsPubmedTxt(argv[2]); @@ -117,6 +158,7 @@ void ProcessPubmedTxt(int argc, const char** argv) { int curPos = 0; vPaperStartIdx.push_back(curPos); // 娣诲姞鍒濆绱㈠紩 const int FULL_TAG_LEN = 5; + begin = clock(); while (getline(ifsPubmedTxt, strLine)) { // 璇诲彇鍐呭鏃跺欏幓鎺変簡琛屽熬鐨勬崲琛岀 while (strLine.back() == ' ') strLine.pop_back(); // 鍘绘帀琛屽熬鐨勭┖鏍 if (strLine.size() == 0) { // 鏂扮殑paper @@ -135,31 +177,27 @@ void ProcessPubmedTxt(int argc, const char** argv) { } } vPaperStartIdx.push_back(curPos); // 姣旀枃绔犲1锛屾渶鍚庝竴涓褰曠粨鏉熶綅缃 + finish = clock(); + cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 澶勭悊姣忎竴绡囨枃绔 */ - for (int i = 0; i < vPaperStartIdx.size() - 1; ++i) { - int startIdx = vPaperStartIdx[i]; - int endIdx = vPaperStartIdx[i + 1]; - unordered_map umTagContent; - - for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) { - umTagContent[vTgName[tgIdx]] = ""; // 瀵规瘡涓涓猼ag锛岃缃竴涓柊鐨剆tring - } - for (int idx = startIdx; idx < endIdx; ++idx) { // 閬嶅巻褰撳墠鏂囩珷鐨勬瘡涓涓猼ag鍐呭 - string& fullTag = vLineTag[idx]; - auto tagItr = umFullTagToTag.find(fullTag); - if (tagItr != umFullTagToTag.end()) { // 鎵惧埌tag浜 - const string& tag = tagItr->second; - string& tagContent = umTagContent[tag]; - tagContent.append(vStrPubmedTxt[idx]); - } - } - vumPaperTagVal.push_back(umTagContent); + int numThread = 1; + if (argc >= 5) numThread = atoi(argv[4]); + if (numThread < 1) numThread = 1; + ThreadPool thPool(numThread); + vumPaperTagVal.resize(vPaperStartIdx.size()-1); + vector vT; + vector vTP(vPaperStartIdx.size() - 1); + begin = clock(); + for (int i = 0; i < vTP.size(); ++i) { + vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt }; } - - // cout << "鏂囦欢涓暟锛" << vumPaperTagVal.size() << endl; + kt_for(numThread, ThreadProcessArticle, vTP); + finish = clock(); + cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 鍘婚櫎娌℃湁鎽樿鐨勬枃绔 */ + begin = clock(); const string abstractTag = "AB"; for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) { if ((*itr)[abstractTag].size() == 0) { @@ -169,8 +207,11 @@ void ProcessPubmedTxt(int argc, const char** argv) { itr++; } } + finish = clock(); + cout << "remove no AB Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 鏍规嵁PMID锛屽幓闄ゅ啑浣 */ + begin = clock(); unordered_map umPMID; const string pmidTag = "PMID"; for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) { @@ -183,16 +224,25 @@ void ProcessPubmedTxt(int argc, const char** argv) { itr++; } } + finish = clock(); + cout << "remove duplication Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 灏唗itle鍜宎bstract鍚堝苟锛岃祴鍊肩粰abstract */ + begin = clock(); const string titleTag = "TI"; for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) { string& abstractStr = (*itr)[abstractTag]; abstractStr = (*itr)[titleTag] + " " + abstractStr; // 鍙兘浼氭湁鎬ц兘鎹熷け锛屼笉杩囧奖鍝嶄笉澶 } + finish = clock(); + cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; + // 鍏抽棴txt鏂囦欢 ifsPubmedTxt.close(); /* 灏嗗鐞嗗悗鐨勬暟鎹啓鍏at鏂囦欢锛宮at涓殑鍙橀噺鍚嶇О鍒嗗埆涓篢x鍜宎bs1 */ + begin = clock(); SavePubmed(argv[3], vTgName, vumPaperTagVal); + finish = clock(); + cout << "write to MAT Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; } \ No newline at end of file diff --git a/GMM/main.cpp b/GMM/main.cpp index 4f254cf..cd93d62 100644 --- a/GMM/main.cpp +++ b/GMM/main.cpp @@ -30,8 +30,9 @@ #endif #include #include "gmm.h" -#include "CommonLib/thread_pool.h" +// #include "CommonLib/thread_pool.h" #include "CommonLib/matlab_io.h" +#include "CommonLib/kthread.h" using namespace std; using std::cout; using std::vector; @@ -144,7 +145,9 @@ struct ThreadParam { fs::path matFilePath; fs::path outFilePath; }; -void ThreadProcessData(const ThreadParam& param) { +//void ThreadProcessData(vector& vTP, long idx, int tid) { +void ThreadProcessData(ThreadParam& param) { + //const ThreadParam& param = vTP[idx]; const fs::path& matFilePath = param.matFilePath; const fs::path& outFilePath = param.outFilePath; double* hs = nullptr; @@ -192,10 +195,10 @@ int main(int argc, const char** argv) { int numThread = 1; if (argc >= 4) numThread = atoi(argv[4]); if (numThread < 1) numThread = 1; - ThreadPool thPool(numThread); + //ThreadPool thPool(numThread); clock_t begin, finish; begin = clock(); - + vector vTP; /* 遍历所有的知识颗粒目录,逐一进行处理 */ for (auto& childDir : fs::directory_iterator(parrentDir)) { fs::path outFilePath = childDir / outFileName; @@ -203,12 +206,14 @@ int main(int argc, const char** argv) { const string& fileName = file.path().filename().string(); auto rPos = fileName.rfind(hsMatSuffix); if (rPos != string::npos && fileName.size() - rPos == hsMatSuffix.size()) { - ThreadParam tParam = { file, outFilePath }; - thPool.enqueue(ThreadProcessData, tParam); + //ThreadParam tParam = { file, outFilePath }; + //thPool.enqueue(ThreadProcessData, tParam); + vTP.push_back({ file, outFilePath }); } } } - thPool.~ThreadPool(); + kt_for(numThread, ThreadProcessData, vTP); + //thPool.~ThreadPool(); finish = clock(); cout << "GMM Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; return 0;