添加了kthread,并适配c++和windows
This commit is contained in:
parent
31f0382cc1
commit
0c73318fb7
|
|
@ -112,6 +112,7 @@
|
||||||
</Text>
|
</Text>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
<ClInclude Include="kthread.h" />
|
||||||
<ClInclude Include="matlab_io.h" />
|
<ClInclude Include="matlab_io.h" />
|
||||||
<ClInclude Include="thread_pool.h" />
|
<ClInclude Include="thread_pool.h" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,9 @@
|
||||||
<ClInclude Include="thread_pool.h">
|
<ClInclude Include="thread_pool.h">
|
||||||
<Filter>Header Files</Filter>
|
<Filter>Header Files</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
<ClInclude Include="kthread.h">
|
||||||
|
<Filter>Header Files</Filter>
|
||||||
|
</ClInclude>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="matlab_io.cpp">
|
<ClCompile Include="matlab_io.cpp">
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,145 @@
|
||||||
|
#ifndef KTHREAD_H
|
||||||
|
#define KTHREAD_H
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
|
using std::atomic;
|
||||||
|
using std::thread;
|
||||||
|
using std::vector;
|
||||||
|
|
||||||
|
/************
|
||||||
|
* kt_for() *
|
||||||
|
************/
|
||||||
|
|
||||||
|
/// Callback form that receives the whole data vector, the index of the
/// element to process, and the id of the worker making the call.
template <typename T>
using FuncType3Arg = void (*)(vector<T>&, long, int);

/// Callback form that receives only the element to process.
template <typename T>
using FuncType1Arg = void (*)(T&);

template <typename T>
struct kt_for_t;

/// State owned by one worker of a kt_for() run.
template <typename T>
struct ktf_worker_t
{
    kt_for_t<T>* t;  // job description shared by every worker of the run
    atomic<long> i;  // next index this worker claims; other workers may advance it when stealing
};

/// Description of one kt_for() run, shared by all of its workers.
template <typename T>
struct kt_for_t
{
    int n_threads;  // number of workers; also the stride between indices claimed by one worker
    long n;         // number of elements to process

    ktf_worker_t<T>* w;        // array of n_threads worker records
    FuncType1Arg<T> func1Arg;  // callback used by ktf_worker_1_arg
    FuncType3Arg<T> func3Arg;  // callback used by ktf_worker_3_arg
    vector<T>* data;           // elements handed to the callback
};
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
static inline long steal_work(kt_for_t<T>* t)
|
||||||
|
{
|
||||||
|
int i, min_i = -1;
|
||||||
|
long k, min = LONG_MAX;
|
||||||
|
for (i = 0; i < t->n_threads; ++i)
|
||||||
|
if (min > t->w[i].i)
|
||||||
|
min = t->w[i].i, min_i = i;
|
||||||
|
k = t->w[min_i].i.fetch_add(t->n_threads);
|
||||||
|
return k >= t->n ? -1 : k;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
static void ktf_worker_1_arg(void* data)
|
||||||
|
{
|
||||||
|
ktf_worker_t<T>* w = (ktf_worker_t<T> *)data;
|
||||||
|
long i;
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
i = w->i.fetch_add(w->t->n_threads);
|
||||||
|
if (i >= w->t->n)
|
||||||
|
break;
|
||||||
|
w->t->func1Arg(( * w->t->data)[i]);
|
||||||
|
}
|
||||||
|
while ((i = steal_work<T>(w->t)) >= 0)
|
||||||
|
w->t->func1Arg((*w->t->data)[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T>
|
||||||
|
static void ktf_worker_3_arg(void* data)
|
||||||
|
{
|
||||||
|
ktf_worker_t<T>* w = (ktf_worker_t<T> *)data;
|
||||||
|
long i;
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
i = w->i.fetch_add(w->t->n_threads);
|
||||||
|
if (i >= w->t->n)
|
||||||
|
break;
|
||||||
|
w->t->func3Arg(*w->t->data, i, w - w->t->w);
|
||||||
|
}
|
||||||
|
while ((i = steal_work<T>(w->t)) >= 0)
|
||||||
|
w->t->func3Arg(*w->t->data, i, w - w->t->w);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void kt_for(int n_threads, FuncType3Arg<T> func, vector<T>& vData)
|
||||||
|
{
|
||||||
|
const long n = (long)vData.size();
|
||||||
|
if (n_threads > 1)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
kt_for_t<T> t;
|
||||||
|
t.func3Arg = func, t.data = &vData, t.n_threads = n_threads, t.n = n;
|
||||||
|
t.w = (ktf_worker_t<T> *)alloca(n_threads * sizeof(ktf_worker_t<T>));
|
||||||
|
vector<thread> vThread;
|
||||||
|
|
||||||
|
for (i = 0; i < n_threads; ++i)
|
||||||
|
t.w[i].t = &t, t.w[i].i.store(i);
|
||||||
|
for (i = 0; i < n_threads; ++i)
|
||||||
|
vThread.push_back(thread(ktf_worker_3_arg<T>, &t.w[i]));
|
||||||
|
for (i = 0; i < n_threads; ++i)
|
||||||
|
vThread[i].join();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
long j;
|
||||||
|
for (j = 0; j < n; ++j)
|
||||||
|
func(vData, j, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void kt_for(int n_threads, FuncType1Arg<T> func, vector<T>& vData)
|
||||||
|
{
|
||||||
|
const long n = (long)vData.size();
|
||||||
|
if (n_threads > 1)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
kt_for_t<T> t;
|
||||||
|
t.func1Arg = func, t.data = &vData, t.n_threads = n_threads, t.n = n;
|
||||||
|
t.w = (ktf_worker_t<T> *)alloca(n_threads * sizeof(ktf_worker_t<T>));
|
||||||
|
vector<thread> vThread;
|
||||||
|
|
||||||
|
for (i = 0; i < n_threads; ++i)
|
||||||
|
t.w[i].t = &t, t.w[i].i.store(i);
|
||||||
|
for (i = 0; i < n_threads; ++i)
|
||||||
|
vThread.push_back(thread(ktf_worker_1_arg<T>, &t.w[i]));
|
||||||
|
for (i = 0; i < n_threads; ++i)
|
||||||
|
vThread[i].join();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
long j;
|
||||||
|
for (j = 0; j < n; ++j)
|
||||||
|
func(vData[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -33,6 +33,7 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "CommonLib/thread_pool.h"
|
#include "CommonLib/thread_pool.h"
|
||||||
#include "CommonLib/matlab_io.h"
|
#include "CommonLib/matlab_io.h"
|
||||||
|
#include "CommonLib/kthread.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using std::cout;
|
using std::cout;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
@ -106,7 +107,6 @@ bool ReadInfoFromMat(const string & filePath, vector<vector<string> >&vvDs, vect
|
||||||
// 读取ds字符串
|
// 读取ds字符串
|
||||||
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
|
pMxArray = mxGetField(pMxG, 0, firstChildName.c_str()); // ds
|
||||||
OUTER_FOR_BEGIN
|
OUTER_FOR_BEGIN
|
||||||
// cout << childRowNum << '\t' << childColNum << endl;
|
|
||||||
vvDs.push_back(vector<string>());
|
vvDs.push_back(vector<string>());
|
||||||
vvDs.back().resize(childRowNum * childColNum);
|
vvDs.back().resize(childRowNum * childColNum);
|
||||||
INNTER_FOR_BEGIN
|
INNTER_FOR_BEGIN
|
||||||
|
|
@ -144,7 +144,7 @@ struct ThreadParam { //
|
||||||
fs::path outFilePath;
|
fs::path outFilePath;
|
||||||
vector<unordered_set<string> >* pvusWord;
|
vector<unordered_set<string> >* pvusWord;
|
||||||
};
|
};
|
||||||
void ThreadProcessData(const ThreadParam& param) {
|
void ThreadProcessData(ThreadParam& param) {
|
||||||
const fs::path& matFilePath = param.matFilePath;
|
const fs::path& matFilePath = param.matFilePath;
|
||||||
const fs::path& outFilePath = param.outFilePath;
|
const fs::path& outFilePath = param.outFilePath;
|
||||||
vector <unordered_set<string> >& vusWord = *param.pvusWord;
|
vector <unordered_set<string> >& vusWord = *param.pvusWord;
|
||||||
|
|
@ -156,11 +156,8 @@ void ThreadProcessData(const ThreadParam& param) {
|
||||||
vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵)
|
vector<vector<string> > vvDs; // 每个知识颗粒的ds矩阵(词汇矩阵)
|
||||||
vector<vector<double> > vvFr; // 词汇对应的频率
|
vector<vector<double> > vvFr; // 词汇对应的频率
|
||||||
|
|
||||||
// cout << matFilePath.string() << endl;
|
|
||||||
// 读取G结构体中的ds和fr信息
|
// 读取G结构体中的ds和fr信息
|
||||||
ReadInfoFromMat(matFilePath.string(), vvDs, vvFr);
|
ReadInfoFromMat(matFilePath.string(), vvDs, vvFr);
|
||||||
// res.vvEntropy.push_back(vvFr[0]);
|
|
||||||
// cout << vvDs.size() << '\t' << vvDs[0].size() << endl;
|
|
||||||
const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量
|
const int numLiterature = vusWord.size(); // pubmed 文件中包含的文献数量
|
||||||
const int numGroup = vvDs.size(); // ds包含的组数
|
const int numGroup = vvDs.size(); // ds包含的组数
|
||||||
hs.resize(numGroup * numLiterature);
|
hs.resize(numGroup * numLiterature);
|
||||||
|
|
@ -176,9 +173,6 @@ void ThreadProcessData(const ThreadParam& param) {
|
||||||
for (int j = 0; j < numWord; ++j) {
|
for (int j = 0; j < numWord; ++j) {
|
||||||
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
|
if (vusWord[i].find(vDs[j]) != vusWord[i].end()) { // 这一组单词中的j索引位置的单词在第i个文献中出现过
|
||||||
vX[i][j] = 1;
|
vX[i][j] = 1;
|
||||||
if (groupIdx == 1 && i == 2) {
|
|
||||||
// cout << matFilePath.string() << '\t' << j+1 << '\t' << vDs[j] << endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -216,8 +210,8 @@ void ThreadProcessData(const ThreadParam& param) {
|
||||||
}
|
}
|
||||||
/* 将结果(hs和hr)写入每个知识颗粒的目录内 */
|
/* 将结果(hs和hr)写入每个知识颗粒的目录内 */
|
||||||
MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
|
MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
|
||||||
SaveMtxDouble(hs.data(), pMatFile, "hs1", numGroup, numLiterature);
|
SaveMtxDouble(hs.data(), pMatFile, "hs", numGroup, numLiterature);
|
||||||
SaveMtxDouble(hr.data(), pMatFile, "hr1", numLiterature, numGroup);
|
SaveMtxDouble(hr.data(), pMatFile, "hr", numLiterature, numGroup);
|
||||||
matClose(pMatFile);
|
matClose(pMatFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -228,7 +222,7 @@ void CalcEntropy(int argc, const char** argv) {
|
||||||
// 1. 知识颗粒的父目录名称
|
// 1. 知识颗粒的父目录名称
|
||||||
// 2. 包含高频词汇信息的mat文件的后缀
|
// 2. 包含高频词汇信息的mat文件的后缀
|
||||||
// 3. 包含处理后的pubmed文献信息的mat文件路径
|
// 3. 包含处理后的pubmed文献信息的mat文件路径
|
||||||
// 4. 存放输出结果的mat文件的后缀(每个知识颗粒目录中生成一个结果文件)
|
// 4. 存放输出结果的mat文件名(每个知识颗粒目录中生成一个结果文件)
|
||||||
// 5. 线程数量(可选)
|
// 5. 线程数量(可选)
|
||||||
if (argc < 5) {
|
if (argc < 5) {
|
||||||
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
|
cout << "This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])!" << endl;
|
||||||
|
|
@ -241,7 +235,6 @@ void CalcEntropy(int argc, const char** argv) {
|
||||||
int numThread = 1;
|
int numThread = 1;
|
||||||
if (argc >= 5) numThread = atoi(argv[5]);
|
if (argc >= 5) numThread = atoi(argv[5]);
|
||||||
if (numThread < 1) numThread = 1;
|
if (numThread < 1) numThread = 1;
|
||||||
// cout << "thread num: " << numThread << endl;
|
|
||||||
|
|
||||||
/* 读入处理后的pubmed文献信息的mat文件,只读入摘要信息,即变量abs1 */
|
/* 读入处理后的pubmed文献信息的mat文件,只读入摘要信息,即变量abs1 */
|
||||||
vector<string> vAbstract;
|
vector<string> vAbstract;
|
||||||
|
|
@ -281,11 +274,13 @@ void CalcEntropy(int argc, const char** argv) {
|
||||||
for (auto& word : vWord) {
|
for (auto& word : vWord) {
|
||||||
string upWord(word);
|
string upWord(word);
|
||||||
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
|
transform(upWord.begin(), upWord.end(), upWord.begin(), ::toupper);
|
||||||
// cout << upWord << endl;
|
|
||||||
vusAbsWord[i].insert(upWord);
|
vusAbsWord[i].insert(upWord);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
finish = clock();
|
||||||
|
cout << "read abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
/* 将分割结果写入mat文件 */
|
/* 将分割结果写入mat文件 */
|
||||||
|
begin = clock();
|
||||||
if (argc >= 6) {
|
if (argc >= 6) {
|
||||||
MATFile* pMatFile = matOpen(argv[6], "w");
|
MATFile* pMatFile = matOpen(argv[6], "w");
|
||||||
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
|
mxArray* pCellMtx= mxCreateCellMatrix(1, vvWordMtx.size());
|
||||||
|
|
@ -297,21 +292,16 @@ void CalcEntropy(int argc, const char** argv) {
|
||||||
}
|
}
|
||||||
mxSetCell(pCellMtx, i, pChildCellMtx);
|
mxSetCell(pCellMtx, i, pChildCellMtx);
|
||||||
}
|
}
|
||||||
matPutVariable(pMatFile, "wd1", pCellMtx);
|
matPutVariable(pMatFile, "wd", pCellMtx);
|
||||||
matClose(pMatFile);
|
matClose(pMatFile);
|
||||||
mxDestroyArray(pCellMtx);
|
mxDestroyArray(pCellMtx);
|
||||||
}
|
}
|
||||||
finish = clock();
|
finish = clock();
|
||||||
cout << "abstract time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "write abstract word time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
//auto & vTest = vvWordMtx[0];
|
|
||||||
//cout << vTest.size() << endl;
|
|
||||||
//for (auto& str : vTest) cout << str << endl;
|
|
||||||
|
|
||||||
|
|
||||||
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
||||||
begin = clock();
|
begin = clock();
|
||||||
ThreadPool thPool(numThread);
|
//ThreadPool thPool(numThread);
|
||||||
// ThreadPool thPool(24);
|
|
||||||
// 查看知识颗粒数量
|
// 查看知识颗粒数量
|
||||||
int numKnowledgeParticle = 0;
|
int numKnowledgeParticle = 0;
|
||||||
FOREACH_PARTICLE_START
|
FOREACH_PARTICLE_START
|
||||||
|
|
@ -319,27 +309,21 @@ void CalcEntropy(int argc, const char** argv) {
|
||||||
FOREACH_PARTICLE_END
|
FOREACH_PARTICLE_END
|
||||||
|
|
||||||
// 遍历每个知识颗粒,逐一进行处理
|
// 遍历每个知识颗粒,逐一进行处理
|
||||||
|
vector<ThreadParam> vTP;
|
||||||
for (int round = 0; round < 1; ++round) { // 测试用
|
for (int round = 0; round < 1; ++round) { // 测试用
|
||||||
int i = 0;
|
int i = 0;
|
||||||
FOREACH_PARTICLE_START
|
FOREACH_PARTICLE_START
|
||||||
ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
|
//ThreadParam tParam = { file, childDir / outFileName, &vusAbsWord };
|
||||||
thPool.enqueue(ThreadProcessData, tParam);
|
//thPool.enqueue(ThreadProcessData, tParam);
|
||||||
|
vTP.push_back({ file, childDir / outFileName, &vusAbsWord });
|
||||||
i++;
|
i++;
|
||||||
FOREACH_PARTICLE_END
|
FOREACH_PARTICLE_END
|
||||||
}
|
}
|
||||||
|
kt_for(numThread, ThreadProcessData, vTP);
|
||||||
|
|
||||||
// synchronize
|
// synchronize
|
||||||
thPool.~ThreadPool();
|
//thPool.~ThreadPool();
|
||||||
finish = clock();
|
finish = clock();
|
||||||
|
|
||||||
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "thread pool time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
//ofstream ofs("test_out.txt");
|
|
||||||
//for (auto& item : vEntropyResult) {
|
|
||||||
// auto& vvEntropy = item.vvEntropy;
|
|
||||||
// auto& vVal = vvEntropy[0];
|
|
||||||
// for (auto& val : vVal) ofs << val << ' ';
|
|
||||||
// ofs << endl;
|
|
||||||
//}
|
|
||||||
//ofs.close();
|
|
||||||
}
|
}
|
||||||
|
|
@ -27,6 +27,7 @@ int main(int argc, const char** argv) {
|
||||||
}
|
}
|
||||||
else if (string(argv[1]) == "CalcEntropy") {
|
else if (string(argv[1]) == "CalcEntropy") {
|
||||||
/* 计算信息熵 */
|
/* 计算信息熵 */
|
||||||
|
cout << "CalcEntropy" << endl;
|
||||||
CalcEntropy(argc - 1, argv + 1);
|
CalcEntropy(argc - 1, argv + 1);
|
||||||
}
|
}
|
||||||
finish = clock();
|
finish = clock();
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,9 @@
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <mat.h>
|
#include <mat.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "CommonLib/thread_pool.h"
|
||||||
#include "CommonLib/matlab_io.h"
|
#include "CommonLib/matlab_io.h"
|
||||||
|
#include "CommonLib/kthread.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
/* 将结果写入mat文件 */
|
/* 将结果写入mat文件 */
|
||||||
|
|
@ -35,7 +37,7 @@ bool SavePubmed(const string& matPath,
|
||||||
}
|
}
|
||||||
|
|
||||||
// 创建结构体数据
|
// 创建结构体数据
|
||||||
mxArray* mxStruct = mxCreateStructMatrix(1, 1, vTgName.size(), vTgChars.data());
|
mxArray* mxStruct = mxCreateStructMatrix(1, 1, (int)vTgName.size(), vTgChars.data());
|
||||||
// 创建cell matrix
|
// 创建cell matrix
|
||||||
unordered_map<string, mxArray*> ummxCellMtx;
|
unordered_map<string, mxArray*> ummxCellMtx;
|
||||||
for (auto & tgName : vTgName) {
|
for (auto & tgName : vTgName) {
|
||||||
|
|
@ -68,9 +70,45 @@ bool SavePubmed(const string& matPath,
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
/* 处理一篇文章 */
|
||||||
|
// Thread parameters: everything ThreadProcessArticle needs to handle one article.
struct ThreadParam {
	unordered_map<string, string>* pumTagContent;    // output: tag name -> accumulated content for this article
	vector<string>* pvLineTag;                       // full tag of every input line
	vector<string>* pvTgName;                        // all tag names; each gets an entry in the output map
	int paperStartIdx;                               // first line index of this article
	int paperEndIdx;                                 // one past the last line index of this article
	unordered_map<string, string>* pumFullTagToTag;  // full tag -> tag name lookup
	vector<string>* pvStrPubmedTxt;                  // text of every input line, indexed like pvLineTag
};
|
||||||
|
|
||||||
|
//void ThreadProcessArticle(vector<ThreadParam>& vTP, long idx, int tid) {
|
||||||
|
void ThreadProcessArticle(ThreadParam& param) {
|
||||||
|
//ThreadParam& param = vTP[idx];
|
||||||
|
unordered_map<string, string>& umTagContent = *param.pumTagContent;
|
||||||
|
vector<string>& vLineTag = *param.pvLineTag;
|
||||||
|
vector<string>& vTgName = *param.pvTgName;
|
||||||
|
unordered_map<string, string>& umFullTagToTag = *param.pumFullTagToTag;
|
||||||
|
vector<string>& vStrPubmedTxt = *param.pvStrPubmedTxt;
|
||||||
|
|
||||||
|
int startIdx = param.paperStartIdx;
|
||||||
|
int endIdx = param.paperEndIdx;
|
||||||
|
|
||||||
|
for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) {
|
||||||
|
umTagContent[vTgName[tgIdx]] = ""; // 对每一个tag,设置一个新的string
|
||||||
|
}
|
||||||
|
for (int idx = startIdx; idx < endIdx; ++idx) { // 遍历当前文章的每一个tag内容
|
||||||
|
string& fullTag = vLineTag[idx];
|
||||||
|
auto tagItr = umFullTagToTag.find(fullTag);
|
||||||
|
if (tagItr != umFullTagToTag.end()) { // 找到tag了
|
||||||
|
const string& tag = tagItr->second;
|
||||||
|
string& tagContent = umTagContent[tag];
|
||||||
|
tagContent.append(vStrPubmedTxt[idx]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// 命令行参数示例
|
// 命令行参数示例
|
||||||
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat
|
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12
|
||||||
/*
|
/*
|
||||||
pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开
|
pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开
|
||||||
1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag
|
1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag
|
||||||
|
|
@ -81,11 +119,11 @@ bool SavePubmed(const string& matPath,
|
||||||
void ProcessPubmedTxt(int argc, const char** argv) {
|
void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
|
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
|
||||||
//
|
//
|
||||||
if (argc != 4) {
|
if (argc < 4) {
|
||||||
cout << "This program should take 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat)!" << endl;
|
cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
clock_t begin, finish;
|
||||||
int rowNum, colNum;
|
int rowNum, colNum;
|
||||||
vector<string> vTg;
|
vector<string> vTg;
|
||||||
vector<string> vTgName;
|
vector<string> vTgName;
|
||||||
|
|
@ -94,6 +132,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
/* 读取pubmed tags */
|
/* 读取pubmed tags */
|
||||||
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
|
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
|
||||||
/* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */
|
/* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */
|
||||||
|
begin = clock();
|
||||||
vTgName = vTg;
|
vTgName = vTg;
|
||||||
for (int i = 0; i < vTg.size(); ++i) {
|
for (int i = 0; i < vTg.size(); ++i) {
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
|
|
@ -105,6 +144,8 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
vTgName[i].resize(pos);
|
vTgName[i].resize(pos);
|
||||||
umFullTagToTag[vTg[i]] = vTgName[i];
|
umFullTagToTag[vTg[i]] = vTgName[i];
|
||||||
}
|
}
|
||||||
|
finish = clock();
|
||||||
|
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
/* 2. 读取pubmed txt文件,先读入后处理 */
|
/* 2. 读取pubmed txt文件,先读入后处理 */
|
||||||
ifstream ifsPubmedTxt(argv[2]);
|
ifstream ifsPubmedTxt(argv[2]);
|
||||||
|
|
@ -117,6 +158,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
int curPos = 0;
|
int curPos = 0;
|
||||||
vPaperStartIdx.push_back(curPos); // 添加初始索引
|
vPaperStartIdx.push_back(curPos); // 添加初始索引
|
||||||
const int FULL_TAG_LEN = 5;
|
const int FULL_TAG_LEN = 5;
|
||||||
|
begin = clock();
|
||||||
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
|
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
|
||||||
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
|
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
|
||||||
if (strLine.size() == 0) { // 新的paper
|
if (strLine.size() == 0) { // 新的paper
|
||||||
|
|
@ -135,31 +177,27 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置
|
vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置
|
||||||
|
finish = clock();
|
||||||
|
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
/* 处理每一篇文章 */
|
/* 处理每一篇文章 */
|
||||||
for (int i = 0; i < vPaperStartIdx.size() - 1; ++i) {
|
int numThread = 1;
|
||||||
int startIdx = vPaperStartIdx[i];
|
if (argc >= 5) numThread = atoi(argv[4]);
|
||||||
int endIdx = vPaperStartIdx[i + 1];
|
if (numThread < 1) numThread = 1;
|
||||||
unordered_map<string, string> umTagContent;
|
ThreadPool thPool(numThread);
|
||||||
|
vumPaperTagVal.resize(vPaperStartIdx.size()-1);
|
||||||
for (int tgIdx = 0; tgIdx < vTgName.size(); ++tgIdx) {
|
vector<thread> vT;
|
||||||
umTagContent[vTgName[tgIdx]] = ""; // 对每一个tag,设置一个新的string
|
vector<ThreadParam> vTP(vPaperStartIdx.size() - 1);
|
||||||
}
|
begin = clock();
|
||||||
for (int idx = startIdx; idx < endIdx; ++idx) { // 遍历当前文章的每一个tag内容
|
for (int i = 0; i < vTP.size(); ++i) {
|
||||||
string& fullTag = vLineTag[idx];
|
vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
|
||||||
auto tagItr = umFullTagToTag.find(fullTag);
|
|
||||||
if (tagItr != umFullTagToTag.end()) { // 找到tag了
|
|
||||||
const string& tag = tagItr->second;
|
|
||||||
string& tagContent = umTagContent[tag];
|
|
||||||
tagContent.append(vStrPubmedTxt[idx]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
vumPaperTagVal.push_back(umTagContent);
|
|
||||||
}
|
}
|
||||||
|
kt_for(numThread, ThreadProcessArticle, vTP);
|
||||||
// cout << "文件个数:" << vumPaperTagVal.size() << endl;
|
finish = clock();
|
||||||
|
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
/* 去除没有摘要的文章 */
|
/* 去除没有摘要的文章 */
|
||||||
|
begin = clock();
|
||||||
const string abstractTag = "AB";
|
const string abstractTag = "AB";
|
||||||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
||||||
if ((*itr)[abstractTag].size() == 0) {
|
if ((*itr)[abstractTag].size() == 0) {
|
||||||
|
|
@ -169,8 +207,11 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
itr++;
|
itr++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
finish = clock();
|
||||||
|
cout << "remove no AB Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
/* 根据PMID,去除冗余 */
|
/* 根据PMID,去除冗余 */
|
||||||
|
begin = clock();
|
||||||
unordered_map<string, int> umPMID;
|
unordered_map<string, int> umPMID;
|
||||||
const string pmidTag = "PMID";
|
const string pmidTag = "PMID";
|
||||||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); ) {
|
||||||
|
|
@ -183,16 +224,25 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
itr++;
|
itr++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
finish = clock();
|
||||||
|
cout << "remove duplication Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
/* 将title和abstract合并,赋值给abstract */
|
/* 将title和abstract合并,赋值给abstract */
|
||||||
|
begin = clock();
|
||||||
const string titleTag = "TI";
|
const string titleTag = "TI";
|
||||||
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) {
|
for (auto itr = vumPaperTagVal.begin(); itr != vumPaperTagVal.end(); itr++) {
|
||||||
string& abstractStr = (*itr)[abstractTag];
|
string& abstractStr = (*itr)[abstractTag];
|
||||||
abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大
|
abstractStr = (*itr)[titleTag] + " " + abstractStr; // 可能会有性能损失,不过影响不大
|
||||||
}
|
}
|
||||||
|
finish = clock();
|
||||||
|
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
|
// 关闭txt文件
|
||||||
ifsPubmedTxt.close();
|
ifsPubmedTxt.close();
|
||||||
|
|
||||||
/* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */
|
/* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */
|
||||||
|
begin = clock();
|
||||||
SavePubmed(argv[3], vTgName, vumPaperTagVal);
|
SavePubmed(argv[3], vTgName, vumPaperTagVal);
|
||||||
|
finish = clock();
|
||||||
|
cout << "write to MAT Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
}
|
}
|
||||||
19
GMM/main.cpp
19
GMM/main.cpp
|
|
@ -30,8 +30,9 @@
|
||||||
#endif
|
#endif
|
||||||
#include <mat.h>
|
#include <mat.h>
|
||||||
#include "gmm.h"
|
#include "gmm.h"
|
||||||
#include "CommonLib/thread_pool.h"
|
// #include "CommonLib/thread_pool.h"
|
||||||
#include "CommonLib/matlab_io.h"
|
#include "CommonLib/matlab_io.h"
|
||||||
|
#include "CommonLib/kthread.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using std::cout;
|
using std::cout;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
@ -144,7 +145,9 @@ struct ThreadParam {
|
||||||
fs::path matFilePath;
|
fs::path matFilePath;
|
||||||
fs::path outFilePath;
|
fs::path outFilePath;
|
||||||
};
|
};
|
||||||
void ThreadProcessData(const ThreadParam& param) {
|
//void ThreadProcessData(vector<ThreadParam>& vTP, long idx, int tid) {
|
||||||
|
void ThreadProcessData(ThreadParam& param) {
|
||||||
|
//const ThreadParam& param = vTP[idx];
|
||||||
const fs::path& matFilePath = param.matFilePath;
|
const fs::path& matFilePath = param.matFilePath;
|
||||||
const fs::path& outFilePath = param.outFilePath;
|
const fs::path& outFilePath = param.outFilePath;
|
||||||
double* hs = nullptr;
|
double* hs = nullptr;
|
||||||
|
|
@ -192,10 +195,10 @@ int main(int argc, const char** argv) {
|
||||||
int numThread = 1;
|
int numThread = 1;
|
||||||
if (argc >= 4) numThread = atoi(argv[4]);
|
if (argc >= 4) numThread = atoi(argv[4]);
|
||||||
if (numThread < 1) numThread = 1;
|
if (numThread < 1) numThread = 1;
|
||||||
ThreadPool thPool(numThread);
|
//ThreadPool thPool(numThread);
|
||||||
clock_t begin, finish;
|
clock_t begin, finish;
|
||||||
begin = clock();
|
begin = clock();
|
||||||
|
vector<ThreadParam> vTP;
|
||||||
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
/* 遍历所有的知识颗粒目录,逐一进行处理 */
|
||||||
for (auto& childDir : fs::directory_iterator(parrentDir)) {
|
for (auto& childDir : fs::directory_iterator(parrentDir)) {
|
||||||
fs::path outFilePath = childDir / outFileName;
|
fs::path outFilePath = childDir / outFileName;
|
||||||
|
|
@ -203,12 +206,14 @@ int main(int argc, const char** argv) {
|
||||||
const string& fileName = file.path().filename().string();
|
const string& fileName = file.path().filename().string();
|
||||||
auto rPos = fileName.rfind(hsMatSuffix);
|
auto rPos = fileName.rfind(hsMatSuffix);
|
||||||
if (rPos != string::npos && fileName.size() - rPos == hsMatSuffix.size()) {
|
if (rPos != string::npos && fileName.size() - rPos == hsMatSuffix.size()) {
|
||||||
ThreadParam tParam = { file, outFilePath };
|
//ThreadParam tParam = { file, outFilePath };
|
||||||
thPool.enqueue(ThreadProcessData, tParam);
|
//thPool.enqueue(ThreadProcessData, tParam);
|
||||||
|
vTP.push_back({ file, outFilePath });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
thPool.~ThreadPool();
|
kt_for(numThread, ThreadProcessData, vTP);
|
||||||
|
//thPool.~ThreadPool();
|
||||||
finish = clock();
|
finish = clock();
|
||||||
cout << "GMM Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "GMM Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue