解决了一个很诡异的bug:可能是由于ThreadParam的命名问题,导致vector内存错误
This commit is contained in:
parent
0c73318fb7
commit
215e1d3dea
|
|
@ -12,13 +12,16 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
#include <filesystem>
|
||||||
#include <mat.h>
|
#include <mat.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "CommonLib/thread_pool.h"
|
#include "CommonLib/thread_pool.h"
|
||||||
#include "CommonLib/matlab_io.h"
|
#include "CommonLib/matlab_io.h"
|
||||||
#include "CommonLib/kthread.h"
|
#include "CommonLib/kthread.h"
|
||||||
|
namespace fs = std::filesystem;
|
||||||
|
using std::cout;
|
||||||
|
using std::vector;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
/* 将结果写入mat文件 */
|
/* 将结果写入mat文件 */
|
||||||
/* 将数据写入mat文件中,用给定的名称命名 */
|
/* 将数据写入mat文件中,用给定的名称命名 */
|
||||||
bool SavePubmed(const string& matPath,
|
bool SavePubmed(const string& matPath,
|
||||||
|
|
@ -71,19 +74,17 @@ bool SavePubmed(const string& matPath,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
/* 处理一篇文章 */
|
/* 处理一篇文章 */
|
||||||
struct ThreadParam { // 线程参数
|
struct ThreadParamPubmed { // 线程参数
|
||||||
unordered_map<string, string> *pumTagContent;
|
unordered_map<string, string> *pumTagContent;
|
||||||
vector<string>* pvLineTag;
|
vector<string> *pvLineTag;
|
||||||
vector<string>* pvTgName;
|
vector<string> *pvTgName;
|
||||||
int paperStartIdx;
|
int paperStartIdx;
|
||||||
int paperEndIdx;
|
int paperEndIdx;
|
||||||
unordered_map<string, string>* pumFullTagToTag;
|
unordered_map<string, string> *pumFullTagToTag;
|
||||||
vector<string>* pvStrPubmedTxt;
|
vector<string> *pvStrPubmedTxt;
|
||||||
};
|
};
|
||||||
|
|
||||||
//void ThreadProcessArticle(vector<ThreadParam>& vTP, long idx, int tid) {
|
void ThreadProcessArticle(ThreadParamPubmed& param) {
|
||||||
void ThreadProcessArticle(ThreadParam& param) {
|
|
||||||
//ThreadParam& param = vTP[idx];
|
|
||||||
unordered_map<string, string>& umTagContent = *param.pumTagContent;
|
unordered_map<string, string>& umTagContent = *param.pumTagContent;
|
||||||
vector<string>& vLineTag = *param.pvLineTag;
|
vector<string>& vLineTag = *param.pvLineTag;
|
||||||
vector<string>& vTgName = *param.pvTgName;
|
vector<string>& vTgName = *param.pvTgName;
|
||||||
|
|
@ -108,7 +109,7 @@ void ThreadProcessArticle(ThreadParam& param) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// 命令行参数示例
|
// 命令行参数示例
|
||||||
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12
|
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\pubmed_files d:\pubmed_txt.mat 12
|
||||||
/*
|
/*
|
||||||
pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开
|
pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开
|
||||||
1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag
|
1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag
|
||||||
|
|
@ -117,10 +118,10 @@ void ThreadProcessArticle(ThreadParam& param) {
|
||||||
4. 将结果写入mat文件
|
4. 将结果写入mat文件
|
||||||
*/
|
*/
|
||||||
void ProcessPubmedTxt(int argc, const char** argv) {
|
void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
// argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件
|
// argv 1.pubmed tag.mat文件; 2.pubmed txt文件父目录; 3.pubmed out.mat输出文件; 4.Thread number
|
||||||
//
|
//
|
||||||
if (argc < 4) {
|
if (argc < 4) {
|
||||||
cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl;
|
cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed txt parent dir; 3. pubmed out.mat; [4. thread num])!" << endl;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
clock_t begin, finish;
|
clock_t begin, finish;
|
||||||
|
|
@ -128,7 +129,7 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
vector<string> vTg;
|
vector<string> vTg;
|
||||||
vector<string> vTgName;
|
vector<string> vTgName;
|
||||||
vector<unordered_map<string, string> > vumPaperTagVal;
|
vector<unordered_map<string, string> > vumPaperTagVal;
|
||||||
unordered_map<string, string> umFullTagToTag; // 完整tag与tag的映射,如“PMID- ”:“PMID”
|
unordered_map<string, string> umFullTagToTag; // 完整tag与tag的映射,如“PMID- ”:“PMID”
|
||||||
/* 读取pubmed tags */
|
/* 读取pubmed tags */
|
||||||
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
|
ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);
|
||||||
/* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */
|
/* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */
|
||||||
|
|
@ -148,7 +149,8 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
/* 2. 读取pubmed txt文件,先读入后处理 */
|
/* 2. 读取pubmed txt文件,先读入后处理 */
|
||||||
ifstream ifsPubmedTxt(argv[2]);
|
string parentDir(argv[2]);
|
||||||
|
string txtSuffix(".txt");
|
||||||
vector<string> vStrPubmedTxt;
|
vector<string> vStrPubmedTxt;
|
||||||
vector<string> vLineTag;
|
vector<string> vLineTag;
|
||||||
vector<int> vPaperStartIdx;
|
vector<int> vPaperStartIdx;
|
||||||
|
|
@ -158,40 +160,53 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
int curPos = 0;
|
int curPos = 0;
|
||||||
vPaperStartIdx.push_back(curPos); // 添加初始索引
|
vPaperStartIdx.push_back(curPos); // 添加初始索引
|
||||||
const int FULL_TAG_LEN = 5;
|
const int FULL_TAG_LEN = 5;
|
||||||
begin = clock();
|
begin = clock();
|
||||||
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
|
for (auto &file : fs::directory_iterator(parentDir)) { // 遍历目录里的每一个txt文件
|
||||||
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
|
const string &fileName = file.path().filename().string();
|
||||||
if (strLine.size() == 0) { // 新的paper
|
auto rPos = fileName.rfind(txtSuffix);
|
||||||
vPaperStartIdx.push_back(curPos);
|
if (rPos != string::npos && fileName.size() - rPos == txtSuffix.size()){
|
||||||
continue;
|
ifstream ifsPubmedTxt(file.path().string());
|
||||||
}
|
while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符
|
||||||
fullTag = strLine.substr(0, 5);
|
while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格
|
||||||
if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的
|
if (strLine.size() == 0) { // 新的paper
|
||||||
string& lastTagConteng = vStrPubmedTxt.back();
|
vPaperStartIdx.push_back(curPos);
|
||||||
lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格
|
continue;
|
||||||
}
|
}
|
||||||
else {
|
fullTag = strLine.substr(0, 5);
|
||||||
vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN));
|
if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的
|
||||||
vLineTag.push_back(fullTag);
|
string& lastTagConteng = vStrPubmedTxt.back();
|
||||||
curPos++;
|
lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN));
|
||||||
|
vLineTag.push_back(fullTag);
|
||||||
|
curPos++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置
|
||||||
|
ifsPubmedTxt.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置
|
|
||||||
finish = clock();
|
finish = clock();
|
||||||
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
cout << "paper num: " << vPaperStartIdx.size() - 1 << endl;
|
||||||
/* 处理每一篇文章 */
|
/* 处理每一篇文章 */
|
||||||
int numThread = 1;
|
int numThread = 1;
|
||||||
if (argc >= 5) numThread = atoi(argv[4]);
|
if (argc >= 5) numThread = atoi(argv[4]);
|
||||||
if (numThread < 1) numThread = 1;
|
if (numThread < 1) numThread = 1;
|
||||||
ThreadPool thPool(numThread);
|
// ThreadPool thPool(numThread);
|
||||||
vumPaperTagVal.resize(vPaperStartIdx.size()-1);
|
vumPaperTagVal.resize(vPaperStartIdx.size()-1);
|
||||||
vector<thread> vT;
|
vector<ThreadParamPubmed> vTP(vumPaperTagVal.size());
|
||||||
vector<ThreadParam> vTP(vPaperStartIdx.size() - 1);
|
|
||||||
begin = clock();
|
begin = clock();
|
||||||
for (int i = 0; i < vTP.size(); ++i) {
|
for (int i = 0; i < vumPaperTagVal.size(); ++i) {
|
||||||
vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
|
vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
|
||||||
|
// ThreadParamPubmed tp = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
|
||||||
|
// ThreadProcessArticle(tp);
|
||||||
|
// thPool.enqueue(ThreadProcessArticle, tp);
|
||||||
}
|
}
|
||||||
|
// thPool.~ThreadPool();
|
||||||
|
|
||||||
kt_for(numThread, ThreadProcessArticle, vTP);
|
kt_for(numThread, ThreadProcessArticle, vTP);
|
||||||
finish = clock();
|
finish = clock();
|
||||||
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
@ -237,9 +252,6 @@ void ProcessPubmedTxt(int argc, const char** argv) {
|
||||||
finish = clock();
|
finish = clock();
|
||||||
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
|
||||||
|
|
||||||
// 关闭txt文件
|
|
||||||
ifsPubmedTxt.close();
|
|
||||||
|
|
||||||
/* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */
|
/* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */
|
||||||
begin = clock();
|
begin = clock();
|
||||||
SavePubmed(argv[3], vTgName, vumPaperTagVal);
|
SavePubmed(argv[3], vTgName, vumPaperTagVal);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue