diff --git a/CppRun/process_pubmed_txt.cpp b/CppRun/process_pubmed_txt.cpp index 2f62c14..3f780dd 100644 --- a/CppRun/process_pubmed_txt.cpp +++ b/CppRun/process_pubmed_txt.cpp @@ -12,13 +12,16 @@ #include #include #include +#include #include #include "common.h" #include "CommonLib/thread_pool.h" #include "CommonLib/matlab_io.h" #include "CommonLib/kthread.h" +namespace fs = std::filesystem; +using std::cout; +using std::vector; using namespace std; - /* 将结果写入mat文件 */ /* 将数据写入mat文件中,用给定的名称命名 */ bool SavePubmed(const string& matPath, @@ -71,19 +74,17 @@ bool SavePubmed(const string& matPath, return true; } /* 处理一篇文章 */ -struct ThreadParam { // 线程参数 +struct ThreadParamPubmed { // 线程参数 unordered_map *pumTagContent; - vector* pvLineTag; - vector* pvTgName; + vector *pvLineTag; + vector *pvTgName; int paperStartIdx; int paperEndIdx; - unordered_map* pumFullTagToTag; - vector* pvStrPubmedTxt; + unordered_map *pumFullTagToTag; + vector *pvStrPubmedTxt; }; -//void ThreadProcessArticle(vector& vTP, long idx, int tid) { -void ThreadProcessArticle(ThreadParam& param) { - //ThreadParam& param = vTP[idx]; +void ThreadProcessArticle(ThreadParamPubmed& param) { unordered_map& umTagContent = *param.pumTagContent; vector& vLineTag = *param.pvLineTag; vector& vTgName = *param.pvTgName; @@ -108,7 +109,7 @@ void ThreadProcessArticle(ThreadParam& param) { } // 命令行参数示例 -// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\negatives\pubmed-multiplesc-set.txt d:\pubmed_txt.mat 12 +// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\pubmed_files d:\pubmed_txt.mat 12 /* pubmed txt文件中包含多个文章的摘要信息,每个信息最前边有一个tag,每个tag对应的信息可能有一行,也可能多行,每个文章中间由一个空行隔开 1. 读取预先提取的pubmed tags, 并将tags中的'-'和' '字符去掉,只留下纯字符串做tag @@ -117,10 +118,10 @@ void ThreadProcessArticle(ThreadParam& param) { 4. 将结果写入mat文件 */ void ProcessPubmedTxt(int argc, const char** argv) { - // argv 1.pubmed tag.mat文件; 2.pubmed article.txt文件; 3.pubmed out.mat输出文件 + // argv 1.pubmed tag.mat文件; 2.pubmed txt文件父目录; 3.pubmed out.mat输出文件; 4.Thread number // if (argc < 4) { - cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed article.txt; 3. pubmed out.mat; [4. thread num])!" << endl; + cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed txt parent dir; 3. pubmed out.mat; [4. thread num])!" << endl; return; } clock_t begin, finish; @@ -128,7 +129,7 @@ void ProcessPubmedTxt(int argc, const char** argv) { vector vTg; vector vTgName; vector > vumPaperTagVal; - unordered_map umFullTagToTag; // 完整tag与tag的映射,如“PMID- ”:“PMID” + unordered_map umFullTagToTag; // 完整tag与tag的映射,如“PMID- ”:“PMID” /* 读取pubmed tags */ ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum); /* 1. 去掉tags里的'-'和' '字符,得到纯净的tag */ @@ -148,7 +149,8 @@ void ProcessPubmedTxt(int argc, const char** argv) { cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; /* 2. 读取pubmed txt文件,先读入后处理 */ - ifstream ifsPubmedTxt(argv[2]); + string parentDir(argv[2]); + string txtSuffix(".txt"); vector vStrPubmedTxt; vector vLineTag; vector vPaperStartIdx; @@ -158,40 +160,53 @@ void ProcessPubmedTxt(int argc, const char** argv) { int curPos = 0; vPaperStartIdx.push_back(curPos); // 添加初始索引 const int FULL_TAG_LEN = 5; - begin = clock(); - while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符 - while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格 - if (strLine.size() == 0) { // 新的paper - vPaperStartIdx.push_back(curPos); - continue; - } - fullTag = strLine.substr(0, 5); - if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的 - string& lastTagConteng = vStrPubmedTxt.back(); - lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格 - } - else { - vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN)); - vLineTag.push_back(fullTag); - curPos++; + begin = clock(); + for (auto &file : fs::directory_iterator(parentDir)) { // 遍历目录里的每一个txt文件 + const string &fileName = file.path().filename().string(); + auto rPos = fileName.rfind(txtSuffix); + if (rPos != string::npos && fileName.size() - rPos == txtSuffix.size()){ + ifstream ifsPubmedTxt(file.path().string()); + while (getline(ifsPubmedTxt, strLine)) { // 读取内容时候去掉了行尾的换行符 + while (strLine.back() == ' ') strLine.pop_back(); // 去掉行尾的空格 + if (strLine.size() == 0) { // 新的paper + vPaperStartIdx.push_back(curPos); + continue; + } + fullTag = strLine.substr(0, 5); + if (fullTag == blankTag) { // 这一行的内容还是属于上一个tag的 + string& lastTagConteng = vStrPubmedTxt.back(); + lastTagConteng.append(strLine.substr(FULL_TAG_LEN)); // 最前边包含了一个空格 + } + else { + vStrPubmedTxt.push_back(strLine.substr(FULL_TAG_LEN)); + vLineTag.push_back(fullTag); + curPos++; + } + } + vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置 + ifsPubmedTxt.close(); } } - vPaperStartIdx.push_back(curPos); // 比文章多1,最后一个记录结束位置 + finish = clock(); cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; - + cout << "paper num: " << vPaperStartIdx.size() - 1 << endl; /* 处理每一篇文章 */ int numThread = 1; if (argc >= 5) numThread = atoi(argv[4]); if (numThread < 1) numThread = 1; - ThreadPool thPool(numThread); + // ThreadPool thPool(numThread); vumPaperTagVal.resize(vPaperStartIdx.size()-1); - vector vT; - vector vTP(vPaperStartIdx.size() - 1); + vector vTP(vumPaperTagVal.size()); begin = clock(); - for (int i = 0; i < vTP.size(); ++i) { + for (int i = 0; i < vumPaperTagVal.size(); ++i) { vTP[i] = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt }; + // ThreadParamPubmed tp = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt }; + // ThreadProcessArticle(tp); + // thPool.enqueue(ThreadProcessArticle, tp); } + // thPool.~ThreadPool(); + kt_for(numThread, ThreadProcessArticle, vTP); finish = clock(); cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; @@ -237,9 +252,6 @@ void ProcessPubmedTxt(int argc, const char** argv) { finish = clock(); cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl; - // 关闭txt文件 - ifsPubmedTxt.close(); - /* 将处理后的数据写入mat文件,mat中的变量名称分别为Tx和abs1 */ begin = clock(); SavePubmed(argv[3], vTgName, vumPaperTagVal);