twirls/CppRun/process_pubmed_txt.cpp

259 lines
9.3 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*********************************************************************************************
Description: 处理pubmed生成的txt文件将处理结果放到mat文件中
Copyright : All right reserved by ZheYuan.BJ
Author : Zhang Zhonghai
Date : 2023/09/18
***********************************************************************************************/
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <algorithm>
#include <unordered_map>
#include <filesystem>
#include <mat.h>
#include "common.h"
#include "CommonLib/thread_pool.h"
#include "CommonLib/matlab_io.h"
#include "CommonLib/kthread.h"
namespace fs = std::filesystem;
using std::cout;
using std::vector;
using namespace std;
/* 将结果写入mat文件 */
/* 将数据写入mat文件中用给定的名称命名 */
bool SavePubmed(const string& matPath,
const vector<string>& vTgName,
vector<unordered_map<string, string> >& vumPaperTagVal)
{
MATFile* pMatFile = matOpen(matPath.c_str(), "w"); //打开.mat文件
if (pMatFile == nullptr) {
cout << "filePath is error! " << matPath << endl;
return false;
}
vector<const char*> vTgChars;
for (auto& strTg : vTgName) {
vTgChars.push_back(strTg.c_str());
}
// 创建结构体数据
mxArray* mxStruct = mxCreateStructMatrix(1, 1, (int)vTgName.size(), vTgChars.data());
// 创建cell matrix
unordered_map<string, mxArray*> ummxCellMtx;
for (auto & tgName : vTgName) {
ummxCellMtx[tgName] = mxCreateCellMatrix(1, vumPaperTagVal.size());
}
// 遍历每一篇文章
for (int i = 0; i < vumPaperTagVal.size(); ++i) {
auto& umTagVal = vumPaperTagVal[i];
// 遍历文章的每一个tag
for (auto& tgName : vTgName) {
mxArray* mxStr = mxCreateString(umTagVal[tgName].c_str());
mxArray* pMxArr = ummxCellMtx[tgName];
mxSetCell(pMxArr, i, mxStr);
}
}
// 将cell matrix赋值给struct matrix
for (auto& tgName : vTgName) {
mxArray* pMxArr = ummxCellMtx[tgName];
mxSetField(mxStruct, 0, tgName.c_str(), pMxArr);
}
// 将结构体写入mat并命名为Tx
matPutVariable(pMatFile, "Tx", mxStruct);
// 将abstract信息写入mat并命名为abs1
matPutVariable(pMatFile, "abs1", ummxCellMtx["AB"]);
matClose(pMatFile);
return true;
}
/*
 * Parameters handed to ThreadProcessArticle for one article.
 * The member order must not change: callers fill the struct with positional
 * aggregate initialization. All pointers are non-owning.
 */
struct ThreadParamPubmed {
    std::unordered_map<std::string, std::string> *pumTagContent = nullptr;   // output: tag name -> merged content of this article
    std::vector<std::string> *pvLineTag = nullptr;                           // tag prefix of every logical line
    std::vector<std::string> *pvTgName = nullptr;                            // all tag names
    int paperStartIdx = 0;                                                   // first line index of the article (inclusive)
    int paperEndIdx = 0;                                                     // last line index of the article (exclusive)
    std::unordered_map<std::string, std::string> *pumFullTagToTag = nullptr; // tag prefix as found in the txt -> tag name
    std::vector<std::string> *pvStrPubmedTxt = nullptr;                      // content of every logical line
};
/*
 * Collect the content of one article, grouped by tag.
 * Every tag name in *param.pvTgName first gets an empty entry in
 * *param.pumTagContent; then the lines [paperStartIdx, paperEndIdx) are scanned
 * and the content of each line whose tag prefix is known is appended to the
 * entry of the corresponding tag name.
 */
void ThreadProcessArticle(ThreadParamPubmed& param) {
    auto& contentByTag = *param.pumTagContent;
    auto& tagNames = *param.pvTgName;
    auto& lineTags = *param.pvLineTag;
    auto& prefixToName = *param.pumFullTagToTag;
    auto& lineContents = *param.pvStrPubmedTxt;

    // Start every tag of this article with a fresh, empty string.
    for (auto& name : tagNames) {
        contentByTag[name] = "";
    }

    // Walk over the lines of this article and merge them by tag.
    const int lastLine = param.paperEndIdx;
    for (int line = param.paperStartIdx; line < lastLine; ++line) {
        const auto hit = prefixToName.find(lineTags[line]);
        if (hit == prefixToName.end()) {
            continue; // unknown tag prefix, the line is ignored
        }
        contentByTag[hit->second].append(lineContents[line]);
    }
}
// Example command line:
// ProcessPubmedTxt d:\Twirls\gat1\literatures\pubmed_tag.mat D:\Twirls\runtime\pubmed_files d:\pubmed_txt.mat 12
/*
A pubmed txt file holds the records of many articles. Every piece of information
starts with a tag, the information of one tag may span one or several lines, and
two articles are separated by a blank line.
1. Read the pre-extracted pubmed tags and strip the '-' and ' ' characters so that
   only the plain tag name is left.
2. Read the pubmed txt files, split every article by tag and merge all content that
   belongs to the same tag of the same article.
3. Remove articles without abstract and duplicated articles, then prepend the title
   to the abstract.
4. Write the result to a mat file.
*/
void ProcessPubmedTxt(int argc, const char** argv) {
    // argv: 1. pubmed tag .mat file; 2. parent directory of the pubmed txt files;
    //       3. output .mat file; 4. (optional) thread number
    if (argc < 4) {
        cout << "This program should take at least 3 arguments(1.pubmed tag.mat; 2. pubmed txt parent dir; 3. pubmed out.mat; [4. thread num])!" << endl;
        return;
    }
    clock_t begin, finish;
    int rowNum = 0, colNum = 0;
    vector<string> vTg;
    vector<string> vTgName;
    vector<unordered_map<string, string> > vumPaperTagVal;
    unordered_map<string, string> umFullTagToTag; // tag prefix as stored in the tag file -> plain tag name

    /* Read the pubmed tags */
    ReadMtxString(argv[1], "tg", vTg, &rowNum, &colNum);

    /* 1. Strip '-' and ' ' from the tags to get the plain tag names */
    begin = clock();
    vTgName.reserve(vTg.size());
    for (const auto& fullTagName : vTg) {
        string name;
        for (const char c : fullTagName) {
            if (c != ' ' && c != '-') name.push_back(c);
        }
        umFullTagToTag[fullTagName] = name;
        vTgName.push_back(name);
    }
    finish = clock();
    cout << "process tag Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

    /* 2. Read the pubmed txt files: load everything first, process afterwards */
    const string parentDir(argv[2]);
    const string txtSuffix(".txt");
    const size_t FULL_TAG_LEN = 5;
    const string blankTag(FULL_TAG_LEN, ' '); // a continuation line starts with FULL_TAG_LEN spaces
    vector<string> vStrPubmedTxt;             // content of every logical line
    vector<string> vLineTag;                  // tag prefix of every logical line
    vector<int> vPaperStartIdx;               // start index of every paper, plus one trailing end index
    string strLine;
    int curPos = 0;
    vPaperStartIdx.push_back(curPos); // initial index
    begin = clock();
    std::error_code ec;
    fs::directory_iterator dirItr(parentDir, ec);
    if (ec) {
        cout << "can not open directory " << parentDir << ": " << ec.message() << endl;
        return;
    }
    for (const auto& file : dirItr) { // visit every txt file of the directory
        const string fileName = file.path().filename().string();
        const auto rPos = fileName.rfind(txtSuffix);
        if (rPos == string::npos || fileName.size() - rPos != txtSuffix.size()) {
            continue; // not a *.txt file
        }
        ifstream ifsPubmedTxt(file.path());
        if (!ifsPubmedTxt) {
            cout << "can not open file " << fileName << endl;
            continue;
        }
        while (getline(ifsPubmedTxt, strLine)) { // getline already drops the trailing newline
            // Drop trailing spaces (and a '\r' left over from CRLF files); the emptiness
            // check is required because blank lines separate the papers.
            while (!strLine.empty() && (strLine.back() == ' ' || strLine.back() == '\r')) {
                strLine.pop_back();
            }
            if (strLine.empty()) { // a new paper starts
                vPaperStartIdx.push_back(curPos);
                continue;
            }
            const string fullTag = strLine.substr(0, FULL_TAG_LEN);
            // Lines shorter than the tag prefix carry no content.
            const string content = strLine.size() > FULL_TAG_LEN ? strLine.substr(FULL_TAG_LEN) : string();
            if (fullTag == blankTag) { // this line still belongs to the previous tag
                if (!vStrPubmedTxt.empty()) {
                    vStrPubmedTxt.back().append(content); // content starts with one space
                }
            }
            else {
                vStrPubmedTxt.push_back(content);
                vLineTag.push_back(fullTag);
                curPos++;
            }
        }
        vPaperStartIdx.push_back(curPos); // one more entry than papers: the last one marks the end
    }
    finish = clock();
    cout << "read txt Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
    cout << "paper num: " << vPaperStartIdx.size() - 1 << endl;

    /* Process every paper */
    int numThread = 1;
    if (argc >= 5) numThread = atoi(argv[4]);
    if (numThread < 1) numThread = 1;
    vumPaperTagVal.resize(vPaperStartIdx.size() - 1);
    begin = clock();
    {
        // The pool lives in its own scope: its destructor runs exactly once at the
        // closing brace. The previous code called the destructor explicitly to wait
        // for the queued tasks, which destroyed the object a second time at function exit.
        ThreadPool thPool(numThread);
        for (size_t i = 0; i < vumPaperTagVal.size(); ++i) {
            ThreadParamPubmed tp = { &vumPaperTagVal[i], &vLineTag, &vTgName, vPaperStartIdx[i], vPaperStartIdx[i + 1], &umFullTagToTag, &vStrPubmedTxt };
            thPool.enqueue(ThreadProcessArticle, tp);
        }
    }
    finish = clock();
    cout << "kt for Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

    /* Remove the papers without abstract */
    begin = clock();
    const string abstractTag = "AB";
    vumPaperTagVal.erase(
        std::remove_if(vumPaperTagVal.begin(), vumPaperTagVal.end(),
            [&abstractTag](const unordered_map<string, string>& paper) {
                const auto absItr = paper.find(abstractTag);
                return absItr == paper.end() || absItr->second.empty();
            }),
        vumPaperTagVal.end());
    finish = clock();
    cout << "remove no AB Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

    /* Remove duplicates by PMID, the first occurrence is kept */
    begin = clock();
    unordered_map<string, int> umPMID;
    const string pmidTag = "PMID";
    size_t keepNum = 0;
    for (size_t i = 0; i < vumPaperTagVal.size(); ++i) {
        if (umPMID.emplace(vumPaperTagVal[i][pmidTag], 1).second) { // PMID seen for the first time
            if (keepNum != i) vumPaperTagVal[keepNum] = std::move(vumPaperTagVal[i]);
            ++keepNum;
        }
    }
    vumPaperTagVal.resize(keepNum);
    finish = clock();
    cout << "remove duplication Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

    /* Prepend the title to the abstract and store the result as abstract */
    begin = clock();
    const string titleTag = "TI";
    for (auto& paper : vumPaperTagVal) {
        string& abstractStr = paper[abstractTag];
        abstractStr = paper[titleTag] + " " + abstractStr;
    }
    finish = clock();
    cout << "merge abs and title Total time : " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;

    /* Write the processed data to the mat file; the variables are named Tx and abs1 */
    begin = clock();
    if (!SavePubmed(argv[3], vTgName, vumPaperTagVal)) {
        cout << "failed to write result to " << argv[3] << endl;
    }
    finish = clock();
    cout << "write to MAT Total time: " << (double)(finish - begin) / CLOCKS_PER_SEC << " s" << endl;
}