2023-09-21 15:24:14 +08:00
/*********************************************************************************************
Description : For every PubMed abstract, checks whether it contains the configured high-frequency words and, based on those words, computes the entropy information of the abstract.
Copyright : All right reserved by ZheYuan . BJ
Author : Zhang Zhonghai
Date : 2023 / 09 / 20
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
# include <iostream>
# include <fstream>
# include <sstream>
# include <filesystem>
# include <vector>
# include <string>
# include <unordered_map>
# include <unordered_set>
# include <algorithm>
# include <omp.h>
# include <time.h>
# include <vector>
# include <queue>
# include <thread>
# include <cctype>
# include <cmath>
# ifdef _WIN32
# include <io.h>
# include <process.h>
# define F_OK 0
# else
# include <unistd.h>
# endif
# include <mat.h>
# include "common.h"
# include "CommonLib/thread_pool.h"
# include "CommonLib/matlab_io.h"
2023-09-27 10:27:19 +08:00
# include "CommonLib/kthread.h"
2023-09-21 15:24:14 +08:00
using namespace std ;
using std : : cout ;
using std : : vector ;
namespace fs = std : : filesystem ;
# include "common.h"
# include "CommonLib/matlab_io.h"
using namespace std ;
// <20> <> <EFBFBD> <EFBFBD> ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ѭ<EFBFBD> <D1AD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD>
/*
 * FOREACH_PARTICLE_START / FOREACH_PARTICLE_END
 *
 * Opens (and closes) a loop nest that visits every file located one level
 * below `parrentDir` whose file name ends with `wordMatSuffix`.
 *
 * Names the expansion reads from the enclosing scope:
 *   parrentDir    - path of the parent directory to scan
 *   wordMatSuffix - std::string suffix the file name has to end with
 * Names the expansion makes available to the code between START and END:
 *   childDir - directory_entry of the current sub-directory
 *   file     - directory_entry of the current matching file
 *   fileName - file name (no directory part) of `file`
 *
 * Entries of `parrentDir` that are not directories are skipped; opening a
 * directory_iterator on a regular file would throw filesystem_error.
 */
#define FOREACH_PARTICLE_START \
    for (auto& childDir : std::filesystem::directory_iterator(parrentDir)) { \
        if (!childDir.is_directory()) continue; \
        for (auto& file : std::filesystem::directory_iterator(childDir)) { \
            const std::string& fileName = file.path().filename().string(); \
            auto rPos = fileName.rfind(wordMatSuffix); \
            if (rPos != std::string::npos && fileName.size() - rPos == wordMatSuffix.size()) {
#define FOREACH_PARTICLE_END \
            } \
        } \
    }
/* <20> <> ȡ<EFBFBD> <C8A1> <EFBFBD> <EFBFBD> cell<6C> <6C> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ַ<EFBFBD> <D6B7> <EFBFBD> ,<2C> <> <EFBFBD> <EFBFBD> ֵ<EFBFBD> <D6B5> ds,fr */
/*
 * OUTER_FOR_BEGIN / OUTER_FOR_END
 *
 * Opens (and closes) a row-by-row loop over the cells of the cell array
 * `pMxArray`. MATLAB stores elements column by column, hence the linear
 * index j * rowNum + i.
 *
 * Names the expansion needs in the enclosing scope:
 *   pMxArray (mxArray*), pCell (mxArray*), rowNum (int), colNum (int)
 * Names available to the code between BEGIN and END:
 *   i, j         - zero-based row / column of the current outer cell
 *   pCell        - the current cell; nullptr when the cell was never populated
 *   childRowNum,
 *   childColNum  - dimensions of pCell (both 0 when pCell is nullptr)
 *
 * A null or non-cell `pMxArray` yields zero iterations instead of handing an
 * invalid pointer to mxGetM / mxGetCell.
 *
 * OUTER_FOR_END passes `pMxArray` to mxDestroyArray, so the array must be one
 * the caller owns. The MATLAB documentation states that arrays returned by
 * mxGetField / mxGetCell must not be destroyed this way.
 */
#define OUTER_FOR_BEGIN \
    rowNum = (pMxArray != nullptr && mxIsCell(pMxArray)) ? static_cast<int>(mxGetM(pMxArray)) : 0; \
    colNum = (pMxArray != nullptr && mxIsCell(pMxArray)) ? static_cast<int>(mxGetN(pMxArray)) : 0; \
    for (int i = 0; i < rowNum; ++i) { \
        for (int j = 0; j < colNum; ++j) { \
            pCell = mxGetCell(pMxArray, j * rowNum + i); \
            int childRowNum = (pCell != nullptr) ? static_cast<int>(mxGetM(pCell)) : 0; \
            int childColNum = (pCell != nullptr) ? static_cast<int>(mxGetN(pCell)) : 0;
#define OUTER_FOR_END \
        } \
    } \
    mxDestroyArray(pMxArray);
/*
 * INNTER_FOR_BEGIN / INNTER_FOR_END
 *
 * Opens (and closes) a row-by-row loop over the elements of the cell array
 * `pCell`, whose dimensions are expected in `childRowNum` / `childColNum`.
 * Inside the loop `ii` / `jj` are the zero-based row / column and
 * `pChildCell` is the element fetched with the column-major linear index.
 */
#define INNTER_FOR_BEGIN \
    for (int ii = 0; ii < childRowNum; ++ii) { \
        for (int jj = 0; jj < childColNum; ++jj) { \
            mxArray* pChildCell = mxGetCell(pCell, ii + jj * childRowNum);
#define INNTER_FOR_END \
        } \
    }
// <20> <> matlab<61> 洢<EFBFBD> <E6B4A2> ʽ ת<CABD> <D7AA> <EFBFBD> <EFBFBD> c<EFBFBD> 洢<EFBFBD> <E6B4A2> ʽ
/*
 * TRANS_ROW_COL(dst, src, rowNum, colNum)
 *
 * Copies a rowNum x colNum matrix from column-major storage (`src`, the
 * MATLAB layout) into row-major storage (`dst`, the C layout). Both `dst`
 * and `src` only need to support operator[].
 *
 * Every macro argument is parenthesised so that expression arguments such as
 * `r + 1` keep their meaning inside the index arithmetic. The arguments are
 * evaluated on every iteration, so they should be free of side effects.
 */
#define TRANS_ROW_COL(dst, src, rowNum, colNum) \
    for (int rowI = 0; rowI < (rowNum); ++rowI) { \
        for (int colJ = 0; colJ < (colNum); ++colJ) { \
            (dst)[rowI * (colNum) + colJ] = (src)[colJ * (rowNum) + rowI]; \
        } \
    }
// <20> <> ȡds<64> <73> fr<66> <72> Ϣ
bool ReadInfoFromMat ( const string & filePath , vector < vector < string > > & vvDs , vector < vector < double > > & vvFr ) {
MATFile * pMatFile = nullptr ;
mxArray * pMxArray = nullptr ;
mxArray * pCell = nullptr ;
int rowNum , colNum ;
2023-10-05 10:38:21 +08:00
char * strBuf = new char [ STRING_BUF_SIZE ] ;
2023-09-21 15:24:14 +08:00
const string & parrentName = " G " ;
const string & firstChildName = " ds " ;
const string & secondChildName = " fr " ;
pMatFile = matOpen ( filePath . c_str ( ) , " r " ) ; //<2F> <> <EFBFBD> <EFBFBD> .mat<61> ļ<EFBFBD>
if ( pMatFile = = nullptr ) {
cout < < " filePath is error! " < < endl ;
return false ;
}
mxArray * pMxG = matGetVariable ( pMatFile , parrentName . c_str ( ) ) ; //<2F> <> ȡG<C8A1> <47> <EFBFBD> <EFBFBD>
// <20> <> ȡds<64> ַ<EFBFBD> <D6B7> <EFBFBD>
pMxArray = mxGetField ( pMxG , 0 , firstChildName . c_str ( ) ) ; // ds
OUTER_FOR_BEGIN
vvDs . push_back ( vector < string > ( ) ) ;
vvDs . back ( ) . resize ( childRowNum * childColNum ) ;
INNTER_FOR_BEGIN
if ( mxGetString ( pChildCell , strBuf , STRING_BUF_SIZE ) ! = 0 ) {
cout < < " String is too large to fit in the buffer! " < < i + 1 < < ' \t ' < < j + 1 < < endl ;
2023-10-05 10:38:21 +08:00
delete [ ] strBuf ;
2023-09-21 15:24:14 +08:00
return false ;
}
vvDs . back ( ) [ ii * childColNum + jj ] = strBuf ;
auto & lastStr = vvDs . back ( ) [ ii * childColNum + jj ] ;
transform ( lastStr . begin ( ) , lastStr . end ( ) , lastStr . begin ( ) , : : toupper ) ; // ת<> ɴ<EFBFBD> д
INNTER_FOR_END
OUTER_FOR_END
// <20> <> ȡfr<66> <72> ֵ
pMxArray = mxGetField ( pMxG , 0 , secondChildName . c_str ( ) ) ; // fr
OUTER_FOR_BEGIN
vvFr . push_back ( vector < double > ( ) ) ;
vvFr . back ( ) . resize ( childRowNum * childColNum ) ;
double * pVal = ( double * ) mxGetData ( pCell ) ; //<2F> <> ȡָ<C8A1> <D6B8>
TRANS_ROW_COL ( vvFr . back ( ) , pVal , childRowNum , childColNum ) ; // <20> <> <EFBFBD> д洢<D0B4> <E6B4A2> ʽ ת<CABD> <D7AA>
2023-10-05 10:38:21 +08:00
OUTER_FOR_END
2023-09-21 15:24:14 +08:00
2023-10-05 10:38:21 +08:00
// û<> <C3BB> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ȫ<EFBFBD> <C8AB> Щ<EFBFBD> <D0A9> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ҪmxDestroyArray<61> <79> <EFBFBD> <EFBFBD> <EFBFBD> ܻ<EFBFBD> <DCBB> <EFBFBD> <EFBFBD> ڴ<EFBFBD> й©
delete [ ] strBuf ;
2023-09-21 15:24:14 +08:00
return true ;
}
2023-09-22 00:51:34 +08:00
// <20> <> <EFBFBD> <EFBFBD> ά<EFBFBD> <CEAC> <EFBFBD> <EFBFBD> ת<EFBFBD> <D7AA> һ ά<D2BB> <CEAC> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD>
/*
 * Converts a (row, col) position of a row-major matrix that has `colNum`
 * columns into the matching index of the flat one-dimensional storage.
 */
inline int Get1DIndex(int colNum, int row, int col) {
    const int rowStart = row * colNum;
    return rowStart + col;
}
2023-09-21 15:24:14 +08:00
/* <20> <> <EFBFBD> <EFBFBD> һ <EFBFBD> <D2BB> ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> */
2023-09-27 15:17:40 +08:00
/*
 * Arguments of one ThreadProcessEntropy task.
 * The struct stays an aggregate, so `{ matFile, outFile, &wordSets }` still
 * initialises it.
 */
struct ThreadParamEntropy {
    std::filesystem::path matFilePath;   // MAT-file the ds / fr data is read from
    std::filesystem::path outFilePath;   // MAT-file the hs / hr results are written to
    // Per-abstract word sets; not owned by this struct. Defaults to nullptr so
    // that a default-constructed parameter never carries an indeterminate pointer.
    std::vector<std::unordered_set<std::string>>* pvusWord = nullptr;
};
2023-09-27 15:17:40 +08:00
/*
 * Processes one MAT-file: reads its ds / fr groups, evaluates every group
 * against every abstract and writes the two result matrices to
 * param.outFilePath.
 *
 *   hs (numGroup x numLiterature, stored row by row)
 *       sum of -p * log2(p) over the words of the group that occur in the
 *       abstract, where p is the word's fr value scaled so that the largest
 *       value of the group becomes 0.368.
 *   hr (numLiterature x numGroup, stored row by row)
 *       1 when the abstract contains every word whose fr value equals the
 *       maximum of the group, 0 otherwise.
 *
 * A group is skipped (its hs / hr entries stay 0) when it has no words, when
 * ds and fr differ in length, or when fr has no positive value. Nothing is
 * written when the input cannot be read or the output cannot be opened.
 *
 * param - input file, output file and the per-abstract sets of upper-cased
 *         words (see ThreadParamEntropy).
 */
void ThreadProcessEntropy(ThreadParamEntropy& param) {
    const fs::path& matFilePath = param.matFilePath;
    const fs::path& outFilePath = param.outFilePath;
    if (param.pvusWord == nullptr) {
        cout << "Abstract word sets are missing, skip " << matFilePath.string() << endl;
        return;
    }
    const vector<unordered_set<string>>& vusWord = *param.pvusWord;

    vector<vector<string>> vvDs;   // words of every group
    vector<vector<double>> vvFr;   // fr value paired with every word
    if (!ReadInfoFromMat(matFilePath.string(), vvDs, vvFr) || vvDs.size() != vvFr.size()) {
        cout << "Failed to read matching ds/fr groups from " << matFilePath.string() << endl;
        return;
    }

    const int numLiterature = static_cast<int>(vusWord.size());
    const int numGroup = static_cast<int>(vvDs.size());
    vector<double> hs(static_cast<size_t>(numGroup) * numLiterature, 0.0);
    vector<double> hr(static_cast<size_t>(numLiterature) * numGroup, 0.0);

    // fr values are scaled into (0, 0.368]. -x * log2(x) is increasing on
    // (0, 1/e], and 1/e is about 0.368, so a larger fr value always yields a
    // larger term. NOTE(review): presumably the reason for this constant.
    const double normalMax = 0.368;

    for (int groupIdx = 0; groupIdx < numGroup; ++groupIdx) {
        const vector<string>& vDs = vvDs[groupIdx];
        const vector<double>& vFr = vvFr[groupIdx];
        const size_t numWord = vDs.size();
        if (numWord == 0 || vFr.size() != numWord) {
            cout << "Skip group " << groupIdx + 1 << " of " << matFilePath.string()
                 << ": ds/fr are empty or differ in length" << endl;
            continue;
        }
        const double maxFr = *max_element(vFr.begin(), vFr.end());
        if (!(maxFr > 0.0)) {   // also rejects NaN; scaling would divide by zero
            cout << "Skip group " << groupIdx + 1 << " of " << matFilePath.string()
                 << ": fr has no positive value" << endl;
            continue;
        }

        // Per-word term p * log2(p), plus the positions holding the maximum.
        // The maximum is located on the raw values: (x * c) / x is not
        // guaranteed to reproduce c exactly, so comparing scaled values with
        // the constant could miss every position.
        vector<double> vTerm(numWord, 0.0);
        vector<size_t> vMaxPos;
        for (size_t j = 0; j < numWord; ++j) {
            if (vFr[j] == maxFr) vMaxPos.push_back(j);
            const double p = vFr[j] * normalMax / maxFr;
            if (p > 0.0) vTerm[j] = p * log2(p);   // p -> 0 contributes nothing
        }

        for (int i = 0; i < numLiterature; ++i) {
            const unordered_set<string>& words = vusWord[i];

            double entropy = 0.0;
            for (size_t j = 0; j < numWord; ++j) {
                if (words.find(vDs[j]) != words.end()) entropy -= vTerm[j];
            }
            hs[Get1DIndex(numLiterature, groupIdx, i)] = entropy;

            bool hasAllMaxWords = true;
            for (size_t k = 0; k < vMaxPos.size() && hasAllMaxWords; ++k) {
                hasAllMaxWords = words.find(vDs[vMaxPos[k]]) != words.end();
            }
            if (hasAllMaxWords) hr[Get1DIndex(numGroup, i, groupIdx)] = 1;
        }
    }

    // Store hs and hr in the output file.
    MATFile* pMatFile = matOpen(outFilePath.string().c_str(), "w");
    if (pMatFile == nullptr) {
        cout << "Failed to open output file " << outFilePath.string() << endl;
        return;
    }
    SaveMtxDouble(hs.data(), pMatFile, "hs", numGroup, numLiterature);
    SaveMtxDouble(hr.data(), pMatFile, "hr", numLiterature, numGroup);
    matClose(pMatFile);
}
/* <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> */
2023-09-22 00:51:34 +08:00
// <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ӣ<EFBFBD> CalcEntropy d:\Twirls\runtime\ALS_test abs2class.mat d:\Twirls\runtime\pubmed_files\pubmed-multiplesc-set.mat hx_info.mat 12 word.mat
2023-09-21 15:24:14 +08:00
void CalcEntropy ( int argc , const char * * argv ) {
// argv
// 1. ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> <EFBFBD> ĸ<EFBFBD> Ŀ¼<C4BF> <C2BC> <EFBFBD> <EFBFBD>
// 2. <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> Ƶ<EFBFBD> ʻ <EFBFBD> <CABB> <EFBFBD> Ϣ<EFBFBD> <CFA2> mat<61> ļ<EFBFBD> <C4BC> ĺ<EFBFBD>
// 3. <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> pubmed<65> <64> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> Ϣ<EFBFBD> <CFA2> mat<61> ļ<EFBFBD> ·<EFBFBD> <C2B7>
2023-09-27 10:27:19 +08:00
// 4. <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> mat<61> ļ<EFBFBD> <C4BC> <EFBFBD> <EFBFBD> <EFBFBD> ÿ<EFBFBD> <C3BF> ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> Ŀ¼<C4BF> <C2BC> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> һ <EFBFBD> <D2BB> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ļ<EFBFBD> <C4BC> <EFBFBD>
2023-09-21 15:24:14 +08:00
// 5. <20> ߳<EFBFBD> <DFB3> <EFBFBD> <EFBFBD> <EFBFBD> (<28> <> ѡ )
if ( argc < 5 ) {
2023-09-22 00:51:34 +08:00
cout < < " This program should take at least 4 arguments(1.parrent Dir; 2. mat file suffix; 3. pubmed mat file; 4. out mat filename; [5. thread number]; [6. word out mat filepath])! " < < endl ;
2023-09-21 15:24:14 +08:00
return ;
}
clock_t begin , finish ;
string parrentDir ( argv [ 1 ] ) ; // ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> <EFBFBD> ĸ<EFBFBD> Ŀ¼<C4BF> <C2BC> <EFBFBD> <EFBFBD>
string wordMatSuffix ( argv [ 2 ] ) ; // <20> <> Ƶ<EFBFBD> ʻ <EFBFBD> <CABB> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> Ӧ<EFBFBD> <D3A6> mat<61> ļ<EFBFBD> <C4BC> ĺ<EFBFBD> <EFBFBD> <D7BA> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ȫ<EFBFBD> ļ<EFBFBD> <C4BC> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ļ<EFBFBD> <C4BC> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <D7BA> <EFBFBD> <EFBFBD> <EFBFBD> 뱣֤Ψһ <CEA8> <D2BB>
2023-09-22 00:51:34 +08:00
fs : : path outFileName ( argv [ 4 ] ) ;
2023-09-21 15:24:14 +08:00
int numThread = 1 ;
2023-10-05 10:38:21 +08:00
if ( argc > 5 ) numThread = atoi ( argv [ 5 ] ) ;
2023-09-21 15:24:14 +08:00
if ( numThread < 1 ) numThread = 1 ;
/* <20> <> <EFBFBD> 봦<EFBFBD> <EBB4A6> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> pubmed<65> <64> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> Ϣ<EFBFBD> <CFA2> mat<61> ļ<EFBFBD> <C4BC> <EFBFBD> ֻ<EFBFBD> <D6BB> <EFBFBD> <EFBFBD> ժҪ<D5AA> <D2AA> Ϣ<EFBFBD> <CFA2> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> abs1 */
vector < string > vAbstract ;
int rowNum , colNum ;
ReadMtxString ( argv [ 3 ] , " abs1 " , vAbstract , & rowNum , & colNum ) ;
if ( vAbstract . size ( ) = = 0 ) { // ժҪ<D5AA> <D2AA> ϢΪ<CFA2> գ <EFBFBD> <D5A3> <EFBFBD> <EFBFBD> <EFBFBD>
cout < < " PubMed Abstract info is null! " < < endl ;
return ;
}
2023-09-22 00:51:34 +08:00
/* <20> <> ժҪ<D5AA> <D2AA> Ϣ<EFBFBD> ָ<EFBFBD> <D6B8> <EFBFBD> һ <EFBFBD> <D2BB> һ <EFBFBD> <D2BB> <EFBFBD> Ĵʻ <C4B4> */
2023-09-21 15:24:14 +08:00
begin = clock ( ) ;
unordered_set < char > usWordChars ; // <20> <> <EFBFBD> <EFBFBD> <EFBFBD> ɵ<EFBFBD> <C9B5> ʵ<EFBFBD> <CAB5> ַ<EFBFBD> <D6B7> <EFBFBD> Ҫ<EFBFBD> <D2AA> Ҫ<EFBFBD> <D2AA> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ֣<EFBFBD> ԭ<EFBFBD> <D4AD> matlab<61> <62> <EFBFBD> <EFBFBD> ȡ<EFBFBD> <C8A1> <EFBFBD> <EFBFBD> <EFBFBD> ֵ<EFBFBD>
for ( int i = 65 ; i < = 90 ; i + + ) usWordChars . insert ( char ( i ) ) ; // A - Z
for ( int i = 97 ; i < = 122 ; i + + ) usWordChars . insert ( char ( i ) ) ; // a - z
2023-09-22 00:51:34 +08:00
for ( int i = 48 ; i < = 57 ; i + + ) usWordChars . insert ( char ( i ) ) ; // 0 - 9
usWordChars . insert ( ' / ' ) ; usWordChars . insert ( ' + ' ) ; usWordChars . insert ( ' - ' ) ;
2023-09-21 15:24:14 +08:00
vector < vector < string > > vvWordMtx ( vAbstract . size ( ) ) ; // <20> <> ʼ <EFBFBD> <CABC> С Ϊ<D0A1> <CEAA> <EFBFBD> µĸ<C2B5> <C4B8> <EFBFBD>
vector < unordered_set < string > > vusAbsWord ( vAbstract . size ( ) ) ; // <20> <> ÿƪ<C3BF> <C6AA> <EFBFBD> <EFBFBD> ժҪ<D5AA> ĵ<EFBFBD> <C4B5> ʷ<EFBFBD> <CAB7> <EFBFBD> hash<73> <68>
for ( int i = 0 ; i < vAbstract . size ( ) ; i + + ) {
auto & strAbs = vAbstract [ i ] ;
// <20> <> <EFBFBD> <EFBFBD> ժҪ<D5AA> ַ<EFBFBD> <D6B7> <EFBFBD> <EFBFBD> <EFBFBD> ÿһ <C3BF> <D2BB> <EFBFBD> ַ<EFBFBD> <D6B7> <EFBFBD> ȡ<EFBFBD> <C8A1> ÿһ <C3BF> <D2BB> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD>
vector < string > & vWord = vvWordMtx [ i ] ;
if ( strAbs . size ( ) = = 0 ) continue ; // ժҪ<D5AA> <D2AA> ϢΪ<CFA2> գ <EFBFBD> <D5A3> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> һ <EFBFBD> 㲻<EFBFBD> <E3B2BB> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD>
int wordStartPos = 0 ;
while ( wordStartPos < strAbs . size ( ) & & usWordChars . find ( strAbs [ wordStartPos ] ) = = usWordChars . end ( ) )
wordStartPos + + ;
for ( int curPos = wordStartPos + 1 ; curPos < strAbs . size ( ) ; + + curPos ) {
if ( usWordChars . find ( strAbs [ curPos ] ) = = usWordChars . end ( ) ) { // <20> ҵ<EFBFBD> <D2B5> ˷ָ<CBB7> <D6B8> <EFBFBD>
vWord . push_back ( strAbs . substr ( wordStartPos , curPos - wordStartPos ) ) ;
wordStartPos = curPos + 1 ; // <20> <> <EFBFBD> <EFBFBD> һ <EFBFBD> <D2BB> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ʼ λ<CABC> <CEBB>
while ( wordStartPos < strAbs . size ( ) & & usWordChars . find ( strAbs [ wordStartPos ] ) = = usWordChars . end ( ) )
wordStartPos + + ;
curPos = wordStartPos ; // ѭ<> <D1AD> <EFBFBD> <EFBFBD> <EFBFBD> Զ<EFBFBD> <D4B6> <EFBFBD> 1
}
}
// <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> ժҪ֮<D2AA> <D6AE> <EFBFBD> <EFBFBD> ÿ<EFBFBD> <C3BF> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> hash<73> <68>
for ( auto & word : vWord ) {
string upWord ( word ) ;
transform ( upWord . begin ( ) , upWord . end ( ) , upWord . begin ( ) , : : toupper ) ;
vusAbsWord [ i ] . insert ( upWord ) ;
}
}
2023-09-27 10:27:19 +08:00
finish = clock ( ) ;
cout < < " read abstract time: " < < ( double ) ( finish - begin ) / CLOCKS_PER_SEC < < " s " < < endl ;
2023-09-22 00:51:34 +08:00
/* <20> <> <EFBFBD> ָ<EFBFBD> <D6B8> <EFBFBD> <EFBFBD> <EFBFBD> д<EFBFBD> <D0B4> mat<61> ļ<EFBFBD> */
2023-09-27 10:27:19 +08:00
begin = clock ( ) ;
2023-10-05 10:38:21 +08:00
if ( argc > 6 ) {
2023-09-22 00:51:34 +08:00
MATFile * pMatFile = matOpen ( argv [ 6 ] , " w " ) ;
mxArray * pCellMtx = mxCreateCellMatrix ( 1 , vvWordMtx . size ( ) ) ;
for ( int i = 0 ; i < vvWordMtx . size ( ) ; + + i ) {
mxArray * pChildCellMtx = mxCreateCellMatrix ( 1 , vvWordMtx [ i ] . size ( ) ) ;
for ( int j = 0 ; j < vvWordMtx [ i ] . size ( ) ; + + j ) {
mxArray * mxStr = mxCreateString ( vvWordMtx [ i ] [ j ] . c_str ( ) ) ;
mxSetCell ( pChildCellMtx , j , mxStr ) ;
}
mxSetCell ( pCellMtx , i , pChildCellMtx ) ;
}
2023-09-27 10:27:19 +08:00
matPutVariable ( pMatFile , " wd " , pCellMtx ) ;
2023-09-22 00:51:34 +08:00
matClose ( pMatFile ) ;
mxDestroyArray ( pCellMtx ) ;
}
2023-09-21 15:24:14 +08:00
finish = clock ( ) ;
2023-09-27 10:27:19 +08:00
cout < < " write abstract word time: " < < ( double ) ( finish - begin ) / CLOCKS_PER_SEC < < " s " < < endl ;
2023-09-21 15:24:14 +08:00
/* <20> <> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> е <EFBFBD> ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> Ŀ¼<C4BF> <C2BC> <EFBFBD> <EFBFBD> һ <EFBFBD> <D2BB> <EFBFBD> д<EFBFBD> <D0B4> <EFBFBD> */
begin = clock ( ) ;
2023-10-05 23:12:02 +08:00
ThreadPool thPool ( numThread ) ;
2023-09-21 15:24:14 +08:00
// <20> 鿴֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD>
int numKnowledgeParticle = 0 ;
FOREACH_PARTICLE_START
numKnowledgeParticle + + ;
FOREACH_PARTICLE_END
// <20> <> <EFBFBD> <EFBFBD> ÿ<EFBFBD> <C3BF> ֪ʶ<D6AA> <CAB6> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> <EFBFBD> һ <EFBFBD> <D2BB> <EFBFBD> д<EFBFBD> <D0B4> <EFBFBD>
2023-10-05 23:12:02 +08:00
// vector<ThreadParamEntropy> vTP;
FOREACH_PARTICLE_START
ThreadParamEntropy tParam = { file , childDir / outFileName , & vusAbsWord } ;
thPool . enqueue ( ThreadProcessEntropy , tParam ) ;
//vTP.push_back({ file, childDir / outFileName, &vusAbsWord });
FOREACH_PARTICLE_END
2023-09-21 15:24:14 +08:00
// synchronize
2023-10-05 23:12:02 +08:00
thPool . ~ ThreadPool ( ) ;
// kt_for(numThread, ThreadProcessEntropy, vTP);
2023-09-21 15:24:14 +08:00
finish = clock ( ) ;
cout < < " thread pool time: " < < ( double ) ( finish - begin ) / CLOCKS_PER_SEC < < " s " < < endl ;
}