diff --git a/.gitignore b/.gitignore index 5d381cc..33068a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,406 @@ +x64/ +eigen-3.4.0/ +*.debug + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files +*.ncb +*.aps + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# Visual Studio History (VSHistory) files +.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml + # ---> Python # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/GMM/GMM.vcxproj b/GMM/GMM.vcxproj new file mode 100644 index 0000000..42bdea4 --- /dev/null +++ b/GMM/GMM.vcxproj @@ -0,0 +1,136 @@ + + + + + true + true + true + true + 15.0 + {a2b67815-1235-4f7c-874d-4fccb3b0c738} + Win32Proj + GMM + 10.0 + 10.0.17134.0 + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + Application + v143 + v142 + v141 + v140 + Unicode + + + true + true + + + false + true + false + + + + + + + + + + + + + + + + NotUsing + pch.h + $(IntDir)pch.pch + _CONSOLE;WIN32_LEAN_AND_MEAN;WINRT_LEAN_AND_MEAN;%(PreprocessorDefinitions) + Level4 + %(AdditionalOptions) /permissive- /bigobj + + + + + Disabled + _DEBUG;%(PreprocessorDefinitions) + D:\matlab2023a\extern\include;D:\apps\Twirls\eigen-3.4.0;%(AdditionalIncludeDirectories) + + + Console + false + D:\matlab2023a\extern\lib\win64\microsoft;%(AdditionalLibraryDirectories) + libmat.lib;libmx.lib;libmex.lib;libeng.lib;%(AdditionalDependencies) + + + + + WIN32;%(PreprocessorDefinitions) + + + + + MaxSpeed + true + true + NDEBUG;%(PreprocessorDefinitions) + D:\matlab2023a\extern\include;D:\apps\Twirls\eigen-3.4.0;%(AdditionalIncludeDirectories) + + + Console + true + true + false + D:\matlab2023a\extern\lib\win64\microsoft;%(AdditionalLibraryDirectories) + libmat.lib;libmx.lib;libmex.lib;libeng.lib;%(AdditionalDependencies) + + + + + + + + + + + + + + + + false + + + + + + + + + 这台计算机上缺少此项目引用的 NuGet 程序包。使用“NuGet 程序包还原”可下载这些程序包。有关更多信息,请参见 http://go.microsoft.com/fwlink/?LinkID=322105。缺少的文件是 {0}。 + + + + + \ No newline at end of file diff --git a/GMM/GMM.vcxproj.filters b/GMM/GMM.vcxproj.filters new file mode 100644 index 0000000..c29d2e2 --- /dev/null +++ b/GMM/GMM.vcxproj.filters @@ -0,0 +1,43 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + + + + + + + + \ No newline at end of file diff --git a/GMM/PropertySheet.props b/GMM/PropertySheet.props new file mode 100644 index 0000000..b0c6226 --- /dev/null +++ b/GMM/PropertySheet.props @@ -0,0 +1,16 @@ + + + + + + + + \ No newline at end of file diff --git a/GMM/gmm.cpp b/GMM/gmm.cpp new file mode 100644 index 0000000..1d6300c --- /dev/null +++ b/GMM/gmm.cpp @@ -0,0 +1,620 @@ +/*************************************************************************** + +Module Name: + + Gaussian Mixture Model with Diagonal Covariance Matrix + +History: + + 2003/11/01 Fei Wang + 2013 luxiaoxun +***************************************************************************/ +#include +#include +#include +#include "gmm.h" +#include "kmeans.h" +using namespace std; + +GMM::GMM(int dimNum, int mixNum) +{ + m_dimNum = dimNum; + m_mixNum = mixNum; + + m_maxIterNum = 100; + m_endError = 0.001; + + Allocate(); + + for (int i = 0; i < m_mixNum; i++) + { + m_priors[i] = 1.0 / m_mixNum; + + for (int d = 0; d < m_dimNum; d++) + { + m_means[i][d] = 0; + m_vars[i][d] = 1; + } + } +} + +GMM::~GMM() +{ + Dispose(); +} + +void GMM::Allocate() +{ + m_priors = new double[m_mixNum]; + m_means = new double* [m_mixNum]; + m_vars = new double* [m_mixNum]; + + for (int i = 0; i < m_mixNum; i++) + { + m_means[i] = new double[m_dimNum]; + m_vars[i] = new double[m_dimNum]; + } + + m_minVars = new double[m_dimNum]; +} + +void GMM::Dispose() +{ + delete[] m_priors; + + for (int i = 0; i < m_mixNum; i++) + { + delete[] m_means[i]; + delete[] m_vars[i]; + } + delete[] m_means; + delete[] m_vars; + + delete[] m_minVars; +} + +void GMM::Copy(GMM* gmm) +{ + assert(m_mixNum == gmm->m_mixNum && m_dimNum == gmm->m_dimNum); + + for (int i = 0; i < m_mixNum; i++) + { + m_priors[i] = gmm->Prior(i); + memcpy(m_means[i], gmm->Mean(i), sizeof(double) * m_dimNum); + memcpy(m_vars[i], gmm->Variance(i), sizeof(double) * m_dimNum); + } + memcpy(m_minVars, gmm->m_minVars, sizeof(double) * m_dimNum); +} + +double GMM::GetProbability(const double* sample) +{ + double p = 0; + for (int i = 0; i < m_mixNum; i++) + { + p += m_priors[i] * GetProbability(sample, i); + } + return p; +} + +double GMM::GetProbability(const double* x, int j) +{ + double p = 1; + for (int d = 0; d < m_dimNum; d++) + { + p *= 1 / sqrt(2 * M_PI * m_vars[j][d]); + p *= exp(-0.5 * (x[d] - m_means[j][d]) * (x[d] - m_means[j][d]) / m_vars[j][d]); + } + return p; +} + +void GMM::Train(const char* sampleFileName) +{ + //DumpSampleFile(sampleFileName); + Init(sampleFileName); + + ifstream sampleFile(sampleFileName, ios_base::binary); + assert(sampleFile); + + int size = 0; + sampleFile.seekg(0, ios_base::beg); + sampleFile.read((char*)&size, sizeof(int)); + + // Reestimation + bool loop = true; + double iterNum = 0; + double lastL = 0; + double currL = 0; + int unchanged = 0; + double* x = new double[m_dimNum]; // Sample data + double* next_priors = new double[m_mixNum]; + double** next_vars = new double* [m_mixNum]; + double** next_means = new double* [m_mixNum]; + + for (int i = 0; i < m_mixNum; i++) + { + next_means[i] = new double[m_dimNum]; + next_vars[i] = new double[m_dimNum]; + } + + while (loop) + { + // Clear buffer for reestimation + memset(next_priors, 0, sizeof(double) * m_mixNum); + for (int i = 0; i < m_mixNum; i++) + { + memset(next_vars[i], 0, sizeof(double) * m_dimNum); + memset(next_means[i], 0, sizeof(double) * m_dimNum); + } + + lastL = currL; + currL = 0; + + // Predict + sampleFile.seekg(2 * sizeof(int), ios_base::beg); + for (int k = 0; k < size; k++) + { + sampleFile.read((char*)x, sizeof(double) * m_dimNum); + double p = GetProbability(x); + + for (int j = 0; j < m_mixNum; j++) + { + double pj = GetProbability(x, j) * m_priors[j] / p; + next_priors[j] += pj; + for (int d = 0; d < m_dimNum; d++) + { + next_means[j][d] += pj * x[d]; + next_vars[j][d] += pj * x[d] * x[d]; + } + } + currL += (p > 1E-20) ? log10(p) : -20; + } + currL /= size; + + // Reestimation: generate new priors, means and variances. + for (int j = 0; j < m_mixNum; j++) + { + m_priors[j] = next_priors[j] / size; + + if (m_priors[j] > 0) + { + for (int d = 0; d < m_dimNum; d++) + { + m_means[j][d] = next_means[j][d] / next_priors[j]; + m_vars[j][d] = next_vars[j][d] / next_priors[j] - m_means[j][d] * m_means[j][d]; + if (m_vars[j][d] < m_minVars[d]) + { + m_vars[j][d] = m_minVars[d]; + } + } + } + } + + // Terminal conditions + iterNum++; + if (fabs(currL - lastL) < m_endError * fabs(lastL)) + { + unchanged++; + } + if (iterNum >= m_maxIterNum || unchanged >= 3) + { + loop = false; + } + //--- Debug --- + //cout << "Iter: " << iterNum << ", Average Log-Probability: " << currL << endl; + } + + sampleFile.close(); + delete[] next_priors; + for (int i = 0; i < m_mixNum; i++) + { + delete[] next_means[i]; + delete[] next_vars[i]; + } + delete[] next_means; + delete[] next_vars; + delete[] x; +} + +void GMM::Train(double* data, int N) +{ + Init(data, N); + + int size = N; + + // Reestimation + bool loop = true; + double iterNum = 0; + double lastL = 0; + double currL = 0; + int unchanged = 0; + double* x = new double[m_dimNum]; // Sample data + double* next_priors = new double[m_mixNum]; + double** next_vars = new double* [m_mixNum]; + double** next_means = new double* [m_mixNum]; + + for (int i = 0; i < m_mixNum; i++) + { + next_means[i] = new double[m_dimNum]; + next_vars[i] = new double[m_dimNum]; + } + + while (loop) + { + // Clear buffer for reestimation + memset(next_priors, 0, sizeof(double) * m_mixNum); + for (int i = 0; i < m_mixNum; i++) + { + memset(next_vars[i], 0, sizeof(double) * m_dimNum); + memset(next_means[i], 0, sizeof(double) * m_dimNum); + } + + lastL = currL; + currL = 0; + + // Predict + for (int k = 0; k < size; k++) + { + for (int j = 0;j < m_dimNum;j++) + x[j] = data[k * m_dimNum + j]; + double p = GetProbability(x); + + for (int j = 0; j < m_mixNum; j++) + { + double pj = GetProbability(x, j) * m_priors[j] / p; + + next_priors[j] += pj; + + for (int d = 0; d < m_dimNum; d++) + { + next_means[j][d] += pj * x[d]; + next_vars[j][d] += pj * x[d] * x[d]; + } + } + + currL += (p > 1E-20) ? log10(p) : -20; + } + currL /= size; + + // Reestimation: generate new priors, means and variances. + for (int j = 0; j < m_mixNum; j++) + { + m_priors[j] = next_priors[j] / size; + + if (m_priors[j] > 0) + { + for (int d = 0; d < m_dimNum; d++) + { + m_means[j][d] = next_means[j][d] / next_priors[j]; + m_vars[j][d] = next_vars[j][d] / next_priors[j] - m_means[j][d] * m_means[j][d]; + if (m_vars[j][d] < m_minVars[d]) + { + m_vars[j][d] = m_minVars[d]; + } + } + } + } + + // Terminal conditions + iterNum++; + if (fabs(currL - lastL) < m_endError * fabs(lastL)) + { + unchanged++; + } + if (iterNum >= m_maxIterNum || unchanged >= 3) + { + loop = false; + } + + //--- Debug --- + //cout << "Iter: " << iterNum << ", Average Log-Probability: " << currL << endl; + } + delete[] next_priors; + for (int i = 0; i < m_mixNum; i++) + { + delete[] next_means[i]; + delete[] next_vars[i]; + } + delete[] next_means; + delete[] next_vars; + delete[] x; +} + +void GMM::Init(double* data, int N) +{ + const double MIN_VAR = 1E-10; + + KMeans* kmeans = new KMeans(m_dimNum, m_mixNum); + kmeans->SetInitMode(KMeans::InitUniform); + int* Label; + Label = new int[N]; + kmeans->Cluster(data, N, Label); + + int* counts = new int[m_mixNum]; + double* overMeans = new double[m_dimNum]; // Overall mean of training data + for (int i = 0; i < m_mixNum; i++) + { + counts[i] = 0; + m_priors[i] = 0; + memcpy(m_means[i], kmeans->GetMean(i), sizeof(double) * m_dimNum); + memset(m_vars[i], 0, sizeof(double) * m_dimNum); + } + memset(overMeans, 0, sizeof(double) * m_dimNum); + memset(m_minVars, 0, sizeof(double) * m_dimNum); + + int size = 0; + size = N; + + double* x = new double[m_dimNum]; + int label = -1; + + for (int i = 0; i < size; i++) + { + for (int j = 0;j < m_dimNum;j++) + x[j] = data[i * m_dimNum + j]; + label = Label[i]; + + // Count each Gaussian + counts[label]++; + double* m = kmeans->GetMean(label); + for (int d = 0; d < m_dimNum; d++) + { + m_vars[label][d] += (x[d] - m[d]) * (x[d] - m[d]); + } + + // Count the overall mean and variance. + for (int d = 0; d < m_dimNum; d++) + { + overMeans[d] += x[d]; + m_minVars[d] += x[d] * x[d]; + } + } + + // Compute the overall variance (* 0.01) as the minimum variance. + for (int d = 0; d < m_dimNum; d++) + { + overMeans[d] /= size; + m_minVars[d] = max(MIN_VAR, 0.01 * (m_minVars[d] / size - overMeans[d] * overMeans[d])); + } + + // Initialize each Gaussian. + for (int i = 0; i < m_mixNum; i++) + { + m_priors[i] = 1.0 * counts[i] / size; + + if (m_priors[i] > 0) + { + for (int d = 0; d < m_dimNum; d++) + { + m_vars[i][d] = m_vars[i][d] / counts[i]; + + // A minimum variance for each dimension is required. + if (m_vars[i][d] < m_minVars[d]) + { + m_vars[i][d] = m_minVars[d]; + } + } + } + else + { + memcpy(m_vars[i], m_minVars, sizeof(double) * m_dimNum); + cout << "[WARNING] Gaussian " << i << " of GMM is not used!\n"; + } + } + delete kmeans; + delete[] x; + delete[] counts; + delete[] overMeans; + delete[] Label; + +} + +void GMM::Init(const char* sampleFileName) +{ + const double MIN_VAR = 1E-10; + + KMeans* kmeans = new KMeans(m_dimNum, m_mixNum); + kmeans->SetInitMode(KMeans::InitUniform); + kmeans->Cluster(sampleFileName, "gmm_init.tmp"); + + int* counts = new int[m_mixNum]; + double* overMeans = new double[m_dimNum]; // Overall mean of training data + for (int i = 0; i < m_mixNum; i++) + { + counts[i] = 0; + m_priors[i] = 0; + memcpy(m_means[i], kmeans->GetMean(i), sizeof(double) * m_dimNum); + memset(m_vars[i], 0, sizeof(double) * m_dimNum); + } + memset(overMeans, 0, sizeof(double) * m_dimNum); + memset(m_minVars, 0, sizeof(double) * m_dimNum); + + // Open the sample and label file to initialize the model + ifstream sampleFile(sampleFileName, ios_base::binary); + assert(sampleFile); + + ifstream labelFile("gmm_init.tmp", ios_base::binary); + assert(labelFile); + + int size = 0; + sampleFile.read((char*)&size, sizeof(int)); + sampleFile.seekg(2 * sizeof(int), ios_base::beg); + labelFile.seekg(sizeof(int), ios_base::beg); + + double* x = new double[m_dimNum]; + int label = -1; + + for (int i = 0; i < size; i++) + { + sampleFile.read((char*)x, sizeof(double) * m_dimNum); + labelFile.read((char*)&label, sizeof(int)); + + // Count each Gaussian + counts[label]++; + double* m = kmeans->GetMean(label); + for (int d = 0; d < m_dimNum; d++) + { + m_vars[label][d] += (x[d] - m[d]) * (x[d] - m[d]); + } + + // Count the overall mean and variance. + for (int d = 0; d < m_dimNum; d++) + { + overMeans[d] += x[d]; + m_minVars[d] += x[d] * x[d]; + } + } + + // Compute the overall variance (* 0.01) as the minimum variance. + for (int d = 0; d < m_dimNum; d++) + { + overMeans[d] /= size; + m_minVars[d] = max(MIN_VAR, 0.01 * (m_minVars[d] / size - overMeans[d] * overMeans[d])); + } + + // Initialize each Gaussian. + for (int i = 0; i < m_mixNum; i++) + { + m_priors[i] = 1.0 * counts[i] / size; + + if (m_priors[i] > 0) + { + for (int d = 0; d < m_dimNum; d++) + { + m_vars[i][d] = m_vars[i][d] / counts[i]; + + // A minimum variance for each dimension is required. + if (m_vars[i][d] < m_minVars[d]) + { + m_vars[i][d] = m_minVars[d]; + } + } + } + else + { + memcpy(m_vars[i], m_minVars, sizeof(double) * m_dimNum); + cout << "[WARNING] Gaussian " << i << " of GMM is not used!\n"; + } + } + + delete kmeans; + delete[] x; + delete[] counts; + delete[] overMeans; + + sampleFile.close(); + labelFile.close(); +} + +void GMM::DumpSampleFile(const char* fileName) +{ + ifstream sampleFile(fileName, ios_base::binary); + assert(sampleFile); + + int size = 0; + sampleFile.read((char*)&size, sizeof(int)); + cout << size << endl; + + int dim = 0; + sampleFile.read((char*)&dim, sizeof(int)); + cout << dim << endl; + + double* f = new double[dim]; + for (int i = 0; i < size; i++) + { + sampleFile.read((char*)f, sizeof(double) * dim); + + cout << i << ":"; + for (int j = 0; j < dim; j++) + { + cout << " " << f[j]; + } + cout << endl; + } + + delete[] f; + sampleFile.close(); +} + +ostream& operator<<(ostream& out, GMM& gmm) +{ + out << "" << endl; + out << " " << gmm.m_dimNum << " " << endl; + out << " " << gmm.m_mixNum << " " << endl; + + out << " "; + for (int i = 0; i < gmm.m_mixNum; i++) + { + out << gmm.m_priors[i] << " "; + } + out << "" << endl; + + out << "" << endl; + for (int i = 0; i < gmm.m_mixNum; i++) + { + for (int d = 0; d < gmm.m_dimNum; d++) + { + out << gmm.m_means[i][d] << " "; + } + out << endl; + } + out << "" << endl; + + out << "" << endl; + for (int i = 0; i < gmm.m_mixNum; i++) + { + for (int d = 0; d < gmm.m_dimNum; d++) + { + out << gmm.m_vars[i][d] << " "; + } + out << endl; + } + out << "" << endl; + + out << "" << endl; + + return out; +} + +istream& operator>>(istream& in, GMM& gmm) +{ + char label[50]; + in >> label; // "" + assert(strcmp(label, "") == 0); + + gmm.Dispose(); + + in >> label >> gmm.m_dimNum >> label; // "" + in >> label >> gmm.m_mixNum >> label; // "" + + gmm.Allocate(); + + in >> label; // "" + for (int i = 0; i < gmm.m_mixNum; i++) + { + in >> gmm.m_priors[i]; + } + in >> label; + + in >> label; // "" + for (int i = 0; i < gmm.m_mixNum; i++) + { + for (int d = 0; d < gmm.m_dimNum; d++) + { + in >> gmm.m_means[i][d]; + } + } + in >> label; + + in >> label; // "" + for (int i = 0; i < gmm.m_mixNum; i++) + { + for (int d = 0; d < gmm.m_dimNum; d++) + { + in >> gmm.m_vars[i][d]; + } + } + in >> label; + + in >> label; // "" + return in; +} \ No newline at end of file diff --git a/GMM/gmm.h b/GMM/gmm.h new file mode 100644 index 0000000..3e2f329 --- /dev/null +++ b/GMM/gmm.h @@ -0,0 +1,68 @@ +/*************************************************************************** +Module Name: + Gaussian Mixture Model with Diagonal Covariance Matrix + +History: + 2003/11/01 Fei Wang + 2013 luxiaoxun +***************************************************************************/ + +#pragma once +#include +#define M_PI 3.1415926535897932384626433832795 + +class GMM +{ +public: + GMM(int dimNum = 1, int mixNum = 1); + ~GMM(); + + void Copy(GMM* gmm); + + void SetMaxIterNum(int i) { m_maxIterNum = i; } + void SetEndError(double f) { m_endError = f; } + + int GetDimNum() { return m_dimNum; } + int GetMixNum() { return m_mixNum; } + int GetMaxIterNum() { return m_maxIterNum; } + double GetEndError() { return m_endError; } + + double& Prior(int i) { return m_priors[i]; } + double* Mean(int i) { return m_means[i]; } + double* Variance(int i) { return m_vars[i]; } + + void setPrior(int i, double val) { m_priors[i] = val; } + void setMean(int i, double* val) { for (int j = 0;j < m_dimNum;j++) m_means[i][j] = val[j]; } + void setVariance(int i, double* val) { for (int j = 0;j < m_dimNum;j++) m_vars[i][j] = val[j]; } + + double GetProbability(const double* sample); + + /* SampleFile: ...*/ + void Init(const char* sampleFileName); + void Train(const char* sampleFileName); + void Init(double* data, int N); + void Train(double* data, int N); + + void DumpSampleFile(const char* fileName); + + friend std::ostream& operator<<(std::ostream& out, GMM& gmm); + friend std::istream& operator>>(std::istream& in, GMM& gmm); + +private: + int m_dimNum; // 样本维数 + int m_mixNum; // Gaussian数目 + double* m_priors; // Gaussian权重 + double** m_means; // Gaussian均值 + double** m_vars; // Gaussian方差 + + // A minimum variance is required. Now, it is the overall variance * 0.01. + double* m_minVars; + int m_maxIterNum; // The stopping criterion regarding the number of iterations + double m_endError; // The stopping criterion regarding the error + +private: + // Return the "j"th pdf, p(x|j). + double GetProbability(const double* x, int j); + void Allocate(); + void Dispose(); +}; \ No newline at end of file diff --git a/GMM/kmeans.cpp b/GMM/kmeans.cpp new file mode 100644 index 0000000..f284c42 --- /dev/null +++ b/GMM/kmeans.cpp @@ -0,0 +1,389 @@ +/*************************************************************************** +Module Name: + KMeans + +History: + 2003/10/16 Fei Wang + 2013 luxiaoxun +***************************************************************************/ +#include +#include +#include +#include +#include +#include "KMeans.h" +using namespace std; + + +KMeans::KMeans(int dimNum, int clusterNum) +{ + m_dimNum = dimNum; + m_clusterNum = clusterNum; + + m_means = new double* [m_clusterNum]; + for (int i = 0; i < m_clusterNum; i++) + { + m_means[i] = new double[m_dimNum]; + memset(m_means[i], 0, sizeof(double) * m_dimNum); + } + + m_initMode = InitRandom; + m_maxIterNum = 100; + m_endError = 0.001; +} + +KMeans::~KMeans() +{ + for (int i = 0; i < m_clusterNum; i++) + { + delete[] m_means[i]; + } + delete[] m_means; +} + +void KMeans::Cluster(const char* sampleFileName, const char* labelFileName) +{ + // Check the sample file + ifstream sampleFile(sampleFileName, ios_base::binary); + assert(sampleFile); + + int size = 0; + int dim = 0; + sampleFile.read((char*)&size, sizeof(int)); + sampleFile.read((char*)&dim, sizeof(int)); + assert(size >= m_clusterNum); + assert(dim == m_dimNum); + + // Initialize model + Init(sampleFile); + + // Recursion + double* x = new double[m_dimNum]; // Sample data + int label = -1; // Class index + double iterNum = 0; + double lastCost = 0; + double currCost = 0; + int unchanged = 0; + bool loop = true; + int* counts = new int[m_clusterNum]; + double** next_means = new double* [m_clusterNum]; // New model for reestimation + for (int i = 0; i < m_clusterNum; i++) + { + next_means[i] = new double[m_dimNum]; + } + + while (loop) + { + memset(counts, 0, sizeof(int) * m_clusterNum); + for (int i = 0; i < m_clusterNum; i++) + { + memset(next_means[i], 0, sizeof(double) * m_dimNum); + } + + lastCost = currCost; + currCost = 0; + + sampleFile.clear(); + sampleFile.seekg(sizeof(int) * 2, ios_base::beg); + + // Classification + for (int i = 0; i < size; i++) + { + sampleFile.read((char*)x, sizeof(double) * m_dimNum); + currCost += GetLabel(x, &label); + + counts[label]++; + for (int d = 0; d < m_dimNum; d++) + { + next_means[label][d] += x[d]; + } + } + currCost /= size; + + // Reestimation + for (int i = 0; i < m_clusterNum; i++) + { + if (counts[i] > 0) + { + for (int d = 0; d < m_dimNum; d++) + { + next_means[i][d] /= counts[i]; + } + memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum); + } + } + + // Terminal conditions + iterNum++; + if (fabs(lastCost - currCost) < m_endError * lastCost) + { + unchanged++; + } + if (iterNum >= m_maxIterNum || unchanged >= 3) + { + loop = false; + } + //DEBUG + //cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl; + } + + // Output the label file + ofstream labelFile(labelFileName, ios_base::binary); + assert(labelFile); + + labelFile.write((char*)&size, sizeof(int)); + sampleFile.clear(); + sampleFile.seekg(sizeof(int) * 2, ios_base::beg); + + for (int i = 0; i < size; i++) + { + sampleFile.read((char*)x, sizeof(double) * m_dimNum); + GetLabel(x, &label); + labelFile.write((char*)&label, sizeof(int)); + } + + sampleFile.close(); + labelFile.close(); + + delete[] counts; + delete[] x; + for (int i = 0; i < m_clusterNum; i++) + { + delete[] next_means[i]; + } + delete[] next_means; +} + +// +void KMeans::Cluster(double* data, int N, int* Label) +{ + int size = 0; + size = N; + + assert(size >= m_clusterNum); + + // Initialize model + Init(data, N); + + // Recursion + double* x = new double[m_dimNum]; // Sample data + int label = -1; // Class index + double iterNum = 0; + double lastCost = 0; + double currCost = 0; + int unchanged = 0; + bool loop = true; + int* counts = new int[m_clusterNum]; + double** next_means = new double* [m_clusterNum]; // New model for reestimation + for (int i = 0; i < m_clusterNum; i++) + { + next_means[i] = new double[m_dimNum]; + } + + while (loop) + { + memset(counts, 0, sizeof(int) * m_clusterNum); + for (int i = 0; i < m_clusterNum; i++) + { + memset(next_means[i], 0, sizeof(double) * m_dimNum); + } + + lastCost = currCost; + currCost = 0; + + // Classification + for (int i = 0; i < size; i++) + { + for (int j = 0; j < m_dimNum; j++) + x[j] = data[i * m_dimNum + j]; + + currCost += GetLabel(x, &label); + + counts[label]++; + for (int d = 0; d < m_dimNum; d++) + { + next_means[label][d] += x[d]; + } + } + currCost /= size; + + // Reestimation + for (int i = 0; i < m_clusterNum; i++) + { + if (counts[i] > 0) + { + for (int d = 0; d < m_dimNum; d++) + { + next_means[i][d] /= counts[i]; + } + memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum); + } + } + + // Terminal conditions + iterNum++; + if (fabs(lastCost - currCost) < m_endError * lastCost) + { + unchanged++; + } + if (iterNum >= m_maxIterNum || unchanged >= 3) + { + loop = false; + } + + //DEBUG + //cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl; + } + + // Output the label file + for (int i = 0; i < size; i++) + { + for (int j = 0; j < m_dimNum; j++) + x[j] = data[i * m_dimNum + j]; + GetLabel(x, &label); + Label[i] = label; + } + delete[] counts; + delete[] x; + for (int i = 0; i < m_clusterNum; i++) + { + delete[] next_means[i]; + } + delete[] next_means; +} + +void KMeans::Init(double* data, int N) +{ + int size = N; + + if (m_initMode == InitRandom) + { + int inteval = size / m_clusterNum; + double* sample = new double[m_dimNum]; + + // Seed the random-number generator with current time + srand((unsigned)time(NULL)); + + for (int i = 0; i < m_clusterNum; i++) + { + int select = inteval * i + (inteval - 1) * rand() / RAND_MAX; + for (int j = 0; j < m_dimNum; j++) + sample[j] = data[select * m_dimNum + j]; + memcpy(m_means[i], sample, sizeof(double) * m_dimNum); + } + + delete[] sample; + } + else if (m_initMode == InitUniform) + { + double* sample = new double[m_dimNum]; + + for (int i = 0; i < m_clusterNum; i++) + { + int select = i * size / m_clusterNum; + for (int j = 0; j < m_dimNum; j++) + sample[j] = data[select * m_dimNum + j]; + memcpy(m_means[i], sample, sizeof(double) * m_dimNum); + } + + delete[] sample; + } + else if (m_initMode == InitManual) + { + // Do nothing + } +} + +void KMeans::Init(ifstream& sampleFile) +{ + int size = 0; + sampleFile.seekg(0, ios_base::beg); + sampleFile.read((char*)&size, sizeof(int)); + + if (m_initMode == InitRandom) + { + int inteval = size / m_clusterNum; + double* sample = new double[m_dimNum]; + + // Seed the random-number generator with current time + srand((unsigned)time(NULL)); + + for (int i = 0; i < m_clusterNum; i++) + { + int select = inteval * i + (inteval - 1) * rand() / RAND_MAX; + int offset = sizeof(int) * 2 + select * sizeof(double) * m_dimNum; + + sampleFile.seekg(offset, ios_base::beg); + sampleFile.read((char*)sample, sizeof(double) * m_dimNum); + memcpy(m_means[i], sample, sizeof(double) * m_dimNum); + } + + delete[] sample; + } + else if (m_initMode == InitUniform) + { + double* sample = new double[m_dimNum]; + + for (int i = 0; i < m_clusterNum; i++) + { + int select = i * size / m_clusterNum; + int offset = sizeof(int) * 2 + select * sizeof(double) * m_dimNum; + + sampleFile.seekg(offset, ios_base::beg); + sampleFile.read((char*)sample, sizeof(double) * m_dimNum); + memcpy(m_means[i], sample, sizeof(double) * m_dimNum); + } + + delete[] sample; + } + else if (m_initMode == InitManual) + { + // Do nothing + } +} + +double KMeans::GetLabel(const double* sample, int* label) +{ + double dist = -1; + for (int i = 0; i < m_clusterNum; i++) + { + double temp = CalcDistance(sample, m_means[i], m_dimNum); + if (temp < dist || dist == -1) + { + dist = temp; + *label = i; + } + } + return dist; +} + +double KMeans::CalcDistance(const double* x, const double* u, int dimNum) +{ + double temp = 0; + for (int d = 0; d < dimNum; d++) + { + temp += (x[d] - u[d]) * (x[d] - u[d]); + } + return sqrt(temp); +} + +ostream& operator<<(ostream& out, KMeans& kmeans) +{ + out << "" << endl; + out << " " << kmeans.m_dimNum << " " << endl; + out << " " << kmeans.m_clusterNum << " " << endl; + + out << "" << endl; + for (int i = 0; i < kmeans.m_clusterNum; i++) + { + for (int d = 0; d < kmeans.m_dimNum; d++) + { + out << kmeans.m_means[i][d] << " "; + } + out << endl; + } + out << "" << endl; + + out << "" << endl; + return out; +} \ No newline at end of file diff --git a/GMM/kmeans.h b/GMM/kmeans.h new file mode 100644 index 0000000..2687808 --- /dev/null +++ b/GMM/kmeans.h @@ -0,0 +1,57 @@ +/*************************************************************************** +Module Name: + KMeans + +History: + 2003/10/16 Fei Wang + 2013 luxiaoxun +***************************************************************************/ + +#pragma once +#include + +class KMeans +{ +public: + enum InitMode + { + InitRandom, + InitManual, + InitUniform, + }; + + KMeans(int dimNum = 1, int clusterNum = 1); + ~KMeans(); + + void SetMean(int i, const double* u) { memcpy(m_means[i], u, sizeof(double) * m_dimNum); } + void SetInitMode(int i) { m_initMode = i; } + void SetMaxIterNum(int i) { m_maxIterNum = i; } + void SetEndError(double f) { m_endError = f; } + + double* GetMean(int i) { return m_means[i]; } + int GetInitMode() { return m_initMode; } + int GetMaxIterNum() { return m_maxIterNum; } + double GetEndError() { return m_endError; } + + + /* SampleFile: ... + LabelFile: