Machine learning algorithms in C++
KMeans.hpp
Go to the documentation of this file.
1 
7 #ifndef MACHINE_LEARNING_KMEANS_HPP
8 #define MACHINE_LEARNING_KMEANS_HPP
9 
10 #include "../include/matrix/Matrix.hpp"
11 #include "Metrics.hpp"
12 #include "../include/mersenne_twister/MersenneTwister.hpp"
13 
17 class KMeans {
18  public:
20  private:
21  MatrixD X, y, centroids;
22  unsigned int k, totalIterations;
23  double distance, sse;
25 
29  double SSE() {
30  return Metrics::euclidean(X, centroids, false).sum();
31  }
32 
33  public:
34  KMeans() {}
35 
41  MatrixD predict(MatrixD data) {
42  if (centroids.nCols() != data.nCols())
43  throw invalid_argument("Data elements and cluster centroids don't have the same number of dimensions.");
44 
45  MatrixD distances = Metrics::minkowski(data, centroids, distance, false);
46  MatrixD results = MatrixD::zeros(data.nRows(), 1);
47 
48  for (size_t i = 0; i < distances.nRows(); i++) {
49  for (size_t j = 1; j < distances.nCols(); j++) {
50  double current_c = results(i, 0);
51  double current_d = distances(i, current_c);
52  double new_c = j;
53  double new_d = distances(i, new_c);
54  if (new_d < current_d)
55  results(i, 0) = new_c;
56  }
57  }
58 
59  return results;
60  }
61 
72  void fit(MatrixD data,
73  unsigned int k,
74  unsigned int iters = 100,
75  unsigned int inits = 100,
76  double distance = 2,
77  InitializationMethod initMethod = SAMPLE, bool verbose = false) {
78  this->X = data.standardize();
79  this->k = k;
80  this->initMethod = initMethod;
81  this->distance = distance;
82  this->totalIterations = 0;
83  MersenneTwister twister;
84  double minSSE;
85 
86  for (int currentInit = 0; currentInit < inits; currentInit++) {
87 
88  if (initMethod == RANDOM) {
89  centroids = MatrixD(k, X.nCols());
90  for (size_t i = 0; i < centroids.nRows(); i++)
91  for (size_t j = 0; j < centroids.nCols(); j++)
92  centroids(i, j) = twister.d_random(X.getColumn(j).min(),
93  X.getColumn(j).max());
94  } else {
95  vector<int> sample = twister.randomValues(X.nRows(), k, false);
96  centroids = MatrixD();
97  for (int i = 0; i < sample.size(); i++)
98  centroids.addRow(X.getRow(sample[i]));
99  }
100 
101  MatrixD yPrev;
102 
103  for (int currentIteration = 0; currentIteration < iters; currentIteration++) {
104  if (verbose)
105  cout << currentInit << '/' << inits + 1 << '\t'
106  << currentIteration << '/' << iters + 1 << '\t'
107  << SSE() << endl;
108 
109  MatrixD yCurr = predict(X); // assignment
110 
111  if (yCurr == yPrev)
112  break;
113 
114  yPrev = yCurr;
115  centroids = X.mean(yCurr); // centroid update
116  }
117  if (currentInit == 0 or SSE() < minSSE) {
118  minSSE = SSE();
119  y = yPrev;
120  }
121  }
122  this->sse = minSSE;
123  }
124 
125  const MatrixD &getY() const {
126  return y;
127  }
128 
129  const MatrixD &getCentroids() const {
130  return centroids;
131  }
132 
133  unsigned int getK() const {
134  return k;
135  }
136 
137  unsigned int getTotalIterations() const {
138  return totalIterations;
139  }
140 
141  double getDistance() const {
142  return distance;
143  }
144 
145  double getSse() const {
146  return sse;
147  }
148 
149 };
150 
151 #endif //MACHINE_LEARNING_KMEANS_HPP
void fit(MatrixD data, unsigned int k, unsigned int iters=100, unsigned int inits=100, double distance=2, InitializationMethod initMethod=SAMPLE, bool verbose=false)
Find the k centroids that best fit the data.
Definition: KMeans.hpp:72
static MatrixD minkowski(MatrixD m, double p, bool root=true)
Calculates the Minkowski distances between elements in a matrix.
Definition: Metrics.hpp:22
MatrixD y
Definition: KMeans.hpp:21
double getSse() const
Definition: KMeans.hpp:145
MatrixD predict(MatrixD data)
Assigns elements of a data set to clusters.
Definition: KMeans.hpp:41
InitializationMethod
Definition: KMeans.hpp:19
MatrixD X
Definition: KMeans.hpp:21
double sse
Definition: KMeans.hpp:23
unsigned int k
Definition: KMeans.hpp:22
static MatrixD euclidean(MatrixD m, bool root=true)
Calculates the Euclidean distances between elements in a matrix.
Definition: Metrics.hpp:61
MatrixD centroids
Definition: KMeans.hpp:21
Implementation of the k-means algorithm.
Definition: KMeans.hpp:17
double distance
Definition: KMeans.hpp:23
unsigned int getTotalIterations() const
Definition: KMeans.hpp:137
double getDistance() const
Definition: KMeans.hpp:141
unsigned int totalIterations
Definition: KMeans.hpp:22
double SSE()
Definition: KMeans.hpp:29
const MatrixD & getY() const
Definition: KMeans.hpp:125
unsigned int getK() const
Definition: KMeans.hpp:133
KMeans()
Definition: KMeans.hpp:34
const MatrixD & getCentroids() const
Definition: KMeans.hpp:129
InitializationMethod initMethod
Definition: KMeans.hpp:24