Machine learning algorithms in C++
MLP.hpp
/**
 * MLP.hpp
 *
 * Multi-layer perceptron implementation.
 */
#ifndef MACHINE_LEARNING_MLP_HPP
#define MACHINE_LEARNING_MLP_HPP

#include <vector>
#include <chrono>
#include <iostream>
#include <cmath>
#include <functional>
#include <stdexcept>
#include <string>
#include "../include/matrix/Matrix.hpp"
#include "../include/mersenne_twister/MersenneTwister.hpp"
#include "Timer.hpp"

using namespace std;
using myClock = chrono::high_resolution_clock;

/**
 * Multi-layer perceptron.
 */
class MLP {
 private:
  MatrixD data, dataMean, dataDev, classes, originalClasses;
  vector<MatrixD> W;
  // activation function used by the network; stored by fit() so that
  // predict() applies the same nonlinearity the weights were trained with
  function<double(double)> activation = sigmoid;

  //region Activation functions

  /**
   * @return x squared
   */
  static double pow2(double x) {
    return x * x;
  }

  /**
   * Sigmoid (logistic) activation function.
   */
  static double sigmoid(double x) {
    return 1 / (1 + exp(-x));
  }

  /**
   * Derivative of the sigmoid function.
   */
  static double sigmoidDerivative(double x) {
    double z = sigmoid(x);
    return z * (1 - z);
  }

  /**
   * Hyperbolic tangent activation function.
   */
  static double tanh(double x) {
    return 2 * sigmoid(2 * x) - 1;
  }

  /**
   * Derivative of the hyperbolic tangent.
   */
  static double tanhDerivative(double x) {
    return 1 - pow2(tanh(x));
  }

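  // The identities used above: tanh(x) = 2*sigmoid(2x) - 1, which lets tanh
  // reuse the sigmoid already defined, and the derivatives follow from
  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)) and tanh'(x) = 1 - tanh(x)^2.
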
  //endregion

  /**
   * Initialize a matrix with values drawn from a normal distribution N(0; 1).
   * @param in  number of rows
   * @param out number of columns
   */
  static MatrixD initNormal(size_t in, size_t out) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromNormal(in * out));
    return result;
  }

  /**
   * Initialize a matrix with values drawn from a normal distribution N(mean; stddev).
   */
  static MatrixD initNormal(size_t in, size_t out, double mean, double stddev) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromNormal(in * out, mean, stddev));
    return result;
  }

  /**
   * Initialize a matrix with values drawn from the uniform distribution U(0; 1).
   */
  static MatrixD initUniform(size_t in, size_t out) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromUniform(in * out));
    return result;
  }

  /**
   * Initialize a matrix with values drawn from the uniform distribution U(min; max).
   */
  static MatrixD initUniform(size_t in, size_t out, double min, double max) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromUniform(in * out, min, max));
    return result;
  }

  /**
   * One-hot encode network outputs: in each row, the largest activation
   * becomes 1 and every other entry becomes 0.
   */
  static MatrixD binarize(MatrixD m) {
    for (size_t i = 0; i < m.nRows(); i++) {
      // find the column holding the largest value of this row
      size_t largest = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        if (m(i, j) > m(i, largest)) largest = j;

      for (size_t j = 0; j < m.nCols(); j++) {
        m(i, j) = j == largest;
      }
    }
    return m;
  }

  /**
   * Collapse network outputs into a single column containing, for each row,
   * the original class label whose activation was largest.
   */
  MatrixD summarize(MatrixD m) {
    MatrixD result(m.nRows(), 1);

    for (size_t i = 0; i < m.nRows(); i++) {
      size_t largest = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        if (m(i, j) > m(i, largest)) largest = j;

      result(i, 0) = originalClasses(largest, 0);
    }

    return result;
  }

  /**
   * Apply the softmax function to each row, turning raw activations into
   * a probability distribution over classes.
   */
  static MatrixD softmax(MatrixD m) {
    m = m.apply(static_cast<double (*)(double)>(exp));
    for (size_t i = 0; i < m.nRows(); i++) {
      double sum = 0;

      for (size_t j = 0; j < m.nCols(); j++)
        sum += m(i, j);

      for (size_t j = 0; j < m.nCols(); j++)
        m(i, j) /= sum;
    }
    return m;
  }
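  // Note: a numerically safer softmax would subtract each row's maximum
  // before exponentiating (softmax is invariant to that shift); with large
  // activations the plain exp() above can overflow.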

 public:

  enum ActivationFunction { SIGMOID, TANH };
  enum WeightInitialization { NORMAL, UNIFORM, GLOROT };
  enum OutputFormat { ACTIVATION, SOFTMAX, ONEHOT, SUMMARY };

  MLP() {
  }

  /**
   * Train a multilayer perceptron.
   * @param X data matrix, one example per row
   * @param y column vector of class labels
   * @param hiddenConfig number of neurons in each hidden layer
   * @param maxIters maximum number of training iterations
   * @param batchSize mini-batch size; 0 uses the whole data set in every iteration
   * @param learningRate gradient descent step size
   * @param errorThreshold training stops once the loss falls below this value
   * @param regularization L2 regularization coefficient
   * @param func activation function used by the network
   * @param weightInit distribution used to initialize the weights
   * @param adaptiveLR linearly decay the learning rate over the iterations
   * @param standardize standardize the data to zero mean and unit standard deviation
   * @param verbose print the loss during training
   */
  void fit(MatrixD X,
           MatrixD y,
           vector<size_t> hiddenConfig,
           unsigned int maxIters,
           size_t batchSize = 0,
           double learningRate = 0.01,
           double errorThreshold = 0.0001,
           double regularization = 0,
           ActivationFunction func = SIGMOID,
           WeightInitialization weightInit = UNIFORM,
           bool adaptiveLR = false,
           bool standardize = true,
           bool verbose = true) {
    size_t outputEncodingSize = y.unique().nRows();

    // number of weight layers. even if there are no hidden layers,
    // there is at least one layer of weights that needs to be fitted
    size_t nLayers = hiddenConfig.size() < 1 ? 1 : hiddenConfig.size() + 1;
    // initialize vector of weight matrices
    vector<MatrixD> w(nLayers);

    // initialize weights
    for (size_t i = 0; i < nLayers; i++) {
      size_t nIn, nOut;

      // number of inputs (+1 accounts for the bias weight)
      nIn = (i == 0 ? X : w[i - 1]).nCols() + 1;

      // number of outputs
      nOut = i == w.size() - 1 ? outputEncodingSize : hiddenConfig[i];

      // initialize layer with random values from the chosen distribution
      if (weightInit == UNIFORM) {
        w[i] = initUniform(nIn, nOut);
      } else if (weightInit == NORMAL) {
        w[i] = initNormal(nIn, nOut);
      } else if (weightInit == GLOROT) {
        // Glorot/Xavier initialization: U(-b; b) with b = sqrt(6 / (nIn + nOut))
        double bound = sqrt(6.0 / (nIn + nOut));
        w[i] = initUniform(nIn, nOut, -bound, bound);
      }
    }

    fit(X,
        y,
        w,
        maxIters,
        batchSize,
        learningRate,
        errorThreshold,
        regularization,
        func,
        adaptiveLR,
        standardize,
        verbose);
  }

  /**
   * Train a multilayer perceptron, supplying the initial weight matrices
   * explicitly instead of a hidden layer configuration.
   * @param X data matrix, one example per row
   * @param y column vector of class labels
   * @param hiddenLayers initial weight matrices, one per layer
   * @param maxIters maximum number of training iterations
   * @param batchSize mini-batch size; 0 uses the whole data set in every iteration
   * @param learningRate gradient descent step size
   * @param errorThreshold training stops once the loss falls below this value
   * @param regularization L2 regularization coefficient
   * @param func activation function used by the network
   * @param adaptiveLR linearly decay the learning rate over the iterations
   * @param standardize standardize the data to zero mean and unit standard deviation
   * @param verbose print the loss during training
   */
  void fit(MatrixD X,
           MatrixD y,
           vector<MatrixD> hiddenLayers,
           unsigned int maxIters,
           size_t batchSize = 0,
           double learningRate = 0.01,
           double errorThreshold = 0.0001,
           double regularization = 0,
           ActivationFunction func = SIGMOID,
           bool adaptiveLR = false,
           bool standardize = true,
           bool verbose = true) {
    // create one-hot encoding for classes
    classes = y.oneHot();
    originalClasses = y.unique();
    originalClasses.sort();
    size_t outputEncodingSize = classes.nCols();

    // validate the dimensions of the given weight matrices
    for (size_t i = 0; i < hiddenLayers.size(); ++i) {
      size_t correct_nIn = (i == 0 ? X : hiddenLayers[i - 1]).nCols() + 1,
          correct_nOut = i == hiddenLayers.size() - 1 ? outputEncodingSize : hiddenLayers[i + 1].nRows() - 1;
      if (hiddenLayers[i].nRows() != correct_nIn) {
        throw invalid_argument(
            "Weight matrix " + to_string(i) + " input (" + to_string(hiddenLayers[i].nRows()) + ") should be ("
                + to_string(correct_nIn) + ")");
      }
      if (hiddenLayers[i].nCols() != correct_nOut) {
        throw invalid_argument(
            "Weight matrix " + to_string(i) + " output (" + to_string(hiddenLayers[i].nCols()) + ") should be ("
                + to_string(correct_nOut) + ")");
      }
    }

    W = hiddenLayers;

    // number of weight layers. even if there are no hidden layers,
    // there is at least one layer of weights that needs to be fitted
    size_t nLayers = hiddenLayers.size();

    if (standardize) {
      // mean and stddev are stored so future predictions
      // can be standardized the same way as the training data
      dataMean = X.mean();
      dataDev = X.stdev();
      data = X.standardize(dataMean, dataDev);
    } else {
      data = X;
      dataMean = dataDev = MatrixD();
    }

    function<double(double)> activationFunction, activationDerivative;
    if (func == SIGMOID) {
      activationFunction = sigmoid;
      activationDerivative = sigmoidDerivative;
    } else {
      activationFunction = tanh;
      activationDerivative = tanhDerivative;
    }
    // remember the chosen activation so predict() applies the same one
    activation = activationFunction;

    double previousLoss = 0;
    Timer timer(1, maxIters);
    timer.start();
    // training iterations
    for (unsigned int iter = 0; iter < maxIters; iter++) {
      // matrices used in the forward pass
      // Z holds the outputs of each layer
      vector<MatrixD> Z(nLayers);

      // F holds the activation derivatives, needed for all but the last layer
      vector<MatrixD> F(nLayers - 1);

      // matrices used in backpropagation
      // D contains the error signals for each layer
      vector<MatrixD> D(nLayers);

      Matrix<int> filter;

      MatrixD currentInput;
      if (batchSize > 0) {
        // sample a mini-batch of row indices without replacement
        MersenneTwister t;
        MatrixI indices(batchSize, 1, t.randomValues(0, data.nRows(), batchSize, false));

        filter = MatrixI::zeros(data.nRows(), 1);

        for (size_t i = 0; i < indices.nRows(); i++) {
          filter(indices(i, 0), 0) = 1;
        }

        currentInput = data.getRows(filter);
      } else
        currentInput = data;

      // forward pass
      for (size_t i = 0; i < nLayers; i++) {
        // add the bias column to the input of the current layer
        currentInput.addColumn(MatrixD::ones(currentInput.nRows(), 1), 0);

        MatrixD S = currentInput * W[i]; // multiply input by weights

        // calculate derivatives
        if (i < nLayers - 1) // the derivative of the last layer is never used
          F[i] = S.apply(activationDerivative).transpose();

        // apply the activation function; the result is the next layer's input
        currentInput = Z[i] = S.apply(activationFunction);
      }

      // backpropagation
      // error signal of the last layer
      MatrixD batchClasses = filter.isEmpty() ? classes : classes.getRows(filter);
      D[nLayers - 1] = (Z[nLayers - 1] - batchClasses).transpose();

      // calculate the mean squared error loss
      double loss = (D[nLayers - 1]).apply(pow2).sum() / (2 * batchClasses.nRows());

      // L2 penalty: sum of all squared weights,
      // scaled by the regularization coefficient
      double regularizationTerm = 0;
      for (auto w : W)
        regularizationTerm += w.apply(pow2).sum();
      regularizationTerm = regularization > 0 ? regularization * regularizationTerm / (2 * batchClasses.nRows()) : 0;

      loss += regularizationTerm;

      // error signals for the intermediate layers
      for (int i = static_cast<int>(nLayers) - 2; i >= 0; i--) {
        // drop the bias row of the next layer's weights: the bias unit is a
        // constant input, so no error signal propagates back through it
        MatrixD W_noBias = W[i + 1].transpose();
        W_noBias.removeColumn(0);
        W_noBias = W_noBias.transpose();
        D[i] = F[i].hadamard(W_noBias * D[i + 1]);
      }

      // the learning rate is linearly scaled down with passing iterations
      double lr = adaptiveLR ? (learningRate / maxIters) * (maxIters - iter) : learningRate;

      // weight updates
      for (size_t i = 0; i < nLayers; i++) {
        MatrixD input;
        if (i == 0)
          input = filter.isEmpty() ? data : data.getRows(filter);
        else
          input = Z[i - 1];

        input.addColumn(MatrixD::ones(input.nRows(), 1), 0); // add the bias once again
        MatrixD dW = -lr * (D[i] * input).transpose();
        // gradient step plus L2 weight decay
        W[i] = (1 - ((lr * regularization) / batchClasses.nRows())) * W[i] + dW;
      }
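
      // The update above is ordinary gradient descent on the regularized
      // loss: the penalty (regularization / (2n)) * sum(W^2) has gradient
      // (regularization / n) * W, so
      //   W <- W - lr * (regularization / n) * W + dW
      //     = (1 - lr * regularization / n) * W + dW,
      // i.e. L2 regularization acts as a multiplicative weight decay.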

      if (verbose and timer.activate(iter)) {
        // '=' means the loss is unchanged, '+' that it increased
        // and '-' that it decreased since the last report
        char errorChar = (iter == 0 or loss == previousLoss) ? '=' : loss > previousLoss ? '+' : '-';
        cout << "loss: " << loss << ' ' << errorChar << endl;
      }

      if (loss < errorThreshold)
        break;

      previousLoss = loss;
    }
    if (verbose)
      cout << "Total training time: " << timer.runningTime() << endl;
  }
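
  // Worked example of the shape contract enforced by the overload above
  // (hypothetical architecture): with 4 input features, one hidden layer of
  // 8 neurons and 3 classes, fit() expects two weight matrices of dimensions
  // 5x8 and 9x3; the extra first row of each holds the bias weights.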

  /**
   * Predict the classes of a data set.
   * @param X data matrix, one example per row
   * @param of output format: raw activations, softmax probabilities,
   *           one-hot encoding or a single column of class labels
   */
  MatrixD predict(MatrixD X, OutputFormat of = ACTIVATION) {
    // apply the same standardization used during training, if any
    if (!dataMean.isEmpty() && !dataDev.isEmpty())
      X = X.standardize(dataMean, dataDev);

    // even when there are no hidden layers,
    // there is at least one layer of weights
    size_t nLayers = W.size();

    MatrixD currentInput = X;
    for (size_t i = 0; i < nLayers; i++) {
      // add the bias column to the input of the current layer
      currentInput.addColumn(MatrixD::ones(currentInput.nRows(), 1), 0);
      MatrixD S = currentInput * W[i];
      // apply the activation the network was trained with
      currentInput = S.apply(activation);
    }

    if (of == SOFTMAX)
      return MLP::softmax(currentInput);
    if (of == ONEHOT)
      return MLP::binarize(currentInput);
    if (of == SUMMARY)
      return MLP::summarize(currentInput);

    return currentInput;
  }
};
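
// Usage sketch (hypothetical data; MatrixD is built from a row-major vector,
// following the same pattern the weight initializers use):
//
//   MatrixD X(150, 4, features);   // 150 examples, 4 features each
//   MatrixD y(150, 1, labels);     // one integer class label per example
//   MLP mlp;
//   mlp.fit(X, y, {8, 8}, 1000);   // two hidden layers of 8 neurons
//   MatrixD pred = mlp.predict(X, MLP::SUMMARY);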

#endif //MACHINE_LEARNING_MLP_HPP