Machine learning algorithms in C++
MLP.hpp
/**
 * MLP.hpp
 *
 * Multi-layer perceptron implementation.
 */
#ifndef MACHINE_LEARNING_MLP_HPP
#define MACHINE_LEARNING_MLP_HPP

#include <vector>
#include <chrono>
#include <iostream>
#include <cmath>
#include <functional>
#include <stdexcept>
#include <string>
#include "../include/matrix/Matrix.hpp"
#include "../include/mersenne_twister/MersenneTwister.hpp"
#include "Timer.hpp"

using namespace std;
using myClock = chrono::high_resolution_clock;

/**
 * Multi-layer perceptron.
 */
class MLP {
 private:
  MatrixD data, dataMean, dataDev, classes, originalClasses;
  vector<MatrixD> W;
  // activation function used by the network; stored by fit() so that
  // predict() applies the same nonlinearity the weights were trained with
  function<double(double)> activation = sigmoid;

  //region Activation functions

  /**
   * @return x squared
   */
  static double pow2(double x) {
    return x * x;
  }

  /**
   * Sigmoid (logistic) activation function.
   */
  static double sigmoid(double x) {
    return 1 / (1 + exp(-x));
  }

  /**
   * Derivative of the sigmoid function.
   */
  static double sigmoidDerivative(double x) {
    double z = sigmoid(x);
    return z * (1 - z);
  }

  /**
   * Hyperbolic tangent activation function.
   */
  static double tanh(double x) {
    return 2 * sigmoid(2 * x) - 1;
  }

  /**
   * Derivative of the hyperbolic tangent.
   */
  static double tanhDerivative(double x) {
    return 1 - pow2(tanh(x));
  }

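  // The identities used above: tanh(x) = 2*sigmoid(2x) - 1, which lets tanh
  // reuse the sigmoid already defined, and the derivatives follow from
  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)) and tanh'(x) = 1 - tanh(x)^2.
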
  //endregion

  /**
   * Initialize a matrix with values drawn from a normal distribution N(0; 1).
   * @param in  number of rows
   * @param out number of columns
   */
  static MatrixD initNormal(size_t in, size_t out) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromNormal(in * out));
    return result;
  }

  /**
   * Initialize a matrix with values drawn from a normal distribution N(mean; stddev).
   */
  static MatrixD initNormal(size_t in, size_t out, double mean, double stddev) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromNormal(in * out, mean, stddev));
    return result;
  }

  /**
   * Initialize a matrix with values drawn from the uniform distribution U(0; 1).
   */
  static MatrixD initUniform(size_t in, size_t out) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromUniform(in * out));
    return result;
  }

  /**
   * Initialize a matrix with values drawn from the uniform distribution U(min; max).
   */
  static MatrixD initUniform(size_t in, size_t out, double min, double max) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromUniform(in * out, min, max));
    return result;
  }

  /**
   * One-hot encode network outputs: in each row, the largest activation
   * becomes 1 and every other entry becomes 0.
   */
  static MatrixD binarize(MatrixD m) {
    for (size_t i = 0; i < m.nRows(); i++) {
      // find the column holding the largest value of this row
      size_t largest = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        if (m(i, j) > m(i, largest)) largest = j;

      for (size_t j = 0; j < m.nCols(); j++) {
        m(i, j) = j == largest;
      }
    }
    return m;
  }

  /**
   * Collapse network outputs into a single column containing, for each row,
   * the original class label whose activation was largest.
   */
  MatrixD summarize(MatrixD m) {
    MatrixD result(m.nRows(), 1);

    for (size_t i = 0; i < m.nRows(); i++) {
      size_t largest = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        if (m(i, j) > m(i, largest)) largest = j;

      result(i, 0) = originalClasses(largest, 0);
    }

    return result;
  }

  /**
   * Apply the softmax function to each row, turning raw activations into
   * a probability distribution over classes.
   */
  static MatrixD softmax(MatrixD m) {
    m = m.apply(static_cast<double (*)(double)>(exp));
    for (size_t i = 0; i < m.nRows(); i++) {
      double sum = 0;

      for (size_t j = 0; j < m.nCols(); j++)
        sum += m(i, j);

      for (size_t j = 0; j < m.nCols(); j++)
        m(i, j) /= sum;
    }
    return m;
  }
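  // Note: a numerically safer softmax would subtract each row's maximum
  // before exponentiating (softmax is invariant to that shift); with large
  // activations the plain exp() above can overflow.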

 public:

  enum ActivationFunction { SIGMOID, TANH };
  enum WeightInitialization { NORMAL, UNIFORM, GLOROT };
  enum OutputFormat { ACTIVATION, SOFTMAX, ONEHOT, SUMMARY };

  MLP() {
  }

  /**
   * Train a multilayer perceptron.
   * @param X data matrix, one example per row
   * @param y column vector of class labels
   * @param hiddenConfig number of neurons in each hidden layer
   * @param maxIters maximum number of training iterations
   * @param batchSize mini-batch size; 0 uses the whole data set in every iteration
   * @param learningRate gradient descent step size
   * @param errorThreshold training stops once the loss falls below this value
   * @param regularization L2 regularization coefficient
   * @param func activation function used by the network
   * @param weightInit distribution used to initialize the weights
   * @param adaptiveLR linearly decay the learning rate over the iterations
   * @param standardize standardize the data to zero mean and unit standard deviation
   * @param verbose print the loss during training
   */
  void fit(MatrixD X,
           MatrixD y,
           vector<size_t> hiddenConfig,
           unsigned int maxIters,
           size_t batchSize = 0,
           double learningRate = 0.01,
           double errorThreshold = 0.0001,
           double regularization = 0,
           ActivationFunction func = SIGMOID,
           WeightInitialization weightInit = UNIFORM,
           bool adaptiveLR = false,
           bool standardize = true,
           bool verbose = true) {
    size_t outputEncodingSize = y.unique().nRows();

    // number of weight layers. even if there are no hidden layers,
    // there is at least one layer of weights that needs to be fitted
    size_t nLayers = hiddenConfig.size() < 1 ? 1 : hiddenConfig.size() + 1;
    // initialize vector of weight matrices
    vector<MatrixD> w(nLayers);

    // initialize weights
    for (size_t i = 0; i < nLayers; i++) {
      size_t nIn, nOut;

      // number of inputs (+1 accounts for the bias weight)
      nIn = (i == 0 ? X : w[i - 1]).nCols() + 1;

      // number of outputs
      nOut = i == w.size() - 1 ? outputEncodingSize : hiddenConfig[i];

      // initialize layer with random values from the chosen distribution
      if (weightInit == UNIFORM) {
        w[i] = initUniform(nIn, nOut);
      } else if (weightInit == NORMAL) {
        w[i] = initNormal(nIn, nOut);
      } else if (weightInit == GLOROT) {
        // Glorot/Xavier initialization: U(-b; b) with b = sqrt(6 / (nIn + nOut))
        double bound = sqrt(6.0 / (nIn + nOut));
        w[i] = initUniform(nIn, nOut, -bound, bound);
      }
    }

    fit(X,
        y,
        w,
        maxIters,
        batchSize,
        learningRate,
        errorThreshold,
        regularization,
        func,
        adaptiveLR,
        standardize,
        verbose);
  }

  /**
   * Train a multilayer perceptron, supplying the initial weight matrices
   * explicitly instead of a hidden layer configuration.
   * @param X data matrix, one example per row
   * @param y column vector of class labels
   * @param hiddenLayers initial weight matrices, one per layer
   * @param maxIters maximum number of training iterations
   * @param batchSize mini-batch size; 0 uses the whole data set in every iteration
   * @param learningRate gradient descent step size
   * @param errorThreshold training stops once the loss falls below this value
   * @param regularization L2 regularization coefficient
   * @param func activation function used by the network
   * @param adaptiveLR linearly decay the learning rate over the iterations
   * @param standardize standardize the data to zero mean and unit standard deviation
   * @param verbose print the loss during training
   */
  void fit(MatrixD X,
           MatrixD y,
           vector<MatrixD> hiddenLayers,
           unsigned int maxIters,
           size_t batchSize = 0,
           double learningRate = 0.01,
           double errorThreshold = 0.0001,
           double regularization = 0,
           ActivationFunction func = SIGMOID,
           bool adaptiveLR = false,
           bool standardize = true,
           bool verbose = true) {
    // create one-hot encoding for classes
    classes = y.oneHot();
    originalClasses = y.unique();
    originalClasses.sort();
    size_t outputEncodingSize = classes.nCols();

    // validate the dimensions of the given weight matrices
    for (size_t i = 0; i < hiddenLayers.size(); ++i) {
      size_t correct_nIn = (i == 0 ? X : hiddenLayers[i - 1]).nCols() + 1,
          correct_nOut = i == hiddenLayers.size() - 1 ? outputEncodingSize : hiddenLayers[i + 1].nRows() - 1;
      if (hiddenLayers[i].nRows() != correct_nIn) {
        throw invalid_argument(
            "Weight matrix " + to_string(i) + " input (" + to_string(hiddenLayers[i].nRows()) + ") should be ("
                + to_string(correct_nIn) + ")");
      }
      if (hiddenLayers[i].nCols() != correct_nOut) {
        throw invalid_argument(
            "Weight matrix " + to_string(i) + " output (" + to_string(hiddenLayers[i].nCols()) + ") should be ("
                + to_string(correct_nOut) + ")");
      }
    }

    W = hiddenLayers;

    // number of weight layers. even if there are no hidden layers,
    // there is at least one layer of weights that needs to be fitted
    size_t nLayers = hiddenLayers.size();

    if (standardize) {
      // mean and stddev are stored so future predictions
      // can be standardized the same way as the training data
      dataMean = X.mean();
      dataDev = X.stdev();
      data = X.standardize(dataMean, dataDev);
    } else {
      data = X;
      dataMean = dataDev = MatrixD();
    }

    function<double(double)> activationFunction, activationDerivative;
    if (func == SIGMOID) {
      activationFunction = sigmoid;
      activationDerivative = sigmoidDerivative;
    } else {
      activationFunction = tanh;
      activationDerivative = tanhDerivative;
    }
    // remember the chosen activation so predict() applies the same one
    activation = activationFunction;

    double previousLoss = 0;
    Timer timer(1, maxIters);
    timer.start();
    // training iterations
    for (unsigned int iter = 0; iter < maxIters; iter++) {
      // matrices used in the forward pass
      // Z holds the outputs of each layer
      vector<MatrixD> Z(nLayers);

      // F holds the activation derivatives, needed for all but the last layer
      vector<MatrixD> F(nLayers - 1);

      // matrices used in backpropagation
      // D contains the error signals for each layer
      vector<MatrixD> D(nLayers);

      Matrix<int> filter;

      MatrixD currentInput;
      if (batchSize > 0) {
        // sample a mini-batch of row indices without replacement
        MersenneTwister t;
        MatrixI indices(batchSize, 1, t.randomValues(0, data.nRows(), batchSize, false));

        filter = MatrixI::zeros(data.nRows(), 1);

        for (size_t i = 0; i < indices.nRows(); i++) {
          filter(indices(i, 0), 0) = 1;
        }

        currentInput = data.getRows(filter);
      } else
        currentInput = data;

      // forward pass
      for (size_t i = 0; i < nLayers; i++) {
        // add the bias column to the input of the current layer
        currentInput.addColumn(MatrixD::ones(currentInput.nRows(), 1), 0);

        MatrixD S = currentInput * W[i]; // multiply input by weights

        // calculate derivatives
        if (i < nLayers - 1) // the derivative of the last layer is never used
          F[i] = S.apply(activationDerivative).transpose();

        // apply the activation function; the result is the next layer's input
        currentInput = Z[i] = S.apply(activationFunction);
      }

      // backpropagation
      // error signal of the last layer
      MatrixD batchClasses = filter.isEmpty() ? classes : classes.getRows(filter);
      D[nLayers - 1] = (Z[nLayers - 1] - batchClasses).transpose();

      // calculate the mean squared error loss
      double loss = (D[nLayers - 1]).apply(pow2).sum() / (2 * batchClasses.nRows());

      // L2 penalty: sum of all squared weights,
      // scaled by the regularization coefficient
      double regularizationTerm = 0;
      for (auto w : W)
        regularizationTerm += w.apply(pow2).sum();
      regularizationTerm = regularization > 0 ? regularization * regularizationTerm / (2 * batchClasses.nRows()) : 0;

      loss += regularizationTerm;

      // error signals for the intermediate layers
      for (int i = static_cast<int>(nLayers) - 2; i >= 0; i--) {
        // drop the bias row of the next layer's weights: the bias unit is a
        // constant input, so no error signal propagates back through it
        MatrixD W_noBias = W[i + 1].transpose();
        W_noBias.removeColumn(0);
        W_noBias = W_noBias.transpose();
        D[i] = F[i].hadamard(W_noBias * D[i + 1]);
      }

      // the learning rate is linearly scaled down with passing iterations
      double lr = adaptiveLR ? (learningRate / maxIters) * (maxIters - iter) : learningRate;

      // weight updates
      for (size_t i = 0; i < nLayers; i++) {
        MatrixD input;
        if (i == 0)
          input = filter.isEmpty() ? data : data.getRows(filter);
        else
          input = Z[i - 1];

        input.addColumn(MatrixD::ones(input.nRows(), 1), 0); // add the bias once again
        MatrixD dW = -lr * (D[i] * input).transpose();
        // gradient step plus L2 weight decay
        W[i] = (1 - ((lr * regularization) / batchClasses.nRows())) * W[i] + dW;
      }
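
      // The update above is ordinary gradient descent on the regularized
      // loss: the penalty (regularization / (2n)) * sum(W^2) has gradient
      // (regularization / n) * W, so
      //   W <- W - lr * (regularization / n) * W + dW
      //     = (1 - lr * regularization / n) * W + dW,
      // i.e. L2 regularization acts as a multiplicative weight decay.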

      if (verbose and timer.activate(iter)) {
        // '=' means the loss is unchanged, '+' that it increased
        // and '-' that it decreased since the last report
        char errorChar = (iter == 0 or loss == previousLoss) ? '=' : loss > previousLoss ? '+' : '-';
        cout << "loss: " << loss << ' ' << errorChar << endl;
      }

      if (loss < errorThreshold)
        break;

      previousLoss = loss;
    }
    if (verbose)
      cout << "Total training time: " << timer.runningTime() << endl;
  }
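
  // Worked example of the shape contract enforced by the overload above
  // (hypothetical architecture): with 4 input features, one hidden layer of
  // 8 neurons and 3 classes, fit() expects two weight matrices of dimensions
  // 5x8 and 9x3; the extra first row of each holds the bias weights.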

  /**
   * Predict the classes of a data set.
   * @param X data matrix, one example per row
   * @param of output format: raw activations, softmax probabilities,
   *           one-hot encoding or a single column of class labels
   */
  MatrixD predict(MatrixD X, OutputFormat of = ACTIVATION) {
    // apply the same standardization used during training, if any
    if (!dataMean.isEmpty() && !dataDev.isEmpty())
      X = X.standardize(dataMean, dataDev);

    // even when there are no hidden layers,
    // there is at least one layer of weights
    size_t nLayers = W.size();

    MatrixD currentInput = X;
    for (size_t i = 0; i < nLayers; i++) {
      // add the bias column to the input of the current layer
      currentInput.addColumn(MatrixD::ones(currentInput.nRows(), 1), 0);
      MatrixD S = currentInput * W[i];
      // apply the activation the network was trained with
      currentInput = S.apply(activation);
    }

    if (of == SOFTMAX)
      return MLP::softmax(currentInput);
    if (of == ONEHOT)
      return MLP::binarize(currentInput);
    if (of == SUMMARY)
      return MLP::summarize(currentInput);

    return currentInput;
  }
};
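
// Usage sketch (hypothetical data; MatrixD is built from a row-major vector,
// following the same pattern the weight initializers use):
//
//   MatrixD X(150, 4, features);   // 150 examples, 4 features each
//   MatrixD y(150, 1, labels);     // one integer class label per example
//   MLP mlp;
//   mlp.fit(X, y, {8, 8}, 1000);   // two hidden layers of 8 neurons
//   MatrixD pred = mlp.predict(X, MLP::SUMMARY);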

#endif //MACHINE_LEARNING_MLP_HPP