#ifndef MACHINE_LEARNING_MLP_HPP
#define MACHINE_LEARNING_MLP_HPP

#include <chrono>
#include <cmath>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include "../include/matrix/Matrix.hpp"
#include "../include/mersenne_twister/MersenneTwister.hpp"

using namespace std;
using myClock = chrono::high_resolution_clock;

//! Multilayer perceptron trained with gradient descent and backpropagation.
class MLP {
 private:
  MatrixD data,          //!< training data, standardized if requested
      classes,           //!< one-hot encoding of the training labels
      originalClasses,   //!< sorted unique class values
      dataMean, dataDev; //!< statistics kept to standardize prediction inputs
  vector<MatrixD> W;     //!< weight matrices, one per layer

 public:
  enum ActivationFunction { SIGMOID, TANH };
  enum WeightInitialization { UNIFORM, NORMAL, GLOROT };
  enum OutputFormat { ACTIVATION };
  //! Square function: x * x.
  static double pow2(double x) { return x * x; }

  //! Logistic sigmoid: 1 / (1 + e^-x).
  static double sigmoid(double x) { return 1 / (1 + exp(-x)); }

  //! Derivative of the sigmoid: sigmoid(x) * (1 - sigmoid(x)).
  static double sigmoidDerivative(double x) {
    double z = sigmoid(x);
    return z * (1 - z);
  }
  //! Hyperbolic tangent, written in terms of the sigmoid: 2 * sigmoid(2x) - 1.
  static double tanh(double x) { return 2 * sigmoid(2 * x) - 1; }

  //! Derivative of tanh: 1 - tanh(x)^2.
  static double tanhDerivative(double x) { return 1 - pow(tanh(x), 2); }
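  // A quick sanity check for the helpers above (a sketch, not part of the
  // original file): each derivative can be validated against a central
  // finite-difference approximation.
  //
  //   double x = 0.5, h = 1e-6;
  //   double numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h);
  //   assert(fabs(numeric - sigmoidDerivative(x)) < 1e-6);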
  //! Initialize a matrix with samples from the normal distribution N(0, 1).
  static MatrixD initNormal(size_t in, size_t out) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromNormal(in * out));
    return result;
  }
  //! Initialize a matrix with samples from a normal distribution N(mean, stddev).
  static MatrixD initNormal(size_t in, size_t out, double mean, double stddev) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromNormal(in * out, mean, stddev));
    return result;
  }
  //! Initialize a matrix with samples from the uniform distribution U(0, 1).
  static MatrixD initUniform(size_t in, size_t out) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromUniform(in * out));
    return result;
  }
  //! Initialize a matrix with samples from the uniform distribution U(min, max).
  static MatrixD initUniform(size_t in, size_t out, double min, double max) {
    MersenneTwister twister;
    MatrixD result(in, out, twister.vecFromUniform(in * out, min, max));
    return result;
  }
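  // Example (hypothetical shapes): weight matrices for a layer with 4 inputs
  // and 3 outputs; the extra input row holds the bias weights.
  //
  //   MatrixD w1 = MLP::initUniform(5, 3);                        // U(0, 1)
  //   MatrixD w2 = MLP::initNormal(5, 3, 0.0, 0.1);               // N(0, 0.1)
  //   MatrixD w3 = MLP::initUniform(5, 3, -sqrt(5.0), sqrt(5.0)); // GLOROT-style, as in fit()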
  //! Turn each row of m into a one-hot vector marking its largest element.
  static MatrixD binarize(MatrixD m) {
    for (size_t i = 0; i < m.nRows(); i++) {
      size_t largest = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        if (m(i, j) > m(i, largest)) largest = j;
      for (size_t j = 0; j < m.nCols(); j++)
        m(i, j) = j == largest;
    }
    return m;
  }
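  // E.g. (hypothetical values): binarize turns the activation row
  // {0.1, 0.7, 0.2} into the one-hot row {0, 1, 0}.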
  //! Reduce each row of m to the class value whose column holds the largest activation.
  MatrixD summarize(MatrixD m) {
    MatrixD result(m.nRows(), 1);
    for (size_t i = 0; i < m.nRows(); i++) {
      size_t largest = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        if (m(i, j) > m(i, largest)) largest = j;
      result(i, 0) = originalClasses(largest, 0);
    }
    return result;
  }
  //! Apply the softmax function to each row of m, turning rows into probability distributions.
  static MatrixD softmax(MatrixD m) {
    m = m.apply(static_cast<double (*)(double)>(exp));
    for (size_t i = 0; i < m.nRows(); i++) {
      double rowSum = 0;
      for (size_t j = 0; j < m.nCols(); j++)
        rowSum += m(i, j);
      for (size_t j = 0; j < m.nCols(); j++)
        m(i, j) /= rowSum;
    }
    return m;
  }
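  // softmax(x)_j = exp(x_j) / sum_k exp(x_k), so each row sums to 1.
  // Hypothetical example: the row {1, 2, 3} maps to roughly {0.09, 0.24, 0.67}.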
  //! Train a multilayer perceptron, creating weight matrices from a layer-size configuration.
  void fit(MatrixD X,
           MatrixD y,
           vector<size_t> hiddenConfig,
           int maxIters,
           size_t batchSize = 0,
           double learningRate = 0.01,
           double errorThreshold = 0.0001,
           double regularization = 0,
           ActivationFunction func = SIGMOID,
           WeightInitialization weightInit = UNIFORM,
           bool adaptiveLR = false,
           bool standardize = true,
           bool verbose = true) {
    size_t outputEncodingSize = y.unique().nRows();
    // no hidden layers means a single weight matrix from input to output
    size_t nLayers = hiddenConfig.size() < 1 ? 1 : hiddenConfig.size() + 1;
    vector<MatrixD> w = vector<MatrixD>(nLayers);
    for (int i = 0; i < nLayers; i++) {
      // +1 accounts for the bias column appended to each layer's input
      size_t nIn = (i == 0 ? X : w[i - 1]).nCols() + 1;
      size_t nOut = i == w.size() - 1 ? outputEncodingSize : hiddenConfig[i];
      if (weightInit == UNIFORM)
        w[i] = initUniform(nIn, nOut);
      else if (weightInit == NORMAL)
        w[i] = initNormal(nIn, nOut);
      else if (weightInit == GLOROT)
        w[i] = initUniform(nIn, nOut, -sqrt(nIn), sqrt(nIn));
    }
    // delegate to the overload that takes explicit weight matrices
    fit(X, y, w, maxIters, batchSize, learningRate, errorThreshold,
        regularization, func, adaptiveLR, standardize, verbose);
  }
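  // Hypothetical call (X, y and all hyperparameters are illustrative): train
  // a network with hidden layers of 8 and 4 units for up to 1000 iterations,
  // with mini-batches of 32 rows and a 0.1 learning rate:
  //
  //   MLP mlp;
  //   mlp.fit(X, y, {8, 4}, 1000, 32, 0.1);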
  //! Train a multilayer perceptron from pre-initialized weight matrices.
  void fit(MatrixD X,
           MatrixD y,
           vector<MatrixD> hiddenLayers,
           unsigned int maxIters,
           size_t batchSize = 0,
           double learningRate = 0.01,
           double errorThreshold = 0.0001,
           double regularization = 0,
           ActivationFunction func = SIGMOID,
           bool adaptiveLR = false,
           bool standardize = true,
           bool verbose = true) {
    classes = y.oneHot();
    originalClasses = y.unique();
    originalClasses.sort();
    size_t outputEncodingSize = classes.nCols();
    // validate the dimensions of the user-provided weight matrices
    for (int i = 0; i < hiddenLayers.size(); ++i) {
      size_t correct_nIn = (i == 0 ? X : hiddenLayers[i - 1]).nCols() + 1,
          correct_nOut = i == hiddenLayers.size() - 1 ? outputEncodingSize
                                                      : hiddenLayers[i + 1].nRows() - 1;
      if (hiddenLayers[i].nRows() != correct_nIn)
        throw invalid_argument(
            "Weight matrix " + to_string(i) + " input (" +
            to_string(hiddenLayers[i].nRows()) + ") should be (" +
            to_string(correct_nIn) + ")");
      if (hiddenLayers[i].nCols() != correct_nOut)
        throw invalid_argument(
            "Weight matrix " + to_string(i) + " output (" +
            to_string(hiddenLayers[i].nCols()) + ") should be (" +
            to_string(correct_nOut) + ")");
    }
    size_t nLayers = hiddenLayers.size();
    W = hiddenLayers;

    if (standardize)
      // keep mean and standard deviation so prediction inputs can be standardized too
      data = X.standardize(dataMean, dataDev);
    else {
      data = X;
      dataMean = dataDev = MatrixD();
    }
    function<double(double)> activationFunction, activationDerivative;
    if (func == SIGMOID) {
      activationFunction = sigmoid;
      activationDerivative = sigmoidDerivative;
    } else {
      activationFunction = tanh;
      activationDerivative = tanhDerivative;
    }

    double previousLoss = 0;
    Timer timer(1, maxIters);
    timer.start();
    // main training loop
    for (int iter = 0; iter < maxIters; iter++) {
      auto iterStart = myClock::now();
      vector<MatrixD> Z(nLayers);     // layer outputs after activation
      vector<MatrixD> F(nLayers - 1); // activation derivatives of hidden layers
      vector<MatrixD> D(nLayers);     // error deltas, one per layer

      MatrixD currentInput;
      MatrixI filter;
      if (batchSize > 0) {
        // sample batchSize row indices without replacement and build a
        // binary filter marking the selected rows
        MersenneTwister t;
        MatrixI indices(batchSize, 1,
                        t.randomValues(0, data.nRows(), batchSize, false));
        filter = MatrixI::zeros(data.nRows(), 1);
        for (size_t i = 0; i < indices.nRows(); i++)
          filter(indices(i, 0), 0) = 1;
        currentInput = data.getRows(filter);
      } else
        currentInput = data;
      // forward pass
      for (int i = 0; i < nLayers; i++) {
        // prepend the bias column
        currentInput.addColumn(MatrixD::ones(currentInput.nRows(), 1), 0);
        MatrixD S = currentInput * W[i];
        // keep activation derivatives of hidden layers for backpropagation
        if (i < nLayers - 1)
          F[i] = S.apply(activationDerivative).transpose();
        currentInput = Z[i] = S.apply(activationFunction);
      }
      // output layer delta and mean squared error
      MatrixD batchClasses = filter.isEmpty() ? classes : classes.getRows(filter);
      D[nLayers - 1] = (Z[nLayers - 1] - batchClasses).transpose();
      double loss = D[nLayers - 1].apply(pow2).sum() / (2 * batchClasses.nRows());

      if (regularization > 0) {
        // L2 penalty: regularization / (2n) times the sum of squared weights
        double regularizationTerm = 0;
        for (const MatrixD &w : W)
          regularizationTerm += w.apply(pow2).sum();
        loss += regularizationTerm * regularization / (2 * batchClasses.nRows());
      }
      // backpropagate the error through the hidden layers
      for (int i = nLayers - 2; i >= 0; i--) {
        // drop the bias weights: no error flows back to the constant input
        MatrixD W_noBias = W[i + 1].transpose();
        W_noBias.removeColumn(0);
        W_noBias = W_noBias.transpose();
        // delta_i = f'(S_i) (Hadamard) (W_{i+1} without bias * delta_{i+1})
        D[i] = F[i].hadamard(W_noBias * D[i + 1]);
      }
      // linearly decay the learning rate when adaptive LR is enabled
      double lr = adaptiveLR ? (learningRate / maxIters) * (maxIters - iter) : learningRate;
      // gradient descent step with L2 weight decay
      for (size_t i = 0; i < nLayers; i++) {
        // the input that reached layer i during the forward pass
        MatrixD input;
        if (i == 0)
          input = filter.isEmpty() ? data : data.getRows(filter);
        else
          input = Z[i - 1];
        input.addColumn(MatrixD::ones(input.nRows(), 1), 0); // bias column
        MatrixD dW = -lr * (D[i] * input).transpose();
        W[i] = (1 - ((learningRate * regularization) / batchClasses.nRows())) * W[i] + dW;
      }
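      // The step above is gradient descent with L2 weight decay:
      //   W[i] <- (1 - learningRate * regularization / n) * W[i] - lr * (D[i] * input)^T
      // where n is the number of rows in the batch. Note that the decay factor
      // uses the base learningRate while the gradient step uses the (possibly
      // adapted) lr.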
      // periodically report the loss and whether it rose (+), fell (-) or stalled (=)
      if (verbose and timer.activate(iter)) {
        char errorChar = (loss == previousLoss or iter == 0) ? '='
                         : loss > previousLoss ? '+' : '-';
        cout << "loss: " << loss << ' ' << errorChar << endl;
      }
      previousLoss = loss;
      // stop early once the loss is small enough
      if (loss < errorThreshold)
        break;
    }
    if (verbose)
      cout << "Total training time: " << timer.runningTime() << endl;
  }
  //! Predict the classes of a data set, returning raw output activations by default.
  MatrixD predict(MatrixD X, OutputFormat of = ACTIVATION) {
    // apply the same standardization used during training, if any
    if (!dataMean.isEmpty() && !dataDev.isEmpty())
      X = X.standardize(dataMean, dataDev);

    size_t nLayers = W.size();
    MatrixD currentInput = X;
    for (int i = 0; i < nLayers; i++) {
      currentInput.addColumn(MatrixD::ones(currentInput.nRows(), 1), 0);
      MatrixD S = currentInput * W[i];
      currentInput = S.apply(sigmoid);
    }
    // the raw activations can be post-processed with softmax(), binarize()
    // or summarize(), depending on the requested output format
    return currentInput;
  }
};
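// A hypothetical, self-contained usage sketch. The MLP_EXAMPLE_MAIN guard and
// the toy data set are illustrative and not part of the original file; the
// (rows, cols, values) Matrix constructor follows the pattern used above.
#ifdef MLP_EXAMPLE_MAIN
int main() {
  // XOR toy problem: 4 samples, 2 features, 2 classes
  MatrixD X(4, 2, vector<double>{0, 0, 0, 1, 1, 0, 1, 1});
  MatrixD y(4, 1, vector<double>{0, 1, 1, 0});
  MLP mlp;
  // one hidden layer of 4 units, up to 5000 iterations, full batch, LR 0.5
  mlp.fit(X, y, vector<size_t>{4}, 5000, 0, 0.5);
  MatrixD out = mlp.predict(X); // raw activations, one column per class
  return 0;
}
#endif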
#endif //MACHINE_LEARNING_MLP_HPP