Machine learning algorithms in C++
GridWorld.hpp
1 
8 #ifndef MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP
9 #define MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP
10 
11 #include "../include/matrix/Matrix.hpp"
12 #include "../include/mersenne_twister/MersenneTwister.hpp"
13 #include "Timer.hpp"
14 
18 class GridWorld {
19  private:
20  MatrixD V, Q, rewards, policy;
21  double gamma;
22  unsigned long nStates;
23  vector<pair<size_t, size_t>> goals;
24 
25  enum ActionType { UP = 0, DOWN = 1, LEFT = 2, RIGHT = 3 };
26  vector<ActionType> actions = {UP, DOWN, LEFT, RIGHT};
27 
28  void initialize(size_t height, size_t width, vector<pair<size_t, size_t>> goals, double gamma = 1) {
29  if (goals.size() == 0)
30  throw invalid_argument("No goal state, must pass at least one");
31 
32  this->nStates = height * width;
33 
34  // state and action value functions start at 0 everywhere
35  V = MatrixD::zeros(height, width);
36  Q = MatrixD::zeros(nStates, actions.size());
37 
38  // goal states get reward 0, all other states get reward -1
39  rewards = MatrixD::fill(height, width, -1);
40  for (auto goal:goals)
41  rewards(goal.first, goal.second) = 0;
42 
43  if (rewards.unique().sum() == 0)
44  throw invalid_argument("All states are goal states!");
45 
46  this->goals = goals;
47  this->gamma = gamma;
48 
49  // initialize the policy matrix giving equal probability of choice for every action
50  policy = MatrixD::fill(height * width, actions.size(), 1.0 / actions.size());
51  }
52 
59  size_t fromCoord(size_t row, size_t col) {
60  return row * V.nCols() + col;
61  }
62 
68  pair<size_t, size_t> toCoord(size_t s) {
69  size_t row = s / V.nCols(); // integer division gives the row index
70  size_t col = s - row * V.nCols();
71 
72  return {row, col};
73  }
74 
81  double actionValue(size_t s, ActionType a) {
82  double q = 0;
83 
84  pair<size_t, size_t> coords = toCoord(s);
85  double r = rewards(coords.first, coords.second);
86  for (size_t i = 0; i < V.nRows(); ++i) {
87  for (size_t j = 0; j < V.nCols(); ++j) {
88  size_t s2 = fromCoord(i, j);
89  double t = transition(s, a, s2);
90  double v = V(i, j);
91 
92  // expected return: transition probability times (immediate reward + discounted value of the next state)
93  q += t * (r + gamma * v);
94 
95  }
96  }
97 
98  return q;
99  }
100 
106  Matrix<double> normalizeToOne(MatrixD m) {
107  return m / m.sum();
108  }
109 
115  vector<double> actionValuesForState(size_t s) {
116  vector<double> result(actions.size());
117 
118  for (size_t i = 0; i < actions.size(); i++)
119  result[i] = actionValue(s, actions[i]);
120 
121  return result;
122  }
123 
130  Matrix<double> policyIncrement(size_t s) {
131  MatrixD result = MatrixD::zeros(actions.size(), 1);
132 
133  vector<double> actionValues = actionValuesForState(s);
134  const MatrixD &currentPolicy = policyForState(s);
135  double bestQ = actionValues[0];
136 
137  for (size_t i = 1; i < actionValues.size(); i++) {
138  // store best action value
139  if (actionValues[i] > bestQ)
140  bestQ = actionValues[i];
141  }
142 
143  // actions with best action value receive equal probability of being chosen,
144  // all others have prob 0
145  for (size_t i = 0; i < actions.size(); i++) {
146  if (actionValues[i] == bestQ)
147  result(i, 0) = 1;
148  }
149 
150  return normalizeToOne(result);
151  }
152 
156  string prettifyPolicy() {
157  string s = "";
158  for (size_t i = 0; i < V.nRows(); i++) {
159  for (size_t j = 0; j < V.nCols(); ++j) {
160  size_t state = fromCoord(i, j);
161 
162  if (isGoal(state)) {
163  s += "☻";
164  continue;
165  }
166 
167  const MatrixD p = policyForState(state);
168  double maxProb = p.max();
169 
170  if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
171  s += "╬";
172  else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb)
173  s += "╣";
174  else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(RIGHT, 0) == maxProb)
175  s += "╠";
176  else if (p(UP, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
177  s += "╩";
178  else if (p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
179  s += "╦";
180  else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb)
181  s += "║";
182  else if (p(UP, 0) == maxProb and p(LEFT, 0) == maxProb)
183  s += "╝";
184  else if (p(UP, 0) == maxProb and p(RIGHT, 0) == maxProb)
185  s += "╚";
186  else if (p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb)
187  s += "╗";
188  else if (p(DOWN, 0) == maxProb and p(RIGHT, 0) == maxProb)
189  s += "╔";
190  else if (p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
191  s += "═";
192  else if (p(UP, 0) == maxProb)
193  s += "⇧";
194  else if (p(DOWN, 0) == maxProb)
195  s += "⇩";
196  else if (p(LEFT, 0) == maxProb)
197  s += "⇦";
198  else if (p(RIGHT, 0) == maxProb)
199  s += "⇨";
200  }
201  s += '\n';
202  }
203  return s;
204  }
205 
211  void iterativePolicyEvaluation(double threshold, bool verbose) {
212  double delta;
213  int iter = 0;
214  do {
215  iter++;
216  delta = 0;
217  for (size_t i = 0; i < V.nRows(); i++) {
218  for (size_t j = 0; j < V.nCols(); j++) {
219  size_t state1 = fromCoord(i, j);
220 
221  double currentV = V(i, j), newV = 0;
222 
223  for (ActionType action : actions)
224  newV += policy(state1, action) * actionValue(state1, action);
225 
226  V(i, j) = newV;
227 
228  double newDelta = fabs(currentV - V(i, j));
229 
230  if (newDelta > delta)
231  delta = newDelta;
232  }
233  }
234  } while (delta >= threshold);
235  if (verbose)
236  cout << iter << " iterations of policy evaluation" << endl;
237  }
238 
244  bool isGoal(size_t s) {
245  pair<size_t, size_t> stateCoords = toCoord(s);
246  return std::find(goals.begin(), goals.end(), stateCoords) != goals.end();
247  }
248 
257  double transition(size_t currentState, ActionType action, size_t nextState) {
258  // the agent never leaves the goal
259  if (isGoal(currentState))
260  return 0;
261 
262  // return 1 if applying the given action actually takes the agent to the desired state
263  size_t resultingState = applyAction(currentState, action);
264  return resultingState == nextState;
265  }
266 
273  size_t applyAction(size_t currentState, ActionType action) {
274  pair<size_t, size_t> s1 = toCoord(currentState);
275  size_t s1row = s1.first, s1col = s1.second, s2row, s2col;
276 
277  switch (action) {
278  case UP:s2col = s1col;
279  s2row = s1row != 0 ? s1row - 1 : s1row;
280  break;
281  case DOWN:s2col = s1col;
282  s2row = s1row != V.nRows() - 1 ? s1row + 1 : s1row;
283  break;
284  case LEFT:s2row = s1row;
285  s2col = s1col != 0 ? s1col - 1 : s1col;
286  break;
287  case RIGHT:s2row = s1row;
288  s2col = s1col != V.nCols() - 1 ? s1col + 1 : s1col;
289  break;
290  default:return 0;
291  }
292 
293  return fromCoord(s2row, s2col);
294  }
295 
301  MatrixD policyForState(size_t s) {
302  return normalizeToOne(policy.getRow(s));
303  }
304 
309  size_t getNonGoalState() {
310  MersenneTwister t;
311  size_t s;
312 
313  do {
314  s = static_cast<size_t>(t.i_random(static_cast<int>(nStates - 1)));
315  } while (isGoal(s));
316 
317  return s;
318  }
319 
327  ActionType eGreedy(size_t s, double epsilon) {
328  MersenneTwister t;
329 
330  if (t.d_random() <= epsilon) { // note: here epsilon is the probability of exploiting (greedy choice), not of exploring
331  ActionType bestAction = UP;
332  double bestQ = Q(s, 0);
333 
334  for (size_t j = 1; j < actions.size(); j++) {
335  if (bestQ <= Q(s, j)) {
336  if (bestQ == Q(s, j))
337  bestAction = t.d_random(1) <= .5 ? actions[j] : bestAction;
338  else
339  bestAction = actions[j];
340  bestQ = Q(s, j);
341  }
342  }
343 
344  return bestAction;
345 
346  } else {
347  return actions[t.i_random(static_cast<int>(actions.size() - 1))];
348  }
349  }
350 
356  double bestQForState(size_t s) {
357  double bestQ = Q(s, 0);
358 
359  for (size_t j = 1; j < actions.size(); j++) {
360  if (bestQ < Q(s, j)) {
361  bestQ = Q(s, j);
362  }
363  }
364  return bestQ;
365  }
366 
370  MatrixD getOptimalPolicyFromQ() {
371  MatrixD newPolicy = MatrixD::zeros(nStates, actions.size());
372  for (size_t state = 0; state < nStates; state++) {
373  // store best action value for the current state
374  double bestQ = Q(state, 0);
375  for (size_t j = 1; j < actions.size(); j++) {
376  if (bestQ < Q(state, j))
377  bestQ = Q(state, j);
378  }
379 
380  // actions with best action value receive equal probability of being chosen,
381  // all others have prob 0
382  for (size_t j = 0; j < actions.size(); j++) {
383  if (Q(state, j) == bestQ)
384  newPolicy(state, j) = 1;
385  }
386 
387  newPolicy.setRow(state, normalizeToOne(newPolicy.getRow(state).transpose()));
388  }
389  return newPolicy;
390  }
391 
392  public:
393 
403  void policyIteration(size_t height,
404  size_t width,
405  vector<pair<size_t, size_t>> goals,
406  double gamma = 1,
407  double threshold = .001,
408  bool verbose = true) {
409  // step 1: initialization
410  initialize(height, width, goals, gamma);
411 
412  bool stablePolicy;
413  int iter = 0;
414  MatrixD currentPolicy;
415 
416  do {
417  currentPolicy = policy;
418  iter++;
419  // step 2: policy evaluation
420  iterativePolicyEvaluation(threshold, verbose);
421 
422 
423  // step 3: policy improvement
424  for (size_t i = 0; i < V.nRows(); ++i) {
425  for (size_t j = 0; j < V.nCols(); ++j) {
426  size_t state = fromCoord(i, j);
427 
428  // workaround to optimize running time
429  if (isGoal(state))
430  continue;
431 
432  // retrieve policy for current state
433  MatrixD currentStatePolicy = policyForState(state);
434 
435  // generate a better policy
436  MatrixD betterStatePolicy = policyIncrement(state);
437 
438  if (currentStatePolicy != betterStatePolicy)
439  policy.setRow(state, betterStatePolicy.transpose());
440  }
441  }
442  if (verbose) cout << "iteration " << iter << " of policy improvement" << endl << prettifyPolicy() << endl;
443  } while (currentPolicy != policy);
444  }
445 
455  void valueIteration(size_t height,
456  size_t width,
457  vector<pair<size_t, size_t>> goals,
458  double gamma = 1,
459  double threshold = .000001,
460  bool verbose = true) {
461  initialize(height, width, goals, gamma);
462  double delta;
463  int iter = 0;
464 
465  do {
466  iter++;
467  delta = 0;
468  for (size_t i = 0; i < V.nRows(); i++) {
469  for (size_t j = 0; j < V.nCols(); j++) {
470  size_t state1 = fromCoord(i, j);
471 
472  // workaround to optimize running time
473  if (isGoal(state1))
474  continue;
475 
476  vector<double> actionValues = actionValuesForState(state1);
477 
478  size_t action = 0;
479  for (size_t k = 1; k < actionValues.size(); k++) {
480  if (actionValues[k] > actionValues[action]) {
481  action = k;
482  }
483  }
484 
485  double currentV = V(i, j);
486  V(i, j) = actionValues[action];
487 
488  policy.setRow(state1, policyIncrement(state1).transpose());
489 
490  double newDelta = fabs(currentV - V(i, j));
491 
492  if (newDelta > delta)
493  delta = newDelta;
494  }
495  }
496  if (verbose) cout << "iteration " << iter << endl << prettifyPolicy() << endl;
497  } while (delta >= threshold);
498  }
499 
509  void MonteCarloEstimatingStarts(size_t height,
510  size_t width,
511  vector<pair<size_t, size_t>> goals,
512  double gamma = 1,
513  unsigned maxIters = 10000,
514  bool verbose = true) {
515  initialize(height, width, goals, gamma);
516 
517  MatrixI visits = MatrixI::zeros(nStates, actions.size());
518  MatrixD QSum = MatrixD::zeros(nStates, actions.size());
519 
520  MersenneTwister twister;
521  ActionType action;
522  size_t state;
524  Timer timer(1, maxIters);
525  timer.start();
526 
527  for (unsigned iter = 0; iter < maxIters; iter++) {
528 
529  vector<size_t> visitedStates;
530  vector<ActionType> appliedActions;
531 
532  // select a random initial state
533  state = getNonGoalState();
534 
535  // loop that generates the current episode
536  do {
537  // select random action
538  action = actions[twister.i_random(static_cast<int>(actions.size() - 1))];
539 
540  // store the current state and action
541  visitedStates.push_back(state);
542  appliedActions.push_back(action);
543 
544  // generate next state
545  state = applyAction(state, action);
546  } while (!isGoal(state)); // terminate when a goal state is generated
547 
548  vector<size_t> processedStates;
549  vector<ActionType> processedActions;
550 
551  double G = 0;
552 
553  // backwards loop that will update Q values
554  for (size_t i = visitedStates.size(); i-- > 0;) {
555  state = visitedStates[i];
556  action = appliedActions[i];
557 
558  // accumulate the discounted return obtained from this step onwards
559  pair<size_t, size_t> stateCoords = toCoord(state);
560  G = gamma * G + rewards(stateCoords.first, stateCoords.second);
561 
562  // skip the Q update if this state/action pair was already processed in this episode
563  bool processed = false;
564  for (size_t j = 0; j < processedStates.size(); j++) {
565  if (processedStates[j] == state and processedActions[j] == action) {
566  processed = true;
567  break;
568  }
569  }
570 
571  if (processed)
572  continue;
573 
574  processedStates.push_back(state);
575  processedActions.push_back(action);
576 
577  QSum(state, action) += G;
578  visits(state, action)++;
579  }
580  for (state = 0; state < nStates; state++) {
581  // build Q matrix from sums and n. visits
582  for (size_t j = 0; j < actions.size(); j++) {
583  Q(state, j) = (isGoal(state) or visits(state, j) == 0) ? 0 : QSum(state, j) / visits(state, j);
584  }
585  }
586 
587  MatrixD newPolicy = getOptimalPolicyFromQ();
588  if (newPolicy != policy) {
589  policy = newPolicy;
590  if (verbose)
591  cout << iter << endl << prettifyPolicy() << endl;
592  }
593  }
594  }
595 
607  void Sarsa(size_t height, size_t width,
608  vector<pair<size_t, size_t>> goals,
609  double gamma = 1,
610  double alpha = .3,
611  double epsilon = .8,
612  unsigned int maxIters = 10000,
613  bool verbose = true) {
614  initialize(height, width, goals, gamma);
615  Timer timer(1, maxIters);
616  timer.start();
617 
618  for (unsigned int iter = 0; iter < maxIters; iter++) {
619  size_t s = getNonGoalState();
620  ActionType a = eGreedy(s, epsilon);
621 
622  while (!isGoal(s)) {
623  pair<size_t, size_t> stateCoords = toCoord(s);
624  double r = rewards(stateCoords.first, stateCoords.second);
625  size_t newState = applyAction(s, a);
626  ActionType newAction = eGreedy(newState, epsilon);
627  Q(s, a) += alpha * (r + gamma * Q(newState, newAction) - Q(s, a));
628  s = newState;
629  a = newAction;
630  }
631 
632  MatrixD newPolicy = getOptimalPolicyFromQ();
633  if (newPolicy != policy) {
634  policy = newPolicy;
635  if (verbose)
636  cout << iter << endl << prettifyPolicy() << endl;
637  }
638  }
639  }
640 
650  void QLearning(size_t height, size_t width,
651  vector<pair<size_t, size_t>> goals,
652  double gamma = 1,
653  double alpha = .3,
654  double epsilon = .8,
655  unsigned int maxIters = 10000,
656  bool verbose = true) {
657  initialize(height, width, goals, gamma);
658  Timer timer(1, maxIters);
659  timer.start();
660 
661  for (unsigned int iter = 0; iter < maxIters; iter++) {
662  size_t s = getNonGoalState();
663 
664  while (!isGoal(s)) {
665  pair<size_t, size_t> stateCoords = toCoord(s);
666 
667  ActionType a = eGreedy(s, epsilon);
668  double r = rewards(stateCoords.first, stateCoords.second);
669 
670  size_t newState = applyAction(s, a);
671 
672  Q(s, a) += alpha * (r + gamma * bestQForState(newState) - Q(s, a));
673  s = newState;
674  }
675 
676  MatrixD newPolicy = getOptimalPolicyFromQ();
677 
678  if (newPolicy != policy) {
679  policy = newPolicy;
680  if (verbose)
681  cout << iter << endl << prettifyPolicy() << endl;
682  }
683  }
684  }
685 
686  const MatrixD &getV() const {
687  return V;
688  }
689  const MatrixD &getQ() const {
690  return Q;
691  }
692  const MatrixD &getRewards() const {
693  return rewards;
694  }
695  const MatrixD &getPolicy() const {
696  return policy;
697  }
698  double getGamma() const {
699  return gamma;
700  }
701  unsigned long getNStates() const {
702  return nStates;
703  }
704  const vector<pair<size_t, size_t>> &getGoals() const {
705  return goals;
706  }
707  const vector<ActionType> &getActions() const {
708  return actions;
709  }
710 };
711 
712 #endif //MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP
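
For reference, the sketch below shows how the class above might be driven from a hypothetical main.cpp. It is an illustrative assumption, not part of this file: it presumes the header compiles as listed, that MatrixD is default-constructible, and that the std names the header uses unqualified (vector, pair) are visible to the caller; the grid size, goal coordinates and hyperparameters are arbitrary examples.

// main.cpp -- hypothetical usage sketch (not part of GridWorld.hpp)
#include "GridWorld.hpp"

int main() {
  GridWorld gw;

  // 4x4 grid with terminal states in the top-left and bottom-right corners,
  // as in the classic Sutton and Barto gridworld example
  vector<pair<size_t, size_t>> goals = {{0, 0}, {3, 3}};

  // dynamic programming solutions; with verbose = true each method prints the
  // intermediate policies via prettifyPolicy()
  gw.policyIteration(4, 4, goals, 1, .001, true);
  gw.valueIteration(4, 4, goals, 1, .000001, true);

  // sample-based solutions of the same problem
  gw.MonteCarloEstimatingStarts(4, 4, goals, 1, 10000, true);
  gw.Sarsa(4, 4, goals, 1, .3, .8, 10000, true);
  gw.QLearning(4, 4, goals, 1, .3, .8, 10000, true);

  return 0;
}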
Matrix< double > normalizeToOne(MatrixD m)
Normalizes a matrix so its sum equals 1.
Definition: GridWorld.hpp:106
MatrixD getOptimalPolicyFromQ()
Updates the policy matrix according to the action values from the Q matrix.
Definition: GridWorld.hpp:370
ActionType eGreedy(size_t s, double epsilon)
Selects an action for a state s following an e-greedy policy.
Definition: GridWorld.hpp:327
vector< ActionType > actions
Definition: GridWorld.hpp:26
const MatrixD & getRewards() const
Definition: GridWorld.hpp:692
bool isGoal(size_t s)
Informs whether a state is a goal state in the grid world.
Definition: GridWorld.hpp:244
size_t applyAction(size_t currentState, ActionType action)
Returns the next state that results from applying an action to a state.
Definition: GridWorld.hpp:273
const MatrixD & getPolicy() const
Definition: GridWorld.hpp:695
MatrixD policyForState(size_t s)
Gets the policy for state s
Definition: GridWorld.hpp:301
Matrix< double > policyIncrement(size_t s)
Creates a new policy for a given state giving preference to the actions with maximum value...
Definition: GridWorld.hpp:130
Class to solve the grid world toy problem as a Markov decision process.
Definition: GridWorld.hpp:18
void policyIteration(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double threshold=.001, bool verbose=true)
Policy iteration method.
Definition: GridWorld.hpp:403
double actionValue(size_t s, ActionType a)
Gets the q value of action a on state s
Definition: GridWorld.hpp:81
pair< size_t, size_t > toCoord(size_t s)
Transforms a raster coordinate from the grid world into its corresponding row x column representation...
Definition: GridWorld.hpp:68
const vector< ActionType > & getActions() const
Definition: GridWorld.hpp:707
double transition(size_t currentState, ActionType action, size_t nextState)
Returns the transition probability to nextState, given currentState and action
Definition: GridWorld.hpp:257
void MonteCarloEstimatingStarts(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, unsigned maxIters=10000, bool verbose=true)
Monte Carlo Estimating Starts algorithm for finding an optimal policy.
Definition: GridWorld.hpp:509
unsigned long getNStates() const
Definition: GridWorld.hpp:701
const MatrixD & getQ() const
Definition: GridWorld.hpp:689
void start()
Start the timer.
Definition: Timer.hpp:101
double bestQForState(size_t s)
Gets the best action value for state s.
Definition: GridWorld.hpp:356
size_t getNonGoalState()
Selects a random non-goal state.
Definition: GridWorld.hpp:309
const MatrixD & getV() const
Definition: GridWorld.hpp:686
double getGamma() const
Definition: GridWorld.hpp:698
string prettifyPolicy()
Definition: GridWorld.hpp:156
vector< pair< size_t, size_t > > goals
Definition: GridWorld.hpp:23
void initialize(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1)
Definition: GridWorld.hpp:28
void iterativePolicyEvaluation(double threshold, bool verbose)
Iterative policy evaluation implemented as described in Sutton and Barto, 2017.
Definition: GridWorld.hpp:211
unsigned long nStates
Definition: GridWorld.hpp:22
size_t fromCoord(size_t row, size_t col)
Transforms row x column coordinates from the grid world into a raster representation.
Definition: GridWorld.hpp:59
MatrixD rewards
Definition: GridWorld.hpp:20
vector< double > actionValuesForState(size_t s)
Gets the q values of all actions for a given state.
Definition: GridWorld.hpp:115
void valueIteration(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double threshold=.000001, bool verbose=true)
Value iteration method.
Definition: GridWorld.hpp:455
double gamma
Definition: GridWorld.hpp:21
void QLearning(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double alpha=.3, double epsilon=.8, unsigned int maxIters=10000, bool verbose=true)
Temporal difference method for finding the optimal policy using Q-Learning.
Definition: GridWorld.hpp:650
A timer that keeps track of time, prints formatted time, checks if an interval has passed etc...
Definition: Timer.hpp:18
MatrixD Q
Definition: GridWorld.hpp:20
const vector< pair< size_t, size_t > > & getGoals() const
Definition: GridWorld.hpp:704
void Sarsa(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double alpha=.3, double epsilon=.8, unsigned int maxIters=10000, bool verbose=true)
Temporal difference method for finding the optimal policy using SARSA.
Definition: GridWorld.hpp:607
MatrixD policy
Definition: GridWorld.hpp:20
MatrixD V
Definition: GridWorld.hpp:20
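
For reference, the action-value updates implemented in Sarsa and QLearning above correspond to the standard temporal-difference control rules, with learning rate \alpha and discount factor \gamma:

Q(s,a) \leftarrow Q(s,a) + \alpha \bigl[ r + \gamma\, Q(s', a') - Q(s,a) \bigr] \quad \text{(SARSA)}

Q(s,a) \leftarrow Q(s,a) + \alpha \bigl[ r + \gamma \max_{a'} Q(s', a') - Q(s,a) \bigr] \quad \text{(Q-Learning)}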