#ifndef MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP
#define MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP

#include "../include/matrix/Matrix.hpp"
#include "../include/mersenne_twister/MersenneTwister.hpp"

  vector<pair<size_t, size_t>> goals;

  // excerpt from initialize(): validate the goal list and set up the grid world
    if (goals.size() == 0)
      throw invalid_argument("No goal state, must pass at least one");

    this->nStates = height * width;

    // state values start at zero; every state yields a reward of -1 ...
    V = MatrixD::zeros(height, width);
    rewards = MatrixD::fill(height, width, -1);

    // ... except the goal states, which yield 0
      rewards(goal.first, goal.second) = 0;

    if (rewards.unique().sum() == 0)
      throw invalid_argument("All states are goal!");
  // excerpt from fromCoord(): row x column coordinates to a single raster index
    return row * V.nCols() + col;
  // excerpt from toCoord(): raster index back to row x column coordinates
    size_t row = static_cast<size_t>(ceil((s + 1) / (double) V.nCols()) - 1); // same as s / V.nCols()
    size_t col = s - row * V.nCols();
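A worked example of the conversion (grid size chosen here for illustration, not taken from the source): with V.nCols() = 4, state s = 6 gives row = ceil(7 / 4) - 1 = 1 and col = 6 - 1 * 4 = 2, while fromCoord(1, 2) = 1 * 4 + 2 = 6 recovers the raster index, so the two helpers are inverses of each other.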
  // excerpt from actionValue(): expected one-step return of action a in state s
    pair<size_t, size_t> coords = toCoord(s);
    double r = rewards(coords.first, coords.second);
    for (size_t i = 0; i < V.nRows(); ++i) {
      for (size_t j = 0; j < V.nCols(); ++j) {
        // t: transition probability into cell (i, j); v: that cell's current value
        q += t * (r + gamma * v);
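The accumulation above is the usual one-step expected backup for an action value, written here in the notation of Sutton and Barto:

$$q(s, a) = \sum_{s'} p(s' \mid s, a)\,\bigl[r + \gamma\, v(s')\bigr]$$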
  // excerpt from actionValuesForState(): q values of all actions for state s
    vector<double> result(actions.size());

    for (size_t i = 0; i < actions.size(); i++)
  // excerpt from policyIncrement(): greedy policy for a single state
    MatrixD result = MatrixD::zeros(actions.size(), 1);

    double bestQ = actionValues[0];

    // find the best action value for this state
    for (size_t i = 1; i < actionValues.size(); i++) {
      if (actionValues[i] > bestQ)
        bestQ = actionValues[i];
    }

    // mark every action whose value ties with the best one
    for (size_t i = 0; i < actions.size(); i++) {
      if (actionValues[i] == bestQ)
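Assuming the marked entries are then normalized to sum to one, as getOptimalPolicyFromQ() does explicitly with normalizeToOne(), the per-state policy produced here spreads probability uniformly over the maximizing actions; with $A^*(s) = \arg\max_a q(s, a)$:

$$\pi'(a \mid s) = \begin{cases} 1 / |A^*(s)| & \text{if } a \in A^*(s) \\ 0 & \text{otherwise} \end{cases}$$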
  // excerpt (apparently from prettifyPolicy()): choose a display symbol for the
  // set of most probable actions in each cell; the branch bodies are omitted here
    for (size_t i = 0; i < V.nRows(); i++) {
      for (size_t j = 0; j < V.nCols(); ++j) {
        double maxProb = p.max();

        if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb)
        else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(UP, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb)
        else if (p(UP, 0) == maxProb and p(LEFT, 0) == maxProb)
        else if (p(UP, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb)
        else if (p(DOWN, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
        else if (p(UP, 0) == maxProb)
        else if (p(DOWN, 0) == maxProb)
        else if (p(LEFT, 0) == maxProb)
        else if (p(RIGHT, 0) == maxProb)
  // excerpt from iterativePolicyEvaluation(): sweep all states, back up V under
  // the current policy and track the largest change (delta) in each sweep
      for (size_t i = 0; i < V.nRows(); i++) {
        for (size_t j = 0; j < V.nCols(); j++) {
          double currentV = V(i, j), newV = 0;

          double newDelta = abs(currentV - V(i, j));

          if (newDelta > delta)
    } while (delta >= threshold);

      cout << iter << " iterations of policy evaluation" << endl;
  // excerpt from isGoal(): a state is a goal iff its coordinates are in the goal list
    pair<size_t, size_t> stateCoords = toCoord(s);
    return std::find(goals.begin(), goals.end(), stateCoords) != goals.end();
  // excerpt from transition(): movement is deterministic, so the probability of
  // reaching nextState is 1 if applying the action leads there and 0 otherwise
    size_t resultingState = applyAction(currentState, action);
    return resultingState == nextState;
  // excerpt from applyAction(): move one cell in the chosen direction, staying
  // in place when the move would leave the grid
    pair<size_t, size_t> s1 = toCoord(currentState);
    size_t s1row = s1.first, s1col = s1.second, s2row, s2col;

    switch (action) {
      case UP:
        s2col = s1col;
        s2row = s1row != 0 ? s1row - 1 : s1row;
        break;
      case DOWN:
        s2col = s1col;
        s2row = s1row != V.nRows() - 1 ? s1row + 1 : s1row; // bound rows by nRows(), not nCols()
        break;
      case LEFT:
        s2row = s1row;
        s2col = s1col != 0 ? s1col - 1 : s1col;
        break;
      case RIGHT:
        s2row = s1row;
        s2col = s1col != V.nCols() - 1 ? s1col + 1 : s1col; // bound columns by nCols(), not nRows()
        break;
  // excerpt from getNonGoalState(): draw a random state index
      s = static_cast<size_t>(t.i_random(static_cast<int>(nStates - 1)));
  // excerpt from eGreedy(): note that in this implementation a draw at or below
  // epsilon triggers the greedy branch, so epsilon is the probability of exploiting
    if (t.d_random() <= epsilon) {
      double bestQ = Q(s, 0);

      for (size_t j = 1; j < actions.size(); j++) {
        if (bestQ <= Q(s, j)) {
          // ties between equally valued actions are broken by a coin flip
          if (bestQ == Q(s, j))
            bestAction = t.d_random(1) <= .5 ? actions[j] : bestAction;
  // excerpt from bestQForState(): highest action value available in state s
    double bestQ = Q(s, 0);

    for (size_t j = 1; j < actions.size(); j++) {
      if (bestQ < Q(s, j)) {
  // excerpt from getOptimalPolicyFromQ(): make each state's policy greedy with
  // respect to Q, splitting probability evenly among tied actions
    for (size_t state = 0; state < nStates; state++) {
      double bestQ = Q(state, 0);
      for (size_t j = 1; j < actions.size(); j++) {
        if (bestQ < Q(state, j))
          bestQ = Q(state, j);
      }

      for (size_t j = 0; j < actions.size(); j++) {
        if (Q(state, j) == bestQ)
          newPolicy(state, j) = 1;
      }

      newPolicy.setRow(state, normalizeToOne(newPolicy.getRow(state).transpose()));
  // excerpt from policyIteration()
  void policyIteration(size_t height, size_t width,
                       vector<pair<size_t, size_t>> goals,
                       double gamma = 1,
                       double threshold = .001,
                       bool verbose = true) {

    MatrixD currentPolicy;

    do {
      // evaluate the current policy, then sweep all states improving it greedily
      for (size_t i = 0; i < V.nRows(); ++i) {
        for (size_t j = 0; j < V.nCols(); ++j) {

          if (currentStatePolicy != betterStatePolicy)
            policy.setRow(state, betterStatePolicy.transpose());

      if (verbose)
        cout << "iteration " << iter << " of policy improvement" << endl
             << prettifyPolicy() << endl;
    } while (currentPolicy != policy);
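The improvement half of the loop is the usual greedy update with respect to the value function of the current policy, and the outer loop stops once no state's policy changes:

$$\pi'(s) = \arg\max_{a} \sum_{s'} p(s' \mid s, a)\,\bigl[r + \gamma\, v_{\pi}(s')\bigr]$$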
  // excerpt from valueIteration()
  void valueIteration(size_t height, size_t width,
                      vector<pair<size_t, size_t>> goals,
                      double gamma = 1,
                      double threshold = .000001,
                      bool verbose = true) {

      for (size_t i = 0; i < V.nRows(); i++) {
        for (size_t j = 0; j < V.nCols(); j++) {

          // keep the index of the highest-valued action for this state
          for (size_t k = 1; k < actionValues.size(); k++) {
            if (actionValues[k] > actionValues[action]) {

          // back up V with the best action value and track the largest change
          double currentV = V(i, j);
          V(i, j) = actionValues[action];

          double newDelta = abs(currentV - V(i, j));

          if (newDelta > delta)

      if (verbose)
        cout << "iteration " << iter << endl << prettifyPolicy() << endl;
    } while (delta >= threshold);
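Each sweep therefore applies the Bellman optimality backup, stopping when the largest per-state change drops below the threshold:

$$v(s) \leftarrow \max_{a} \sum_{s'} p(s' \mid s, a)\,\bigl[r + \gamma\, v(s')\bigr]$$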
  // excerpt from MonteCarloEstimatingStarts()
  void MonteCarloEstimatingStarts(size_t height, size_t width,
                                  vector<pair<size_t, size_t>> goals,
                                  double gamma = 1,
                                  unsigned maxIters = 10000,
                                  bool verbose = true) {

    MersenneTwister twister;

    chrono::time_point<chrono::system_clock> start = myClock::now();
    Timer timer(1, maxIters);

    for (unsigned iter = 0; iter < maxIters; iter++) {
      // generate an episode from a random start, recording every state/action pair
      vector<size_t> visitedStates;
      vector<ActionType> appliedActions;

        action = actions[twister.i_random(static_cast<int>(actions.size() - 1))];

        visitedStates.push_back(state);
        appliedActions.push_back(action);

      // walk the episode backwards, accumulating the return G
      vector<size_t> processedStates;
      vector<ActionType> processedActions;

      // note: this loop stops before index 0, so the first state/action pair of
      // the episode is never processed
      for (size_t i = visitedStates.size() - 1; i != 0; i--) {
        state = visitedStates[i];
        action = appliedActions[i];

        // skip state/action pairs already seen in this backwards pass
        bool processed = false;
        for (size_t j = 0; j < processedStates.size(); j++) {
          if (processedStates[j] == state and processedActions[j] == action) {

        processedStates.push_back(state);
        processedActions.push_back(action);

        pair<size_t, size_t> stateCoords = toCoord(state);

        G += rewards(stateCoords.first, stateCoords.second);

        QSum(state, action) += G;
        visits(state, action)++;

      // Q is the average of the returns recorded for each state/action pair
      for (state = 0; state < nStates; state++) {
        for (size_t j = 0; j < actions.size(); j++) {
          Q(state, j) = isGoal(state) ? 0 : QSum(state, j) / visits(state, j);

      if (newPolicy != policy) {
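The tabulated estimate is simply the empirical mean of the returns recorded for each state/action pair (goal states are pinned at zero):

$$Q(s, a) \approx \frac{\mathrm{QSum}(s, a)}{\mathrm{visits}(s, a)} = \frac{1}{N(s, a)} \sum_{k=1}^{N(s, a)} G_k$$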
  // excerpt from Sarsa()
  void Sarsa(size_t height, size_t width,
             vector<pair<size_t, size_t>> goals,
             double gamma = 1,
             double alpha = .3,
             double epsilon = .8,
             unsigned int maxIters = 10000,
             bool verbose = true) {

    Timer timer(1, maxIters);

    for (unsigned int iter = 0; iter < maxIters; iter++) {

        pair<size_t, size_t> stateCoords = toCoord(s);
        double r = rewards(stateCoords.first, stateCoords.second);

        // on-policy TD update: newAction is the action actually chosen by the
        // e-greedy policy in the next state
        Q(s, a) += alpha * (r + gamma * Q(newState, newAction) - Q(s, a));

      if (newPolicy != policy) {
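Written as an update rule, this is the standard on-policy SARSA backup with step size alpha:

$$Q(s, a) \leftarrow Q(s, a) + \alpha\,\bigl[r + \gamma\, Q(s', a') - Q(s, a)\bigr]$$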
  // excerpt from QLearning()
  void QLearning(size_t height, size_t width,
                 vector<pair<size_t, size_t>> goals,
                 double gamma = 1,
                 double alpha = .3,
                 double epsilon = .8,
                 unsigned int maxIters = 10000,
                 bool verbose = true) {

    Timer timer(1, maxIters);

    for (unsigned int iter = 0; iter < maxIters; iter++) {

        pair<size_t, size_t> stateCoords = toCoord(s);

        double r = rewards(stateCoords.first, stateCoords.second);

      if (newPolicy != policy) {
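The excerpt omits the Q update line itself; per the method's description it implements Q-Learning, whose standard off-policy rule (bootstrapping from the best action value in the next state, as bestQForState() computes) is:

$$Q(s, a) \leftarrow Q(s, a) + \alpha\,\bigl[r + \gamma \max_{a'} Q(s', a') - Q(s, a)\bigr]$$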
  const vector<pair<size_t, size_t>> &getGoals() const {

#endif //MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP

Matrix< double > normalizeToOne(MatrixD m)
Normalizes a matrix so its sum equals 1.
MatrixD getOptimalPolicyFromQ()
Updates the policy matrix according to the action values from the Q matrix. 
ActionType eGreedy(size_t s, double epsilon)
Selects an action for a state s following an e-greedy policy. 
vector< ActionType > actions
const MatrixD & getRewards() const
bool isGoal(size_t s)
Informs whether a state is a goal state in the grid world. 
size_t applyAction(size_t currentState, ActionType action)
Returns the next state that results from applying an action to a state. 
const MatrixD & getPolicy() const
MatrixD policyForState(size_t s)
Gets the policy for state s.
Matrix< double > policyIncrement(size_t s)
Creates a new policy for a given state giving preference to the actions with maximum value...
Class to solve the grid world toy problem as a Markov decision process. 
void policyIteration(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double threshold=.001, bool verbose=true)
Policy iteration method. 
double actionValue(size_t s, ActionType a)
Gets the q value of action a on state s.
pair< size_t, size_t > toCoord(size_t s)
Transforms a raster coordinate from the grid world into its corresponding row x column representation...
const vector< ActionType > & getActions() const
double transition(size_t currentState, ActionType action, size_t nextState)
Returns the transition probability to nextState, given currentState and action 
void MonteCarloEstimatingStarts(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, unsigned maxIters=10000, bool verbose=true)
Monte Carlo Estimating Starts algorithm for finding an optimal policy. 
unsigned long getNStates() const
const MatrixD & getQ() const
void start()
Start the timer. 
double bestQForState(size_t s)
Gets the best action value for state s. 
size_t getNonGoalState()
Selects a random non-goal state. 
const MatrixD & getV() const
vector< pair< size_t, size_t > > goals
void initialize(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1)
void iterativePolicyEvaluation(double threshold, bool verbose)
Iterative policy evaluation implemented as described in Sutton and Barto, 2017.
size_t fromCoord(size_t row, size_t col)
Transforms row x column coordinates from the grid world into a raster representation. 
vector< double > actionValuesForState(size_t s)
Gets the q values of all actions for a given state. 
void valueIteration(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double threshold=.000001, bool verbose=true)
Value iteration method. 
void QLearning(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double alpha=.3, double epsilon=.8, unsigned int maxIters=10000, bool verbose=true)
Temporal difference method for finding the optimal policy using Q-Learning. 
A timer that keeps track of time, prints formatted time, checks if an interval has passed etc...
const vector< pair< size_t, size_t > > & getGoals() const
void Sarsa(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double alpha=.3, double epsilon=.8, unsigned int maxIters=10000, bool verbose=true)
Temporal difference method for finding the optimal policy using SARSA.
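A minimal usage sketch for the solvers listed above. This is an assumption-laden illustration rather than code from the repository: the class name (GridWorld below), the header path, and streaming a Matrix to cout are guesses; only the method signatures come from the documentation above.

    #include <iostream>
    #include <vector>
    #include <utility>
    #include "DynamicProgramming.hpp"  // assumed header path

    int main() {
      using namespace std;

      // 4 x 4 grid with goal states in the top-left and bottom-right corners
      vector<pair<size_t, size_t>> goals = {{0, 0}, {3, 3}};

      GridWorld world;  // hypothetical name for the class documented above

      // dynamic programming solvers
      world.policyIteration(4, 4, goals, 1, .001, true);
      world.valueIteration(4, 4, goals);

      // sample-based solvers
      world.MonteCarloEstimatingStarts(4, 4, goals);
      world.Sarsa(4, 4, goals, 1, .3, .8, 10000, false);
      world.QLearning(4, 4, goals);

      cout << world.getV() << endl;  // assumes Matrix provides operator<<
      return 0;
    }

The getters listed above (getV(), getQ(), getPolicy()) can then be used to inspect the results after any of the calls.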