#ifndef MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP
#define MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP

#include "../include/matrix/Matrix.hpp"
#include "../include/mersenne_twister/MersenneTwister.hpp"

// goal states of the grid world, stored as (row, column) pairs
vector<pair<size_t, size_t>> goals;
// initialize(height, width, goals, gamma): validate the goals and set up the grid world
if (goals.size() == 0)
  throw invalid_argument("No goal state, must pass at least one");

this->nStates = height * width;
V = MatrixD::zeros(height, width);
rewards = MatrixD::fill(height, width, -1);

for (const auto &goal : goals)
  rewards(goal.first, goal.second) = 0;

if (rewards.unique().sum() == 0)
  throw invalid_argument("All states are goal!");
// fromCoord(row, col): row x column coordinates -> raster index
return row * V.nCols() + col;
// toCoord(s): raster index -> row x column coordinates
size_t row = static_cast<size_t>(ceil((s + 1) / (double) V.nCols()) - 1);
size_t col = s - row * V.nCols();
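The two conversions are inverse to each other. As a worked example (not in the original file), take a grid with nCols = 4 and raster index s = 6:

  row = ceil((6 + 1) / 4) - 1 = 2 - 1 = 1
  col = 6 - 1 * 4 = 2
  fromCoord(1, 2) = 1 * 4 + 2 = 6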
// actionValue(s, a): expected return of taking action a in state s
pair<size_t, size_t> coords = toCoord(s);
double r = rewards(coords.first, coords.second);
for (size_t i = 0; i < V.nRows(); ++i) {
  for (size_t j = 0; j < V.nCols(); ++j) {
    // t: transition probability into cell (i, j); v: its current value V(i, j)
    q += t * (r + gamma * v);
  }
}
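Written out, the quantity accumulated over all successor cells is the standard one-step backup, with r taken as the reward of the current state as in this code:

q(s, a) = \sum_{s'} p(s' \mid s, a)\,\bigl[r(s) + \gamma\,V(s')\bigr]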
// actionValuesForState(s): q values of every action for state s
vector<double> result(actions.size());
for (size_t i = 0; i < actions.size(); i++)
  result[i] = actionValue(s, actions[i]);
// policyIncrement(s): put probability mass only on the actions with the best q value
MatrixD result = MatrixD::zeros(actions.size(), 1);

double bestQ = actionValues[0];
for (size_t i = 1; i < actionValues.size(); i++) {
  if (actionValues[i] > bestQ)
    bestQ = actionValues[i];
}

for (size_t i = 0; i < actions.size(); i++) {
  if (actionValues[i] == bestQ)
    result(i, 0) = 1;
}
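After normalization, the row produced for state s is the uniform distribution over the greedy actions:

\pi(a \mid s) =
\begin{cases}
  1 / |A^*(s)| & \text{if } a \in A^*(s) = \arg\max_{a'} q(s, a') \\
  0            & \text{otherwise}
\end{cases}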
// prettifyPolicy: print one symbol per cell, chosen by which actions are tied at the
// maximum probability for that state (the printed symbols are on lines omitted here)
for (size_t i = 0; i < V.nRows(); i++) {
  for (size_t j = 0; j < V.nCols(); ++j) {
    MatrixD p = policyForState(fromCoord(i, j));
    double maxProb = p.max();
    if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb)
    else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(UP, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(UP, 0) == maxProb and p(DOWN, 0) == maxProb)
    else if (p(UP, 0) == maxProb and p(LEFT, 0) == maxProb)
    else if (p(UP, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(DOWN, 0) == maxProb and p(LEFT, 0) == maxProb)
    else if (p(DOWN, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(LEFT, 0) == maxProb and p(RIGHT, 0) == maxProb)
    else if (p(UP, 0) == maxProb)
    else if (p(DOWN, 0) == maxProb)
    else if (p(LEFT, 0) == maxProb)
    else if (p(RIGHT, 0) == maxProb)
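A more compact alternative to the chain above (a sketch, not the original code: the symbol strings in the table are placeholders, and it assumes UP/DOWN/LEFT/RIGHT can be tested individually as done above) is to encode the tied actions as a 4-bit mask and index a 16-entry lookup table:

// inside the (i, j) loop of prettifyPolicy, after maxProb has been computed
int mask = 0;
if (p(UP, 0) == maxProb)    mask |= 1;
if (p(DOWN, 0) == maxProb)  mask |= 2;
if (p(LEFT, 0) == maxProb)  mask |= 4;
if (p(RIGHT, 0) == maxProb) mask |= 8;
// one entry per subset of {UP, DOWN, LEFT, RIGHT}; placeholder strings, substitute the real symbols
static const char *symbols[16] = {
    "?", "U", "D", "UD", "L", "UL", "DL", "UDL",
    "R", "UR", "DR", "UDR", "LR", "ULR", "DLR", "UDLR"};
cout << symbols[mask];  // the original appends its symbol to the prettified string instead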
// iterativePolicyEvaluation: sweep every state until the value function stops changing
do {
  for (size_t i = 0; i < V.nRows(); i++) {
    for (size_t j = 0; j < V.nCols(); j++) {
      double currentV = V(i, j), newV = 0;
      // (omitted lines accumulate the expected one-step return into newV and store it in V(i, j))
      double newDelta = abs(currentV - V(i, j));
      if (newDelta > delta)
        delta = newDelta;
    }
  }
} while (delta >= threshold);

cout << iter << " iterations of policy evaluation" << endl;
// isGoal(s): true if the coordinates of s appear in the list of goal states
pair<size_t, size_t> stateCoords = toCoord(s);
return std::find(goals.begin(), goals.end(), stateCoords) != goals.end();

// transition(currentState, action, nextState): 1 if the action leads to nextState, 0 otherwise
size_t resultingState = applyAction(currentState, action);
return resultingState == nextState;
// applyAction: move one cell in the given direction, staying in place at the border
pair<size_t, size_t> s1 = toCoord(currentState);
size_t s1row = s1.first, s1col = s1.second, s2row, s2col;
switch (action) {
  case UP:
    s2col = s1col;
    s2row = s1row != 0 ? s1row - 1 : s1row;
    break;
  case DOWN:
    s2col = s1col;
    s2row = s1row != V.nRows() - 1 ? s1row + 1 : s1row;  // row bound is nRows(), not nCols()
    break;
  case LEFT:
    s2row = s1row;
    s2col = s1col != 0 ? s1col - 1 : s1col;
    break;
  case RIGHT:
    s2row = s1row;
    s2col = s1col != V.nCols() - 1 ? s1col + 1 : s1col;  // column bound is nCols(), not nRows()
    break;
}
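A quick check of the border behaviour (a sketch, not part of the original file; it assumes it runs inside the class so the members are in scope): moving right from the top-left corner advances one column, while moving up from it stays put.

size_t topLeft = fromCoord(0, 0);
assert(applyAction(topLeft, RIGHT) == fromCoord(0, 1));  // moves one column to the right
assert(applyAction(topLeft, UP) == topLeft);             // blocked by the border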
// getNonGoalState: sample a random state index in [0, nStates - 1]
s = static_cast<size_t>(t.i_random(static_cast<int>(nStates - 1)));
// eGreedy: note that in this code epsilon is the probability of acting greedily
if (t.d_random() <= epsilon) {
  // exploit: pick the action with the highest Q value, breaking ties at random
  double bestQ = Q(s, 0);
  for (size_t j = 1; j < actions.size(); j++) {
    if (bestQ <= Q(s, j)) {
      if (bestQ == Q(s, j))
        bestAction = t.d_random(1) <= .5 ? actions[j] : bestAction;
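With that convention, and assuming the exploration branch (the else, omitted from this listing) draws uniformly from all |A| actions, the behaviour policy assigns:

\pi(a \mid s) =
\begin{cases}
  \epsilon + \dfrac{1 - \epsilon}{|A|} & \text{if } a = \arg\max_{a'} Q(s, a') \\[4pt]
  \dfrac{1 - \epsilon}{|A|}            & \text{otherwise}
\end{cases}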
// bestQForState(s): largest action value currently stored for state s
double bestQ = Q(s, 0);
for (size_t j = 1; j < actions.size(); j++) {
  if (bestQ < Q(s, j)) {
    bestQ = Q(s, j);
// getOptimalPolicyFromQ: make the policy greedy with respect to the current Q matrix
for (size_t state = 0; state < nStates; state++) {
  // find the best action value for this state
  double bestQ = Q(state, 0);
  for (size_t j = 1; j < actions.size(); j++) {
    if (bestQ < Q(state, j))
      bestQ = Q(state, j);
  }
  // mark every action tied with the best value, then normalize the row
  for (size_t j = 0; j < actions.size(); j++) {
    if (Q(state, j) == bestQ)
      newPolicy(state, j) = 1;
  }
  newPolicy.setRow(state, normalizeToOne(newPolicy.getRow(state).transpose()));
}
// policyIteration: alternate policy evaluation and policy improvement until the policy is stable
void policyIteration(size_t height, size_t width,
                     vector<pair<size_t, size_t>> goals,
                     double gamma = 1,
                     double threshold = .001,
                     bool verbose = true) {
  MatrixD currentPolicy;
  do {
    // policy improvement: make the policy greedy with respect to the freshly evaluated V
    // (currentStatePolicy and betterStatePolicy are computed on lines omitted from this listing)
    for (size_t i = 0; i < V.nRows(); ++i) {
      for (size_t j = 0; j < V.nCols(); ++j) {
        if (currentStatePolicy != betterStatePolicy)
          policy.setRow(state, betterStatePolicy.transpose());
      }
    }
    if (verbose) cout << "iteration " << iter << " of policy improvement" << endl << prettifyPolicy() << endl;
  } while (currentPolicy != policy);
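A minimal usage sketch for the classic 4 x 4 grid world with goals in two opposite corners. This is an assumption, not code from this file: the class name GridWorld and the include path are hypothetical stand-ins for the class documented here.

#include "DynamicProgramming.hpp"

int main() {
  GridWorld gw;  // hypothetical name for the class documented in this header
  // 4 x 4 grid, goal states at the top-left and bottom-right corners
  gw.policyIteration(4, 4, {{0, 0}, {3, 3}}, /*gamma=*/1, /*threshold=*/.001, /*verbose=*/true);
  return 0;
}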
// valueIteration: back up each state with its best action value until V stops changing
void valueIteration(size_t height, size_t width,
                    vector<pair<size_t, size_t>> goals,
                    double gamma = 1,
                    double threshold = .000001,
                    bool verbose = true) {
  do {
    for (size_t i = 0; i < V.nRows(); i++) {
      for (size_t j = 0; j < V.nCols(); j++) {
        // pick the index of the best action value for this state
        for (size_t k = 1; k < actionValues.size(); k++) {
          if (actionValues[k] > actionValues[action]) {
            action = k;
          }
        }
        double currentV = V(i, j);
        V(i, j) = actionValues[action];
        double newDelta = abs(currentV - V(i, j));
        if (newDelta > delta)
          delta = newDelta;
      }
    }
    if (verbose) cout << "iteration " << iter << endl << prettifyPolicy() << endl;
  } while (delta >= threshold);
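The backup applied to each cell is the value iteration update, i.e. the Bellman optimality operator:

V(s) \leftarrow \max_{a} \sum_{s'} p(s' \mid s, a)\,\bigl[r(s) + \gamma\,V(s')\bigr]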
// MonteCarloEstimatingStarts: estimate Q from complete episodes started at random state-action pairs
void MonteCarloEstimatingStarts(size_t height, size_t width,
                                vector<pair<size_t, size_t>> goals,
                                double gamma = 1,
                                unsigned maxIters = 10000,
                                bool verbose = true) {
  MersenneTwister twister;
  chrono::time_point<chrono::system_clock> start = myClock::now();
  Timer timer(1, maxIters);

  for (unsigned iter = 0; iter < maxIters; iter++) {
    // generate an episode, recording the visited states and the actions applied to them
    vector<size_t> visitedStates;
    vector<ActionType> appliedActions;

    action = actions[twister.i_random(static_cast<int>(actions.size() - 1))];

    visitedStates.push_back(state);
    appliedActions.push_back(action);
    // walk the episode backwards, accumulating returns for each (state, action) pair
    vector<size_t> processedStates;
    vector<ActionType> processedActions;

    for (size_t i = visitedStates.size() - 1; i != (size_t) 0; i--) {
      state = visitedStates[i];
      action = appliedActions[i];

      bool processed = false;
      for (size_t j = 0; j < processedStates.size(); j++) {
        // compare against the pairs already processed (indexing with j, not the outer i)
        if (processedStates[j] == state and processedActions[j] == action) {
          processed = true;
        }
      }
      // (lines omitted here act on the processed flag)

      processedStates.push_back(state);
      processedActions.push_back(action);

      pair<size_t, size_t> stateCoords = toCoord(state);
      G += rewards(stateCoords.first, stateCoords.second);
      QSum(state, action) += G;
      visits(state, action)++;
    // Q is the average return observed for each state-action pair; goal states stay at zero
    for (state = 0; state < nStates; state++) {
      for (size_t j = 0; j < actions.size(); j++) {
        Q(state, j) = isGoal(state) ? 0 : QSum(state, j) / visits(state, j);
      }
    }

    if (newPolicy != policy) {
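The running sums above implement the usual Monte Carlo estimate: each pair's value is the average of the (discounted) returns that followed it over all episodes, with the discounting applied on lines omitted from this listing:

G_t = \sum_{k \ge 0} \gamma^{k}\, r_{t+k+1}, \qquad
Q(s, a) = \frac{\mathrm{QSum}(s, a)}{\mathrm{visits}(s, a)} \approx \mathbb{E}\,[\,G_t \mid s_t = s,\ a_t = a\,]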
// Sarsa: on-policy temporal difference control
void Sarsa(size_t height, size_t width,
           vector<pair<size_t, size_t>> goals,
           double gamma = 1,
           double alpha = .3,
           double epsilon = .8,
           unsigned int maxIters = 10000,
           bool verbose = true) {
  Timer timer(1, maxIters);

  for (unsigned int iter = 0; iter < maxIters; iter++) {
    pair<size_t, size_t> stateCoords = toCoord(s);
    double r = rewards(stateCoords.first, stateCoords.second);

    // SARSA update: bootstrap on the action actually selected for the next state
    Q(s, a) += alpha * (r + gamma * Q(newState, newAction) - Q(s, a));

    if (newPolicy != policy) {
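Written out, the update applied to Q(s, a) above is the standard SARSA rule:

Q(s, a) \leftarrow Q(s, a) + \alpha\,\bigl[r + \gamma\,Q(s', a') - Q(s, a)\bigr]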
// QLearning: off-policy temporal difference control
void QLearning(size_t height, size_t width,
               vector<pair<size_t, size_t>> goals,
               double gamma = 1,
               double alpha = .3,
               double epsilon = .8,
               unsigned int maxIters = 10000,
               bool verbose = true) {
  Timer timer(1, maxIters);

  for (unsigned int iter = 0; iter < maxIters; iter++) {
    pair<size_t, size_t> stateCoords = toCoord(s);
    double r = rewards(stateCoords.first, stateCoords.second);

    if (newPolicy != policy) {
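The Q-Learning update itself falls on lines omitted from this listing; per the member description it is the standard off-policy rule, which bootstraps on the best action of the next state rather than the one actually taken:

Q(s, a) \leftarrow Q(s, a) + \alpha\,\bigl[r + \gamma\,\max_{a'} Q(s', a') - Q(s, a)\bigr]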
const vector<pair<size_t, size_t>> &getGoals() const {
  return goals;
}

#endif //MACHINE_LEARNING_DYNAMICPROGRAMMING_HPP

Matrix< double > normalizeToOne(MatrixD m)
Normalizes a matrix so its sum equals 1.
MatrixD getOptimalPolicyFromQ()
Updates the policy matrix according to the action values from the Q matrix.
ActionType eGreedy(size_t s, double epsilon)
Selects an action for a state s following an e-greedy policy.
vector< ActionType > actions
const MatrixD & getRewards() const
bool isGoal(size_t s)
Informs whether a state is a goal state in the grid world.
size_t applyAction(size_t currentState, ActionType action)
Returns the next state that results from applying an action to a state.
const MatrixD & getPolicy() const
MatrixD policyForState(size_t s)
Gets the policy for state s
Matrix< double > policyIncrement(size_t s)
Creates a new policy for a given state giving preference to the actions with maximum value...
Class to solve the grid world toy problem as a Markov decision process.
void policyIteration(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double threshold=.001, bool verbose=true)
Policy iteration method.
double actionValue(size_t s, ActionType a)
Gets the q value of action a on state s
pair< size_t, size_t > toCoord(size_t s)
Transforms a raster coordinate from the grid world into its corresponding row x column representation...
const vector< ActionType > & getActions() const
double transition(size_t currentState, ActionType action, size_t nextState)
Returns the transition probability to nextState, given currentState and action
void MonteCarloEstimatingStarts(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, unsigned maxIters=10000, bool verbose=true)
Monte Carlo Estimating Starts algorithm for finding an optimal policy.
unsigned long getNStates() const
const MatrixD & getQ() const
void start()
Start the timer.
double bestQForState(size_t s)
Gets the best action value for state s.
size_t getNonGoalState()
Selects a random non-goal state.
const MatrixD & getV() const
vector< pair< size_t, size_t > > goals
void initialize(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1)
void iterativePolicyEvaluation(double threshold, bool verbose)
Iterative policy evaluation implemented as described in Sutton and Barto, 2017.
size_t fromCoord(size_t row, size_t col)
Transforms row x column coordinates from the grid world into a raster representation.
vector< double > actionValuesForState(size_t s)
Gets the q values of all actions for a given state.
void valueIteration(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double threshold=.000001, bool verbose=true)
Value iteration method.
void QLearning(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double alpha=.3, double epsilon=.8, unsigned int maxIters=10000, bool verbose=true)
Temporal difference method for finding the optimal policy using Q-Learning.
A timer that keeps track of time, prints formatted time, checks if an interval has passed etc...
const vector< pair< size_t, size_t > > & getGoals() const
void Sarsa(size_t height, size_t width, vector< pair< size_t, size_t >> goals, double gamma=1, double alpha=.3, double epsilon=.8, unsigned int maxIters=10000, bool verbose=true)
Temporal difference method for finding the optimal policy using SARSA.