(use-package :RLI)

(defun complete-example ()
  (let (agent env sim)
    (setq agent (make-instance 'my-Q-agent :alpha .01 :gamma .99))
    (setq env (make-instance 'maintenance-task))
    (setq sim (make-instance 'my-simulation))
    (sim-init sim agent env)
    (sim-steps sim 10000)))

(defclass MY-SIMULATION (simulation)
  ((sum-rewards :initform 0.0)
   (num-rewards :initform 0)
   (sum-interval :initform 1000 :initarg :sum-interval)))

(defmethod sim-collect-data ((sim my-simulation) s a s-prime r)
  (declare (ignore s a s-prime))
  (with-slots (sum-rewards num-rewards sum-interval) sim
    (incf sum-rewards r)
    (incf num-rewards)
    (when (= num-rewards sum-interval)
      (format t "~%Average reward over ~A steps: ~A"
              sum-interval (/ sum-rewards num-rewards))
      (setq sum-rewards 0.0)
      (setq num-rewards 0))))

(defclass MY-Q-AGENT (Q-table tabular-1step-Q-learning egreedy-policy agent)
  (last-sensation last-action))

(defmethod agent-start-episode ((agent my-Q-agent) sensation)
  (with-slots (last-sensation last-action) agent
    (setq last-sensation sensation)
    (setq last-action (policy agent sensation))))

(defclass Q-TABLE ()
  ((num-states :accessor num-states :initarg :num-states)
   (num-actions :accessor num-actions :initarg :num-actions)
   (initial-value :accessor initial-value :initarg :initial-value :initform 0)
   (Q)))

(defmethod agent-init :before ((agent Q-table))
  (with-slots (Q num-states num-actions initial-value) agent
    (setq num-states (num-states (agent-env agent)))
    (setq num-actions (num-actions (agent-env agent)))
    (setf Q (make-array (list num-states num-actions)
                        :initial-element initial-value))))

(defmethod action-values ((agent Q-table) s)
  (with-slots (Q num-actions) agent
    (loop for a below num-actions collect (aref Q s a))))

(defmethod state-value ((agent Q-table) s)
  (if (eq s :terminal-state)
      0
      (apply #'max (action-values agent s))))

(defclass TABULAR-1STEP-Q-LEARNING ()
  ((alpha :initarg :alpha :initform 0.1)
   (gamma :accessor gamma :initarg :gamma :initform .9)))

(defmethod agent-step ((agent tabular-1step-Q-learning) s-prime r)
  ;; The Q, last-sensation, and last-action slots come from the other
  ;; mixins when the classes are combined, as in my-Q-agent above.
  (with-slots (Q alpha gamma last-sensation last-action) agent
    (let ((s last-sensation)
          (a last-action))
      ;; One-step Q-learning backup:
      ;;   Q(s,a) <- Q(s,a) + alpha [r + gamma max_a' Q(s',a') - Q(s,a)]
      (incf (aref Q s a)
            (* alpha (+ r
                        (* gamma (state-value agent s-prime))
                        (- (aref Q s a)))))
      (setq last-sensation s-prime)
      (setq last-action (policy agent s-prime)))))

(defclass EGREEDY-POLICY ()
  ((epsilon :accessor agent-epsilon :initarg :epsilon :initform 0)))

(defmethod policy ((agent egreedy-policy) state)
  ;; num-actions is supplied by the Q-table mixin when the classes are combined.
  (with-slots (epsilon num-actions) agent
    (with-prob epsilon
               (random num-actions)                                      ; explore
               (arg-max-random-tiebreak (action-values agent state)))))  ; exploit

(defun arg-max-random-tiebreak (list)
  "Returns an index to the largest value in the non-null list,
breaking ties randomly, and the largest value as a second value."
  (loop with best-args = (list 0)
        with best-value = (first list)
        for i from 1
        for value in (rest list)
        do (cond ((< value best-value))          ; not a new max; do nothing
                 ((> value best-value)           ; strictly better: restart tie list
                  (setq best-value value)
                  (setq best-args (list i)))
                 ((= value best-value)           ; tie: remember this index too
                  (push i best-args)))
        finally (return (values (nth (random (length best-args)) best-args)
                                best-value))))

;;; TABULAR ENVIRONMENTS

(defclass FINITE-MDP (environment)
  ((num-states :initarg :num-states :accessor num-states)
   (num-actions :initarg :num-actions :accessor num-actions)
   (state :initarg :initial-state :initform 0)))

(defmethod env-start-episode ((env finite-MDP))
  (with-slots (state) env
    (setq state 0)))

(defmethod env-step ((env finite-MDP) action)
  (with-slots (state) env
    (let ((old-state state))
      (values (setq state (env-next-state env state action))
              (env-next-reward env old-state state action)))))

;; This completes the lowest-level specification of a Markov decision task.
;; It is something which can return samples of the next state and reward.
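;; As a minimal sketch (not part of RLI), the environment protocol above can
;; be driven by hand once env-next-state and env-next-reward are defined for
;; a concrete subclass.  sketch-random-walkthrough is a hypothetical name;
;; it assumes RLI's env-init primary method exists and takes the environment,
;; as the :after method further below implies, and that sim-init would
;; normally be the one to call it.

(defun sketch-random-walkthrough (env num-steps)
  "Drive ENV for NUM-STEPS steps, choosing actions uniformly at random,
and return the total reward.  Illustrative only."
  (env-init env)              ; sim-init would normally do this
  (env-start-episode env)
  (loop repeat num-steps
        sum (multiple-value-bind (s-prime r)
                (env-step env (random (num-actions env)))
              (declare (ignore s-prime))
              r)))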
;; Next we have an example MDP: the maintenance task.  This is a continual
;; task, with no episodes or resets.  You are running a machine to maximize
;; reward.  The only way to earn reward is to operate the machine.  If that
;; works, you earn $1 and proceed from your current state i to state i+1.
;; But the machine might also break.  Then you go to a broken state (with
;; zero reward), where you stay with probability Q on each step until you
;; get out to state 0.  If you choose not to operate the machine, you can
;; instead do maintenance.  This earns no reward and takes you back to
;; state 0.  The probability that operating the machine in state i will
;; work is P^(i+1), except in state N, where operating never works.
;; Counting the broken state, there are N+2 states.

(defclass MAINTENANCE-TASK (finite-MDP)
  ((N :initform 10 :initarg :N)
   (P :initform .9 :initarg :P)
   (Q :initform .9 :initarg :Q)
   (num-actions :initform 2)))

(defmethod env-init :after ((env maintenance-task))
  (with-slots (N num-states) env
    (setf num-states (+ N 2))))

(defmethod env-next-state ((env maintenance-task) x a)
  (with-slots (N P Q) env
    (cond ((= x (+ N 1))                  ; already broken:
           (with-prob Q x 0))             ;   with prob Q stay broken, else reset to 0
          ((= a 0)                        ; maintenance action
           0)                             ;   always causes reset
          ((= x N)                        ; operating in the final state
           (+ N 1))                       ;   must fail
          (t (with-prob (expt P (+ x 1))  ; otherwise take your chances:
                        (+ x 1)           ;   advance to the next state,
                        (+ N 1))))))      ;   or break!

(defmethod env-next-reward ((env maintenance-task) x y a)
  (declare (ignore x a))
  (with-slots (N) env
    (if (< 0 y (+ N 1)) 1 0)))            ; $1 for landing in any working state above 0

(defun with-prob (p x y &optional (random-state *random-state*))
  "Return X with probability P, otherwise Y.  Note that, being a function
rather than a macro, both X and Y are evaluated before the choice is made."
  (if (< (random 1.0 random-state) p) x y))
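;; A small illustrative check of the maintenance task's dynamics (a sketch,
;; not part of RLI; sketch-maintenance-demo is a hypothetical name).  It
;; samples env-next-state directly at a few informative states, then runs
;; the complete Q-learning example defined at the top of the file.

(defun sketch-maintenance-demo ()
  (let ((env (make-instance 'maintenance-task :N 10 :P .9 :Q .9)))
    (env-init env)                      ; sets num-states to N+2 = 12
    ;; Maintenance (action 0) always resets to state 0:
    (format t "~%From state 5, maintain: ~A" (env-next-state env 5 0))
    ;; Operating (action 1) in state 0 succeeds with probability P^1 = .9:
    (format t "~%From state 0, operate:  ~A" (env-next-state env 0 1))
    ;; Operating in state N = 10 always breaks the machine (state 11):
    (format t "~%From state 10, operate: ~A" (env-next-state env 10 1))
    ;; Finally, run the full learning example:
    (complete-example)))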