; Copyright 1993 GTE Laboratories Incorporated
; Permission granted to copy and change for research and educational purposes
; Written by Richard S. Sutton
; General Markov Control Process with a simple illustrative version of Dyna-AHC (Dyna-PI)
; This case is a 4 X 4 grid world with start and goal states
; and a barrier making access to the goal a little difficult.
; Like this:
;
; S . . . ; S is the start state
; . . . .
; . . B . ; B is a barrier - can't be entered
; . . B G ; G is the goal state - no penalty and restart
;
; The states are numbered 0 to 15 in the obvious way, left to right
; The states are numbered 0 to 15 in the obvious way, left to right.
; A reward of +1 is delivered upon reaching the goal. Reward is 0 otherwise.
; This simulation includes a simple model learner and user.
; The model learner assumes the environment is deterministic
; and simply records next states as they are found.
;
; The model user simply picks a state at random from a distribution
; reflecting how often each state has occurred before in past experience.
; It looks ahead one step and then picks another. Many model steps
; are done for each real step.
; To run an experiment call (setup) and then either (ex) or else (init) then (trials 20)
(defvar n 16) ; (Maximum) number of states
(defvar m 4) ; (Maximum) number of actions at a state
(defvar v) ; memory for evaluation function: vector of n state values
(defvar w) ; memory for reaction function: n x m array of action preferences
(defvar alpha 1.0) ; learning rate for reaction function
(defvar beta 0.1) ; learning rate for evaluation function
(defvar tau 1.0) ; randomness parameter for reaction function (exploration noise scale)
(defvar gamma 0.9) ; discounting parameter
(defvar x) ; For a single transition, x is start state
(defvar y) ; y is result state
(defvar a) ; a is action
(defvar r) ; r is reward
(defvar e) ; e is Eval(x)
(defvar e-prime) ; e-prime is the better estimate of Eval(x): r + gamma*Eval(y)
(defvar rhat) ; e-prime - e (the TD error); NOTE(review): never assigned in this file
(defvar model-next-state) ; Model memory: next-state function, n x m array
(defvar model-reward) ; and reward function, n x m array (nil = transition never tried)
(defvar times-visited) ; Vector of how many times each state has been visited
(defvar total-visits) ; How many times all states together have been visited
(defvar num-model-steps 10) ;number of steps to do with model for each real step
(defvar current-state) ; state the agent currently occupies during a trial
(defvar start-state 0) ; top-left corner of the grid
(defvar goal-state 15) ; bottom-right corner of the grid
(defvar real-next-state) ; true environment transition table, n x m array
(defvar height 4) ; grid rows
(defvar width 4) ; grid columns
(defun setup ()
  "Allocates all the memories (value function, reaction function, model,
visit counts) and constructs the real environment's transition table.
Call once, before INIT and TRIALS."
  (setq v (make-array n)
        times-visited (make-array n)
        w (make-array (list n m))
        model-next-state (make-array (list n m))
        model-reward (make-array (list n m))
        real-next-state (make-array (list n m)))
  (make-real-next-state real-next-state))
(defun init ()
  "Resets everything learned: zeros the evaluation and reaction
functions and the visit counts, and empties the model (NIL marks a
state-action pair never experienced).  Call before each fresh run."
  (setq total-visits 0)
  (dotimes (i n)
    (setf (aref v i) 0.0
          (aref times-visited i) 0)
    (dotimes (j m)
      (setf (aref w i j) 0.0
            (aref model-reward i j) nil
            (aref model-next-state i j) nil))))
(defun learn ()
  "One step of the AHC (adaptive heuristic critic) learning rule.
Reads the current transition from the special variables X, A, Y, R;
updates the evaluation function V (rate BETA) and the reaction
function W (rate ALPHA) by the TD error, leaving E and E-PRIME set."
  (setq e (aref v x))
  (setq e-prime (+ r (* gamma (aref v y))))
  (let ((delta (- e-prime e)))
    (incf (aref v x) (* beta delta))
    (incf (aref w x a) (* alpha delta))))
(defun learn-model ()
  "Records the just-experienced transition (X, A) -> Y with reward R
in the deterministic model memory."
  (setf (aref model-next-state x a) y
        (aref model-reward x a) r))
(defun pick-from-visited-states ()
  "Samples a state index in proportion to how often each state has
been visited.  Assumes TOTAL-VISITS > 0 (it equals the sum of
TIMES-VISITED, so the threshold is always reached)."
  (let ((threshold (random total-visits))
        (sum 0))
    (dotimes (i n n) ; result N matches the original loop's fall-through
      (incf sum (aref times-visited i))
      (when (> sum threshold)
        (return i)))))
(defun trial ()
  "Runs a single trial, from start to goal states.
Returns the number of real steps taken.  Before each real step,
NUM-MODEL-STEPS hypothetical steps are taken with the learned model.
Fix: the original wrote (loop while ... with time = 0 ...); ANSI LOOP
requires variable clauses (WITH) to precede main clauses (WHILE), so
the WITH clause is moved first."
  (setq current-state start-state)
  (loop with time = 0
        while (not (= current-state goal-state))
        do
        (incf (aref times-visited current-state))
        (incf total-visits)
        ;; Hypothetical experience: planning steps using the model.
        (loop repeat num-model-steps do
              (setq x (pick-from-visited-states))
              (setq a (action w x))
              (setq y (aref model-next-state x a))
              (setq r (aref model-reward x a))
              ;; Learn only if this (state, action) pair has been tried
              ;; before (the model stores NIL for unknown transitions).
              when y do (learn))
        ;; One step of real experience.
        (setq x current-state)
        (setq a (action w x))
        (setq y (aref real-next-state x a))
        (setq r (if (= y goal-state) 1 0))
        (learn)
        (learn-model)
        (setq current-state y)
        (incf time)
        finally (return time)))
(defun action (w x)
  "Stochastically selects an action at state X: the index I maximizing
W[X,I] plus exponential noise of scale TAU.  More randomness for
larger TAU; greedy as TAU approaches 0."
  (let ((best-value -1000000.0)
        (best-action nil))
    (dotimes (i m best-action)
      (let ((value (+ (aref w x i) (random-exponential tau))))
        (when (< best-value value)
          (setq best-value value
                best-action i))))))
(defun make-real-next-state (Q)
  "Fills Q, an N x M array, with the real environment's deterministic
transition function: Q[state, action] = next state.  Actions move one
cell (0: +row, 1: +column, 2: -row, 3: -column); moves off the grid,
and moves into the barrier states 10 and 14, leave the state unchanged.
Fix: the original let-bound W and H and looped over X and A, which
dynamically shadowed the special variables W, X and A declared with
DEFVAR above; the locals are renamed to avoid that hazard."
  (loop for state below n do
        (loop for act below m do
              (setf (aref Q state act)
                    (let ((col (x-from-state state))
                          (row (y-from-state state)))
                      (case act
                        (0 (incf row) (if (>= row height) (decf row)))
                        (1 (incf col) (if (>= col width) (decf col)))
                        (2 (decf row) (if (< row 0) (incf row)))
                        (3 (decf col) (if (< col 0) (incf col)))
                        (otherwise (error "illegal action")))
                      (state-from-xy col row)))
              ;; Barrier cells cannot be entered: stay in place instead.
              (if (or (= 10 (aref Q state act))
                      (= 14 (aref Q state act)))
                  (setf (aref Q state act) state)))))
(defun x-from-state (x)
  "Column (x coordinate) of state X: the remainder of X by WIDTH."
  (nth-value 1 (truncate x width)))
(defun y-from-state (x)
  ;; Row (y coordinate) of state X.  FLOOR also returns the remainder
  ;; as a secondary value; callers here use only the quotient.
  (floor x width))
(defun state-from-xy (w h)
  "State number for column W, row H: row-major index into the grid."
  (+ w (* width h)))
(defun show-v ()
  "Prints the negation of the evaluation function V as a HEIGHT x WIDTH
grid, one row per line."
  (dotimes (row height)
    (format t "~%~2D:" row)
    (dotimes (col width)
      (format t "~8,3F" (- 0.0 (aref v (state-from-xy col row)))))))
(defun show-w ()
  "Prints the reaction function W: for each grid row, each column's M
action preferences."
  (dotimes (row height)
    (format t "~%~D:" row)
    (dotimes (col width)
      (format t "~D(" col)
      (dotimes (i m)
        (format t "~,3F " (aref w (state-from-xy col row) i))))))
(defun show1 (v)
  "Like SHOW-V, but prints the negation of the supplied evaluation
array V instead of the global one."
  (dotimes (row height)
    (format t "~%~2D:" row)
    (dotimes (col width)
      (format t "~8,3F" (- 0.0 (aref v (state-from-xy col row)))))))
(defun show2 (w)
  "Like SHOW-W, but prints the supplied reaction array W instead of
the global one."
  (dotimes (row height)
    (format t "~%~D:" row)
    (dotimes (col width)
      (format t "~D(" col)
      (dotimes (i m)
        (format t "~,3F " (aref w (state-from-xy col row) i))))))
(defun trials (num-trials)
  "Runs NUM-TRIALS trials and returns the list of steps each took."
  (let ((lengths '()))
    (dotimes (i num-trials (nreverse lengths))
      (push (trial) lengths))))
(defvar data) ; list of per-run trial-length lists from the last EX
(defun ex ()
  "Runs the full experiment: 100 independent runs of 20 trials each
(re-initializing before each run), returning the mean number of steps
at each trial number."
  (setq data nil)
  (dotimes (run 100)
    (init)
    (push (trials 20) data))
  (multi-mean data))
(defun multi-mean (list-of-lists)
  "Element-wise mean across a list of equal-length number lists."
  (mapcar #'mean (reorder-list-of-lists list-of-lists)))
(defun random-exponential (tau)
  "Returns a nonnegative sample from an exponential distribution with
mean TAU, by inverse-transform sampling.  (RANDOM 1.0) lies in
[0, 1), so the LOG argument is in (0, 1] and never zero."
  (let ((u (random 1.0)))
    (- (* tau (log (- 1 u))))))
(defun reorder-list-of-lists (list-of-lists)
  "Transposes LIST-OF-LISTS: element I of the result collects element I
of each input list.  Assumes all sublists have equal length, as the
callers here guarantee.  Fixes two problems with the original: the
NTH-based nested loop was O(n^2), and its LOOP variable N dynamically
shadowed the special variable N declared with DEFVAR above."
  (when list-of-lists
    (apply #'mapcar #'list list-of-lists)))
(defun mean (l)
  "Arithmetic mean of the numbers in list L, returned as a float.
Signals DIVISION-BY-ZERO for an empty list, as the original did."
  (float (/ (reduce #'+ l) (length l))))