;;; Jack's car rental problem. The state is n1 and n2, the number of cars
;;; at each location a the end of the day, at most 20. Actions are numbers of cars
;;; to switch from location 1 to location 2, a number between -5 and +5.
;;; P1(n1,new-n1) is a 26x21 array giving the probability that the number of cars at
;;; location 1 is new-n1, given that it starts the day at n1. Similarly for P2
;;; R1(n1) is a 26 array giving the expected reward due to satisfied requests at
;;; location, given that the day starts with n1 cars at location 1. SImilarly for R2.
;;; Here we implement policy iteration.
(defvar V)
(defvar lambda-requests1 3)
(defvar lambda-requests2 4)
(defvar lambda-dropoffs1 3)
(defvar lambda-dropoffs2 2)
(defvar policy)
(defvar P1)
(defvar P2)
(defvar R1)
(defvar R2)
(defvar gamma .9)
(defvar theta .0000001)
(defun setup ()
(setq V (make-array '(21 21) :initial-element 0))
(setq policy (make-array '(21 21) :initial-element 0))
(setq P1 (make-array '(26 21) :initial-element 0))
(setq P2 (make-array '(26 21) :initial-element 0))
(setq R1 (make-array 26 :initial-element 0))
(setq R2 (make-array 26 :initial-element 0))
(load-P-and-R P1 R1 lambda-requests1 lambda-dropoffs1)
(load-P-and-R P2 R2 lambda-requests2 lambda-dropoffs2))
(defun load-P-and-R (P R lambda-requests lambda-dropoffs)
(loop for requests from 0
for request-prob = (poisson requests lambda-requests)
until (< request-prob .000001)
do (loop for n upto 25 do
(incf (aref R n) (* 10 request-prob (min requests n))))
do (loop for dropoffs from 0
for drop-prob = (poisson dropoffs lambda-dropoffs)
until (< drop-prob .000001)
do (loop for n upto 25
for satisfied-requests = (min requests n)
for new-n = (max 0 (min 20 (- (+ n dropoffs)
satisfied-requests)))
do (incf (aref P n new-n) (* request-prob drop-prob))))))
(defun poisson (n lambda)
"The probability of n events according to the poisson distribution"
(* (exp (- lambda))
(/ (expt lambda n)
(factorial n))))
(defun factorial (n)
(if (= n 0)
1
(* n (factorial (- n 1)))))
(defun backup-action (n1 n2 a)
(setq a (max (- n2) (min a n1)))
(setq a (max -5 (min 5 a)))
(+ (* -2 (abs a))
(loop with morning-n1 = (- n1 a)
with morning-n2 = (+ n2 a)
for new-n1 upto 20 sum
(loop for new-n2 upto 20 sum
(* (aref P1 morning-n1 new-n1)
(aref P2 morning-n2 new-n2)
(+ (aref R1 morning-n1)
(aref R2 morning-n2)
(* gamma (aref V new-n1 new-n2))))))))
(defun policy-eval ()
(loop while (< theta
(loop for n1 upto 20 maximize
(loop for n2 upto 20
for old-v = (aref V n1 n2)
for a = (aref policy n1 n2)
do (setf (aref V n1 n2) (backup-action n1 n2 a))
maximize (abs (- old-v (aref V n1 n2))))))))
(defun policy (n1 n2 &optional (epsilon .0000000001))
(loop with best-value = -1
with best-action
for a from (max -5 (- n2)) upto (min 5 n1)
for this-value = (backup-action n1 n2 a)
do (when (> this-value (+ best-value epsilon))
(setq best-value this-value)
(setq best-action a))
finally (return best-action)))
(defun show-greedy-policy ()
(loop for n1 from 0 upto 20 do
(format t "~%")
(loop for n2 from 0 upto 20 do
(format t "~3A" (policy n1 n2)))))
(defun greedify ()
(loop with policy-improved = nil
for n1 from 0 upto 20 do
(loop for n2 from 0 upto 20
for b = (aref policy n1 n2) do
(setf (aref policy n1 n2) (policy n1 n2))
(unless (= b (aref policy n1 n2))
(setq policy-improved t)))
finally (progn (show-policy) (return policy-improved))))
(defun show-policy ()
(loop for n1 from 0 upto 20 do
(format t "~%")
(loop for n2 from 0 upto 20 do
(format t "~3A" (aref policy n1 n2)))))
(defun policy-iteration ()
(loop for count from 0
do (policy-eval)
do (print count)
while (greedify)))