;;; Code for creating random unstructured and structured MDPs (utilities at end)
(in-package cl-user)
(locally
;; SPEED and SAFETY are OPTIMIZE qualities, not declaration specifiers:
;; a bare (declare (speed 3) (safety 0)) is not conforming Common Lisp.
(declare (optimize (speed 3) (safety 0)))
;;; Random, finite, unstructured MDPs ;;;;;;;;;;;;;;;;;;;;;;;
#| Unstructured random MDPs
N = number of states
M = number of actions
B = branching factor
P = array giving, for each state and action, a list of next states and probabilities
R = array giving expected reward for each state and action
Example: Evaluate the random 2-action policy on a set of 50 random MDPs.
Random policy: (defun policy (state) '(0.5 0.5))
Make 50 MDPs, each of 100 states and a branching factor of 3:
(setq MDPs (loop for i below 50 collect (make-random-MDP 100 2 3)))
Evaluate them: (loop for MDP in MDPs do (print (evaluate-policy 'policy MDP .9 0)))
|#
(defclass MDP () (N M B P R))
(defun make-random-MDP (num-states num-actions branching-factor)
  "Builds an unstructured MDP with unit-normal expected rewards and, for each
state-action pair, BRANCHING-FACTOR distinct random successors with random
probabilities."
  (let ((result (make-instance 'MDP)))
    (with-slots (N M B P R) result
      (setf N num-states
            M num-actions
            B branching-factor
            P (make-array (list N M) :initial-element nil)
            R (make-array (list N M)))
      (dotimes (s N)
        (dotimes (a M)
          (setf (aref R s a) (random-normal))
          ;; Pair each random probability with a distinct successor state.
          (loop for prob in (random-simplex branching-factor)
                for sprime in (random-k-of-n B N)
                do (setf (aref P s a)
                         (add-prob prob sprime (aref P s a)))))))
    result))
(defmethod next-state ((MDP MDP) s a &optional (random-state *random-state*))
  "Samples a successor of state S under action A from the transition list P[s,a]."
  (with-slots (P) MDP
    (let ((target (random 1.0 random-state))
          (total 0.0))
      ;; Walk the (state . prob) pairs until the cumulative mass reaches TARGET;
      ;; if rounding leaves the total below 1.0 the last successor is returned.
      (loop for (candidate . prob) in (aref P s a)
            do (incf total prob)
            until (>= total target)
            finally (return candidate)))))
(defmethod expected-reward ((MDP MDP) s a)
  "Expected immediate reward for taking action A in state S."
  (aref (slot-value MDP 'R) s a))
(defun evaluate-policy (policy MDP gamma start-state &optional (threshold .0000001))
  "DP policy evaluation of the policy (mapping from states to action probs).
Sweeps all states with Bellman backups until the largest change in a sweep
falls below THRESHOLD, then returns the value of START-STATE."
  (with-slots (N M B P R) MDP
    (let ((V (make-array N :initial-element 0)))
      (loop
        (let ((worst-change 0))
          (dotimes (s N)
            (let ((previous (aref V s)))
              ;; Expectation over actions, then over discounted successors.
              (setf (aref V s)
                    (loop for a below M
                          for prob in (funcall policy s)
                          sum (* prob (+ (aref R s a)
                                         (loop for (sp . pr) in (aref P s a)
                                               sum (* pr gamma (aref V sp)))))))
              (setq worst-change (max worst-change
                                      (abs (- previous (aref V s)))))))
          (when (< worst-change threshold)
            (return (aref V start-state))))))))
(defun add-prob (increment outcome prob-list)
  "Adds INCREMENT of probability mass for OUTCOME to the alist PROB-LIST of
\(outcome . probability) pairs, merging into an existing entry when present."
  (let ((entry (assoc outcome prob-list :test #'=)))
    (cond (entry
           (incf (cdr entry) increment)
           prob-list)
          (t
           (acons outcome increment prob-list)))))
;;; Random structured MDPs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#| Structured MDPs are MDPs with state VARIABLES, each of which can take on a (small)
number of values in {0,1,2,...,k}. Actions correspond to rules, one per variable,
for describing the probability distribution over values of that variable, plus another
set of rules for additive components of the expected reward.
A structed MDP is a list of actions and an array of numbers of values for the variables.
Each action is a list of transition rules (one per variable) and a list of reward rules.
A transition rule for a variable, a list of antecedents, and an array of simplexes (a table of
probabilities for the variable taking on each of its values, indexed by the antecedents' values).
A reward rule is a list of antecedents and a table of reward components.
Example: Evaluate the random 2-action policy on a set of 50 random structured MDPs.
Random policy: (defun sample-policy (state) (random 2))
Make 50 MDPs, each with 10 binary state vars, where both actions have 4
transition rules and 1 reward rule, and all rules depend on 3 antecedent variables
(setq MDPs (loop for i below 50 collect
(make-random-structured-MDP (loop repeat 10 collect 2)
2 4 3 1 3)))
Evaluate them: (loop for MDP in MDPs do (print (MC-policy-evaluation 'sample-policy MDP .9 100 .01)))
|#
;; A structured MDP factors state into variables; HASH-TABLE caches the
;; per-variable simplexes for each (action . state) pair (see next-state).
(defclass Structured-MDP () (actions num-values-of-each-var hash-table))
;; An action: transition rules (one per affected variable) plus reward rules.
(defclass action () (transition-rules reward-rules))
;; CONSEQUENT is the variable the rule sets; SIMPLEXES is a table of simplexes
;; indexed by the current values of ANTECEDENTS.
(defclass transition-rule () (consequent antecedents simplexes))
;; ANTECEDENTS' values index REWARD-COMPONENTS, a table of additive terms.
(defclass reward-rule () (antecedents reward-components))
(defun make-structured-MDP (num-values-of-each-var &optional actions)
  "Creates a Structured-MDP with the given variable cardinalities and
\(optionally) an initial list of actions."
  (let ((new (make-instance 'Structured-MDP)))
    (with-slots ((acts actions)
                 (cardinalities num-values-of-each-var)
                 (cache hash-table)) new
      (setf acts actions
            cardinalities num-values-of-each-var
            ;; EQUAL test: keys are (action . state-list) conses.
            cache (make-hash-table :test #'equal)))
    new))
(defun num-values (Structured-MDP variable-num)
  "Number of values variable VARIABLE-NUM can take on."
  (nth variable-num (slot-value Structured-MDP 'num-values-of-each-var)))
(defun num-actions (Structured-MDP)
  "Number of actions currently defined for the structured MDP."
  (length (slot-value Structured-MDP 'actions)))
(defun num-variables (structured-MDP)
  "Number of state variables in the structured MDP."
  (length (slot-value structured-MDP 'num-values-of-each-var)))
(defun add-action (Structured-MDP &optional transition-rules reward-rules)
  "Adds a new action to the structured MDP, returns the action number"
  (let ((new-action (make-instance 'action)))
    (setf (slot-value new-action 'transition-rules) transition-rules
          (slot-value new-action 'reward-rules) reward-rules)
    (with-slots (actions) Structured-MDP
      ;; Append at the end so previously returned action numbers stay valid.
      (setf actions (append actions (list new-action)))
      (1- (length actions)))))
(defun add-transition-rule (Structured-MDP action-num consequent antecedents simplexes)
  "Adds a new transition-rule to the indicated action of the structured MDP"
  (let ((rule (make-instance 'transition-rule)))
    (setf (slot-value rule 'consequent) consequent
          (slot-value rule 'antecedents) antecedents
          (slot-value rule 'simplexes) simplexes)
    (let ((act (nth action-num (slot-value Structured-MDP 'actions))))
      (setf (slot-value act 'transition-rules)
            (append (slot-value act 'transition-rules) (list rule))))))
(defun add-reward-rule (Structured-MDP action-num antecedents reward-components)
  "Adds a new reward-rule to the indicated action of the structured MDP"
  (let ((rule (make-instance 'reward-rule)))
    (setf (slot-value rule 'antecedents) antecedents
          (slot-value rule 'reward-components) reward-components)
    (let ((act (nth action-num (slot-value Structured-MDP 'actions))))
      (setf (slot-value act 'reward-rules)
            (append (slot-value act 'reward-rules) (list rule))))))
(defun add-random-reward-rule (Structured-MDP action-num K)
  "Adds a reward rule over K randomly chosen antecedent variables, with
unit-normal random reward components."
  (let* ((antecedents (random-k-of-n K (num-variables Structured-MDP)))
         (dims (loop for a in antecedents
                     collect (num-values Structured-MDP a))))
    (add-reward-rule Structured-MDP action-num antecedents
                     (make-array dims
                                 :initial-contents (structure-of-random-numbers dims)))))
(defun structure-of-random-numbers (structure)
  "A nested list of unit-normal randoms shaped by STRUCTURE (a list of
dimension sizes); a single random number when STRUCTURE is empty."
  (if structure
      (loop repeat (first structure)
            collect (structure-of-random-numbers (rest structure)))
      (random-normal)))
(defun add-random-transition-rule (Structured-MDP action-num consequent K simplex-generator)
  "Adds a transition rule for variable CONSEQUENT conditioned on K randomly
chosen antecedents; SIMPLEX-GENERATOR supplies the simplex for each cell of
the conditional probability table."
  (let* ((antecedents (random-k-of-n K (num-variables Structured-MDP)))
         (dims (loop for a in antecedents
                     collect (num-values Structured-MDP a)))
         (CPT (make-array dims)))
    ;; Fill every cell of the table with a fastsimplex over CONSEQUENT's values.
    (funcall-on-indices nil dims
                        (lambda (antecedent-values)
                          (setf (apply #'aref CPT antecedent-values)
                                (simplex-to-fastsimplex
                                 (funcall simplex-generator antecedent-values action-num
                                          (num-values Structured-MDP consequent))))))
    (add-transition-rule Structured-MDP action-num consequent antecedents CPT)))
(defun funcall-on-indices (prefix remaining function)
  "Calls FUNCTION once on each index tuple of an array whose dimensions are
REMAINING, with PREFIX prepended to every tuple (pass NIL to start empty)."
  (if remaining
      (dotimes (i (first remaining))
        (funcall-on-indices (append prefix (list i)) (rest remaining) function))
      (funcall function prefix)))
(defun make-random-Structured-MDP
    (num-values-of-each-var num-actions num-transition-rules num-transition-antecedents
     num-reward-rules num-reward-antecedents
     &optional (simplex-generator #'random-simplex-generator))
  "Builds a Structured-MDP with NUM-ACTIONS random actions.  Each action gets
NUM-TRANSITION-RULES transition rules (for distinct consequent variables),
each over NUM-TRANSITION-ANTECEDENTS antecedents, and NUM-REWARD-RULES reward
rules, each over NUM-REWARD-ANTECEDENTS antecedents."
  (let ((MDP (make-structured-MDP num-values-of-each-var)))
    (dotimes (i num-actions)
      (let ((action-num (add-action MDP)))
        ;; Distinct consequents: one transition rule per chosen variable.
        (dolist (consequent (random-k-of-n num-transition-rules (num-variables MDP)))
          (add-random-transition-rule MDP action-num consequent
                                      num-transition-antecedents simplex-generator))
        (loop repeat num-reward-rules
              do (add-random-reward-rule MDP action-num num-reward-antecedents))))
    MDP))
(defun random-simplex-generator (antecedent-values action num-values)
  "Default simplex generator: a uniform-random simplex over NUM-VALUES,
independent of the table cell and action."
  (declare (ignore antecedent-values action))
  (random-simplex num-values))
#|
(defmethod next-state ((MDP Structured-MDP) state action &optional (random-state *random-state*))
"The new list of state variable values as a consequence of taking action"
(let ((new-state (copy-list state)))
(loop for rule in (slot-value (nth action (slot-value MDP 'actions)) 'transition-rules)
do (with-slots (consequent antecedents simplexes) rule
(setf (nth consequent new-state)
(random-from-simplex (apply #'aref simplexes (loop for a in antecedents
collect (nth a state)))
random-state))))
new-state))
|#
(defmethod next-state ((MDP Structured-MDP) state action &optional (random-state *random-state*))
  "The new list of state variable values as a consequence of taking action"
  (with-slots (hash-table) MDP
    (let* ((key (cons action state))
           (simplexes (gethash key hash-table)))
      ;; Memoize the per-variable simplexes for this (action . state) pair.
      (unless simplexes
        (setq simplexes (collect-simplexes MDP state action))
        (setf (gethash key hash-table) simplexes))
      ;; Sample each variable's next value independently from its simplex.
      (loop for simplex in simplexes
            collect (random-from-fastsimplex simplex random-state)))))
(defun collect-simplexes (MDP state action)
  "Returns, for each state variable, the fastsimplex governing its next value
in STATE under ACTION: the matching transition rule's table entry, or a
deterministic fastsimplex keeping the variable unchanged when no rule names it."
  (with-slots (transition-rules) (nth action (slot-value MDP 'actions))
    (loop for var below (num-variables MDP) collect
          (loop for rule in transition-rules do
                (with-slots (consequent antecedents simplexes) rule
                  (when (= var consequent)
                    (return (apply #'aref simplexes
                                   (loop for a in antecedents
                                         collect (nth a state))))))
                ;; Must be a FASTsimplex (integer CDF scaled to fast-simplex-max):
                ;; next-state samples every entry with random-from-fastsimplex.
                ;; The plain deterministic-simplex (0 ... 0 1) is on the 0-1
                ;; scale and misbehaves against an integer draw.
                finally (return (deterministic-fastsimplex (nth var state)))))))
(defmethod expected-reward ((MDP Structured-MDP) state action)
  "Sum of the reward components that apply to STATE under ACTION."
  (let ((rules (slot-value (nth action (slot-value MDP 'actions)) 'reward-rules)))
    (loop for rule in rules
          sum (with-slots (antecedents reward-components) rule
                ;; The antecedent variables' current values index the table.
                (apply #'aref reward-components
                       (mapcar (lambda (a) (nth a state)) antecedents))))))
(defun MC-policy-evaluation (sample-policy MDP gamma &optional
                             (num-samples 1000) (threshold .001)
                             (start-state (loop repeat (num-variables MDP) collect 0)))
  "Monte-Carlo policy evaluation of the policy (mapping from states to action probs)"
  ;; One long trajectory: each sample is a discounted return accumulated until
  ;; the discount decays below THRESHOLD; the next sample continues from
  ;; wherever the trajectory left off.
  (let ((state start-state))
    (stats
     (loop repeat num-samples
           collect (loop for discount = 1 then (* discount gamma)
                         until (< discount threshold)
                         for action = (funcall sample-policy state)
                         sum (* discount (expected-reward MDP state action))
                         do (setq state (next-state MDP state action)))))))
;;; utility routines ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defmacro square (x)
  "Returns X*X, clamped to 1e20 when |X| exceeds 1e10 to avoid float overflow.
Evaluates X exactly once (a naive expansion would evaluate it three times,
repeating any side effects)."
  (let ((val (gensym "X")))
    `(let ((,val ,x))
       (if (> (abs ,val) 1e10) 1e20 (* ,val ,val)))))
(defun random-normal (&optional (random-state *random-state*))
  "Returns a standard-normal random variate via the ratio-of-uniforms method."
  (do ((u 0.0)
       (v 0.0))
      ((progn
         ;; U must be strictly positive: (random 1.0) can return exactly 0.0,
         ;; which would break (log u) and the division below.
         (loop do (setq u (random 1.0 random-state))
               until (plusp u))
         (setq v (* 2.0 (sqrt 2.0) (exp -0.5)   ; V is bounded (-MAX MAX)
                    (- (random 1.0 random-state) 0.5)))
         ;; Accept (U,V) inside the region v^2 <= -4 u^2 log u.
         (<= (* v v) (* -4.0 u u (log u))))
       (/ v u))
    ;; (random 1.0) yields single-floats, so declaring DOUBLE-FLOAT here
    ;; (as before) was a type lie -- dangerous under (safety 0).
    (declare (single-float u v))))
(defun mean (l)
  "Arithmetic mean of the numbers in list L, returned as a float."
  (float (/ (reduce #'+ l) (length l))))
(defun mse (target values)
  "Mean squared error of VALUES around TARGET."
  (mean (mapcar (lambda (v) (square (- v target))) values)))
(defun rmse (target values)
  "Root mean squared error of VALUES around TARGET."
  (sqrt (mse target values)))
(defun stdev (l)
  "Population standard deviation of the list L (RMS deviation from the mean)."
  (rmse (mean l) l))
(defun stats (list)
  "Returns a two-element list: the mean of LIST and its standard error."
  (let ((n (length list)))
    (list (mean list) (/ (stdev list) (sqrt n)))))
(defun random-k-of-n (k n &optional (random-state *random-state*))
  "Returns a list of K distinct random integers drawn from {0,...,N-1}.
Signals an error unless 0 <= K <= N; previously K = 0 or K > N looped forever
\(the termination test only ran after a draw, and fewer than K distinct values
exist when K > N)."
  (assert (<= 0 k n) (k n)
          "random-k-of-n: cannot pick ~S distinct values from ~S" k n)
  (let ((result nil))
    ;; Rejection sampling; fine for the small K used in this file.
    (loop until (= (length result) k)
          do (let ((i (random n random-state)))
               (unless (member i result)
                 ;; Append to preserve draw order, matching the old COLLECT.
                 (setq result (nconc result (list i))))))
    result))
(defun random-simplex (n &optional (random-state *random-state*))
  "returns n positive numbers that sum to 1.0 -- a random partition of unity"
  (let ((cuts (sort (cons 1.0 (loop repeat (1- n)
                                    collect (random 1.0 random-state)))
                    #'<)))
    ;; Gaps between successive sorted cut points partition [0, 1].
    (loop for previous = 0 then cut
          for cut in cuts
          collect (- cut previous))))
(defun random-from-simplex (simplex &optional (random-state *random-state*))
  "random sample from {0,...,n-1} where simplex is list of the n probabilities (must sum to 1)"
  (let ((R (random 1.0 random-state)))
    ;; Only the first n-1 probabilities are examined: if no partial sum reaches
    ;; R the loop exhausts and I is left at n-1, selecting the last outcome.
    ;; NOTE(review): this relies on the post-loop value of a FOR-BELOW loop
    ;; variable, which is implementation-dependent -- confirm on your Lisp.
    (loop for i below (- (length simplex) 1)
          for prob in simplex
          for cumulative-prob = prob then (+ cumulative-prob prob) ; little optimizations
          until (>= cumulative-prob R)
          finally (return i))))
(defconstant fast-simplex-max 10000)
(defun random-from-fastsimplex (integer-csimplex &optional (random-state *random-state*))
  "like random-simplex but from an integer cumulative simplex, a CDF from 0 to fast-simplex-max instead of 0 to 1.0"
  (let ((R (random fast-simplex-max random-state)))
    ;; Outcome I is selected iff R falls STRICTLY below the Ith cumulative
    ;; count, so outcome I receives exactly CDF[i] - CDF[i-1] of the
    ;; FAST-SIMPLEX-MAX equally likely draws.  (The previous >= comparison
    ;; double-counted each boundary draw and let a leading cumulative count of
    ;; 0 -- as in a deterministic fastsimplex -- wrongly capture R = 0.)
    (loop for i below (- (length integer-csimplex) 1)
          for cumulative-prob in integer-csimplex
          until (> cumulative-prob R)
          finally (return i))))
(defun deterministic-simplex (k)
  "returns the simplex that puts all the weight on the kth choice"
  (append (make-list k :initial-element 0) (list 1)))
(defun deterministic-fastsimplex (k)
  "returns the simplex that puts all the weight on the kth choice"
  (append (make-list k :initial-element 0) (list fast-simplex-max)))
(defun simplex-to-fastsimplex (simplex)
  "Converts a probability simplex (list summing to 1) into an integer
cumulative fastsimplex scaled to FAST-SIMPLEX-MAX."
  (let ((running 0))
    (append (mapcar (lambda (p)
                      (incf running p)
                      (round (* running fast-simplex-max)))
                    (butlast simplex))
            ;; Pin the last entry to the maximum so rounding error can never
            ;; leave the CDF short of full mass.
            (list fast-simplex-max))))
)