; This is code to implement the Tic-Tac-Toe example in Chapter 1 of the
; book "Learning by Interacting". Read that chapter before trying to
; understand this code.
; States are lists of two lists and an index, e.g., ((1 2 3) (4 5 6) index),
; where the first list is the location of the X's and the second list is
; the location of the O's. The index is into a large array holding the value
; of the states. There is a one-to-one mapping from index to the lists.
; The locations refer not to the standard positions, but to the "magic square"
; positions:
;
; 2 9 4
; 7 5 3
; 6 1 8
;
; Labelling the locations of the Tic-Tac-Toe board in this way is useful because
; then we can just add up any three positions, and if the sum is 15, then we
; know they are three in a row. The following function then tells us if a list
; of X or O positions contains any that are three in a row.
(defvar magic-square '(2 9 4 7 5 3 6 1 8))
(defun any-n-sum-to-k? (n k list)
(cond ((= n 0)
(= k 0))
((< k 0)
nil)
((null list)
nil)
((any-n-sum-to-k? (- n 1) (- k (first list)) (rest list))
t) ; either the first element is included
((any-n-sum-to-k? n k (rest list))
t))) ; or it's not
; This representation need not be confusing. To see any state, print it with:
(defun show-state (state)
(let ((X-moves (first state))
(O-moves (second state)))
(format t "~%")
(loop for location in magic-square
for i from 0
do
(format t (cond ((member location X-moves)
" X")
((member location O-moves)
" O")
(t " -")))
(when (= i 5) (format t " ~,3F" (value state)))
(when (= 2 (mod i 3)) (format t "~%"))))
(values))
; The value function will be implemented as a big, mostly empty array. Remember
; that a state is of the form (X-locations O-locations index), where the index
; is an index into the value array. The index is computed from the locations.
; Basically, each side gets a bit for each position. The bit is 1 is that side
; has played there. The index is the integer with those bits on. X gets the
; first (low-order) nine bits, O the second nine. Here is the function that
; computes the indices:
(defvar powers-of-2
(make-array 10
:initial-contents
(cons nil (loop for i below 9 collect (expt 2 i)))))
(defun state-index (X-locations O-locations)
(+ (loop for l in X-locations sum (aref powers-of-2 l))
(* 512 (loop for l in O-locations sum (aref powers-of-2 l)))))
(defvar value-table)
(defvar initial-state)
(defun init ()
(setq value-table (make-array (* 512 512) :initial-element nil))
(setq initial-state '(nil nil 0))
(set-value initial-state 0.5)
(values))
(defun value (state)
(aref value-table (third state)))
(defun set-value (state value)
(setf (aref value-table (third state)) value))
(defun next-state (player state move)
"returns new state after making the indicated move by the indicated player"
(let ((X-moves (first state))
(O-moves (second state)))
(if (eq player :X)
(push move X-moves)
(push move O-moves))
(setq state (list X-moves O-moves (state-index X-moves O-moves)))
(when (null (value state))
(set-value state (cond ((any-n-sum-to-k? 3 15 X-moves)
0)
((any-n-sum-to-k? 3 15 O-moves)
1)
((= 9 (+ (length X-moves) (length O-moves)))
0)
(t 0.5))))
state))
(defun terminal-state-p (state)
(integerp (value state)))
(defvar alpha 0.5)
(defvar epsilon 0.01)
(defun possible-moves (state)
"Returns a list of unplayed locations"
(loop for i from 1 to 9
unless (or (member i (first state))
(member i (second state)))
collect i))
(defun random-move (state)
"Returns one of the unplayed locations, selected at random"
(let ((possible-moves (possible-moves state)))
(if (null possible-moves)
nil
(nth (random (length possible-moves))
possible-moves))))
(defun greedy-move (player state)
"Returns the move that, when played, gives the highest valued position"
(let ((possible-moves (possible-moves state)))
(if (null possible-moves)
nil
(loop with best-value = -1
with best-move
for move in possible-moves
for move-value = (value (next-state player state move))
do (when (> move-value best-value)
(setf best-value move-value)
(setf best-move move))
finally (return best-move)))))
; Now here is the main function
(defvar state)
(defun game (&optional quiet)
"Plays 1 game against the random player. Also learns and prints.
:X moves first and is random. :O learns"
(setq state initial-state)
(unless quiet (show-state state))
(loop for new-state = (next-state :X state (random-move state))
for exploratory-move? = (< (random 1.0) epsilon)
do
(when (terminal-state-p new-state)
(unless quiet (show-state new-state))
(update state new-state quiet)
(return (value new-state)))
(setf new-state (next-state :O new-state
(if exploratory-move?
(random-move new-state)
(greedy-move :O new-state))))
(unless exploratory-move?
(update state new-state quiet))
(unless quiet (show-state new-state))
(when (terminal-state-p new-state) (return (value new-state)))
(setq state new-state)))
(defun update (state new-state &optional quiet)
"This is the learning rule"
(set-value state (+ (value state)
(* alpha
(- (value new-state)
(value state)))))
(unless quiet (format t " ~,3F" (value state))))
(defun run ()
(loop repeat 40 do (print (/ (loop repeat 100 sum (game t))
100.0))))
(defun runs (num-runs num-bins bin-size) ; e.g., (runs 10 40 100)
(loop with array = (make-array num-bins :initial-element 0.0)
repeat num-runs do
(init)
(loop for i below num-bins do
(incf (aref array i)
(loop repeat bin-size sum (game t))))
finally (loop for i below num-bins
do (print (/ (aref array i)
(* bin-size num-runs))))))