;;; Monte Carlo solution of simple blackjack. ;;; The state is (dc,pc,ace01), i.e., (dealer-card, player-count, usable-ace?), ;;; in the ranges ([1-10],[12-21],[0-1]). ;;; The actions are hit or stick, t or nil ;;; Here we do just Monte Carlo estimation of a single state. ;;; We learn the value of state (2,13,T) under the fixed policy of sticking only ;;; with 20 or 21 (section 5.1) ;;; The answer seems to be -0.27726, with a standard deviation <0.0001 ;;; This is the average of 100,000,000 episode outcomes, each with variance<1.0 (defvar V) (defvar policy) ; this now holds the deterministic target policy (defvar N) ; Number of returns seen for this state (defvar dc) ; count of dealer's showing card (defvar pc) ; total count of player's hand (defvar ace) ; does play have a usable ace? (defvar episode) (defvar episodes NIL) (defun offMCeval (num-episodes) (loop with v = 0.0 for episode-num below num-episodes for episode in episodes collect (loop with G = (first episode) for ep = (reverse (rest episode)) then (cddr ep) while ep for s = (first ep) for dc = (first s) for pc = (second s) for ace = (if (third s) 1 0) for a = (if (second ep) 1 0) for rho = (/ (if (= (aref policy dc pc ace) a) 1 0) 0.5) for ratio = rho then (* ratio rho) finally ;(print (list ep s dc pc ace a rho ratio)) (return (incf v (* (/ (+ episode-num 1)) (- (* ratio G) v))))))) (defun card () (min 10 (+ 1 (random 13)))) (defun setup () (setq V (make-array '(11 22 2) :initial-element 0.0)) (setq N (make-array '(11 22 2) :initial-element 0)) (setq policy (make-array '(11 22 2) :initial-element 1)) (loop for dc from 1 to 10 do (loop for pc from 20 to 21 do (loop for ace from 0 to 1 do (setf (aref policy dc pc ace) 0))))) (defun episode () (let (dc-hidden) (setq episode nil) (setq dc-hidden (card)) (setq dc 2) (setq ace T) (setq pc 13) (unless (= pc 21) ; natural blackjack ends all (loop do (push (list dc pc ace) episode) while (first (push (= 1 (aref policy dc pc (if ace 1 0))) episode)) do (draw-card) until (bust?))) ;(learn episode outcome) (cons (outcome dc dc-hidden) episode))) (defun learn (episode outcome) (loop for (dc pc ace-boolean) in episode for ace = (if ace-boolean 1 0) do (when (> pc 11) (incf (aref N dc pc ace)) (incf (aref V dc pc ace) (/ (- outcome (aref V dc pc ace)) (aref N dc pc ace)))))) (defun outcome (dc dc-hidden) (let (dcount dace dnatural pnatural) (setq dace (OR (= 1 dc) (= 1 dc-hidden))) (setq dcount (+ dc dc-hidden)) (if dace (incf dcount 10)) (setq dnatural (= dcount 21)) (setq pnatural (not episode)) (cond ((AND pnatural dnatural) 0) (pnatural 1) (dnatural -1) ((bust?) -1) (t (loop while (< dcount 17) for card = (card) do (incf dcount card) (when (AND (not dace) (= card 1)) (incf dcount 10) (setf dace t)) (when (AND dace (> dcount 21)) (decf dcount 10) (setq dace nil)) finally (return (cond ((> dcount 21) 1) ((> dcount pc) -1) ((= dcount pc) 0) (t 1)))))))) (defun draw-card () (let (card) (setq card (card)) (incf pc card) (when (AND (not ace) (= card 1)) (incf pc 10) (setf ace t)) (when (AND ace (> pc 21)) (decf pc 10) (setq ace nil)))) (defun bust? () (> pc 21)) (defvar w) (defvar array (make-array '(10 10))) (defun gr (source ace &optional (arr array)) (loop with ace = (if ace 1 0) for i below 10 do (loop for j below 10 do (setf (aref arr i j) (aref source (+ i 1) (+ j 12) ace)))) (g::graph-surface w arr)) (defun experiment () (setup) (loop for count below 500 for ar0 = (make-array '(10 10)) for ar1 = (make-array '(10 10)) do (print count) (gr V nil ar0) (gr V t ar1) collect ar0 collect ar1 do (loop repeat 1000 do (episode))))