/****************************************************************************** * blackjack.c * * e-soft, on-policy Monte Carlo control for Blackjack * (as described in example 5.1) * * * * NOTES: * * - We don't consider cases in which player has a natural, or * has less than twelve, because the action to be taken by the * player in those cases is obvious. * * - The game described in this example doesn't recognize the * the situation in which player reaches 5 cards without * going busted as a special situation. * ******************************************************************************/ #include <stdio.h> #include <stdlib.h> #include <string.h> void initialize(); int request_card(); int select_action(int p, int d, int u); void show_POL(); void update_card_sum(int *current, int *new, int *usable); /* Show episodes on screen? */ #define SHOW_EPISODES 0 /* How many sample episode to simulate */ #define MAX_EPISODES 5000000 /* How "soft" the policy should be */ #define EPSILON 0.1 /* What is what */ #define HIT 0 #define STICK 1 /* * Let p = player_sum, * d = dealer_showing, * u = usable_ace, * a = action. * * Q[p][d][u][a]represents the 400 action values Q(s,a), where every * state s is defined by a triple (p,d,u) */ double Q[22][11][2][2]; // <-- p={12..21}, d={1..10}, u={0..1}, a={0..1} /* * POL[p][d][u][a] represents the probability of taking action a * at state s(p,d,u) under the current policy */ double POL[22][11][2][2]; // <-- p={12..21}, d={1..10}, u={0..1}, a={0..1} /* * W[p][d][u][a] holds the number of times a state-action pair (s,a) * has been encoutered in the simulation. * Needed to incrementally calculate the average of Q(s,a). (see section * 2.5 and 5.7 for details) */ int W[22][11][2][2]; // <-- p={12..21}, d={1..10}, u={0..1}, a={0..1} /* * To be used as nodes in the linked list of state-action pairs. */ struct sa_pair{ int p; int d; int u; int a; struct sa_pair *next; }; /* * main() * * */ int main(int argc, char *argv[]){ struct sa_pair *appeared_pairs, *new_pair, *temp_ptr; int i, p, d, u, dealer_u, a, hit_result; int player_busted, dealer_busted; int dealer_has_natural; int episode_ending, card_count; printf("\n"); printf("Doing %d episodes.\n", MAX_EPISODES); printf("Change SHOW_EPISODES to 1 if you want to see them displayed.\n"); printf("\n"); initialize(); /* * Repeat "forever": * * 1. Simulate an episode and update action-values for * every state-action pair that appeared in it. * * 2. Greedily improve the policy according to the updated * action value function */ for(i = 0; i < MAX_EPISODES; i++){ appeared_pairs = NULL; player_busted = 0; dealer_busted = 0; dealer_has_natural = 0; p = 0; u = 0; /* * Give 2 cards to the player and 1 to the dealer */ hit_result = request_card(); // <-- player's first card update_card_sum(&p, &hit_result, &u); hit_result = request_card(); // <-- player's second card update_card_sum(&p, &hit_result, &u); if ((p < 12)||(p == 21)){ // <-- We don't consider these cases continue; } d = request_card(); // <-- dealer's first card if(SHOW_EPISODES) printf("New episode: p=%d, d=%d, u=%d\n", p, d, u); /* * Player plays */ while(select_action(p, d, u) == HIT){ /* * Add this state-action pair to the list of appeared (s,a) */ new_pair = (struct sa_pair *) malloc(sizeof(struct sa_pair)); new_pair->p = p; new_pair->d = d; new_pair->u = u; new_pair->a = HIT; new_pair->next = appeared_pairs; appeared_pairs = new_pair; /* * Now request for a card */ hit_result = request_card(); update_card_sum(&p, &hit_result, &u); if (p > 21){ player_busted = 1; if(SHOW_EPISODES) printf("Player hit: %d and went busted.\n", hit_result); break; } if(SHOW_EPISODES) printf("Player hit: %d and becomes p=%d, d=%d, u=%d\n", hit_result, p, d, u); } /* * If player was not busted, record the action value of the last (s,a) */ if (!player_busted){ new_pair = (struct sa_pair *) malloc(sizeof(struct sa_pair)); new_pair->p = p; new_pair->d = d; new_pair->u = u; new_pair->a = STICK; new_pair->next = appeared_pairs; appeared_pairs = new_pair; } /* * Dealer plays */ if (!player_busted){ /* * An ace as the first and only card is a usable ace * and is counted as 11 */ if (d == 1){ dealer_u = 1; d = 11; } else{ dealer_u = 0; } /* * Request for more card(s) */ card_count = 1; while (d < 17){ // <-- has to hit if less than 17 hit_result = request_card(); update_card_sum(&d, &hit_result, &dealer_u); if (d > 21){ dealer_busted = 1; if(SHOW_EPISODES) printf("Dealer hit: %d and went busted.\n", hit_result); break; } card_count++; /* * The dealer is said to have a natural if he has 2 cards AND: * - Both of them are aces * OR * - 1 is an ace, and 1 is a ten */ if((card_count == 2) && (((hit_result == 1)&&(d == 13)) || (d == 21))){ dealer_has_natural == 1; if (SHOW_EPISODES) printf("Dealer has a natural (%d, %d)\n", d, hit_result); break; } if(SHOW_EPISODES) printf("Dealer hit: %d and becomes d=%d, dealer_u=%d\n", hit_result, d, dealer_u); } } /* * Who won */ if ((dealer_has_natural) || (player_busted)){ episode_ending = -1; } else if (dealer_busted){ episode_ending = 1; } else{ if (d < p) episode_ending = 1; else if (d == p) episode_ending = 0; else episode_ending = -1; } if(SHOW_EPISODES) printf("---> Episode ended: %d\n", episode_ending); /* * Now update the action-values for * every state-action pair that appeared in this episode. */ if(SHOW_EPISODES) printf("---> Involved state-action pairs (in reverse order): \n"); while(appeared_pairs != NULL){ p = appeared_pairs->p; d = appeared_pairs->d; u = appeared_pairs->u; a = appeared_pairs->a; if(SHOW_EPISODES) printf(" %d, %d, %d, %d\n", p, d, u, a); /* * Incrementally update the average of Q(s,a): * (See section 2.5) */ Q[p][d][u][a] = Q[p][d][u][a] + (episode_ending - Q[p][d][u][a])/(W[p][d][u][a] + 1); W[p][d][u][a]++; temp_ptr = appeared_pairs->next; free(appeared_pairs); appeared_pairs = temp_ptr; } if(SHOW_EPISODES) printf("\n"); /* * Finally, improve the policy according to the updated value function */ for(p = 12; p < 22; p++){ for(d = 1; d < 11; d++){ for(u = 0; u < 2; u++){ if (Q[p][d][u][HIT] > Q[p][d][u][STICK]){ POL[p][d][u][HIT] = 1 - EPSILON + EPSILON/2; POL[p][d][u][STICK] = EPSILON/2; } else{ POL[p][d][u][STICK] = 1 - EPSILON + EPSILON/2; POL[p][d][u][HIT] = EPSILON/2; } } } } } show_POL(); exit(0); } /* * intitialize() * * */ void initialize(){ int p, d, u, a; /* * Provide a seed to the random generator */ // srand(time(NULL)); /* * Start with an arbitrary (zero) action value function */ for(p = 12; p < 22; p++){ for(d = 1; d < 11; d++){ for(u = 0; u < 2; u++){ for(a = 0; a < 2; a++){ Q[p][d][u][a] = 0; W[p][d][u][a] = 0; } } } } /* * Start with an arbitrary (always hit) policy */ for(p = 12; p < 22; p++){ for(d = 1; d < 11; d++){ for(u = 0; u < 2; u++){ POL[p][d][u][HIT] = 1 - EPSILON + EPSILON/2; POL[p][d][u][STICK] = EPSILON/2; } } } return; } /* * request_card() * * */ int request_card(){ int card; /* * There are 13 different cards in each color * Each should has an equal chance to appear */ // card = (random() % 13) + 1; // <-- Won't do. Type "man rand" to see why card = 1 + (int) (13.0*rand()/(RAND_MAX+1.0)); // <-- {1..13} /* * All the cards after 9 are counted as 10 */ if (card > 9) return 10; else return card; } /* * select_action() * * decide whether we should "hit" or "stick" */ int select_action(int p, int d, int u){ /* * Take the greedy action (1- EPSILON + EPSILON/2) percents * of the time, and non-greedy (EPSILON/2) percents of the times */ if ((rand()/(RAND_MAX+1.0)) < (EPSILON/2)){ // <-- non-greedy action should be taken if (POL[p][d][u][HIT] < POL[p][d][u][STICK]){ // <-- is HIT the "minority" action? return HIT; } else{ return STICK; } } else{ if (POL[p][d][u][HIT] < POL[p][d][u][STICK]){ return STICK; } else{ return HIT; } } } /* * update_card_sum() * * */ void update_card_sum(int *current, int *new, int *usable){ int p = *current; int hit_result = *new; int u = *usable; if ((p + hit_result) > 21){ // <-- potentially busted if (u == 1){ // <-- No problem, since he has a usable ace p = p - 10 + hit_result; u = 0; } else{ p = p + hit_result; // <-- busted } } else{ if ((hit_result == 1) && ((p + 11) < 22)){ // <-- Just got a usable ace p = p + 11; u = 1; } else{ p = p + hit_result; } } *current = p; *new = hit_result; *usable = u; return; } /* * show_POL() * * */ void show_POL(){ int action0[22][11]; // <-- No usable ace int action1[22][11]; // <-- usable ace int i, j; /* * Compile the "soft" policy into 2 deterministic policies */ for(i = 12; i < 22; i++){ for(j = 1; j < 11; j++){ /* * No usable ace */ if (POL[i][j][0][HIT] < POL[i][j][0][STICK]){ // <-- HIT is not the favorite action action0[i][j] = STICK; } else // <-- Hit is the favorite action action0[i][j] = HIT; /* * Usable ace */ if (POL[i][j][1][HIT] < POL[i][j][1][STICK]) // <-- HIT is not the favorite action action1[i][j] = STICK; else // <-- Hit is the favorite action action1[i][j] = HIT; } } /* * Now show them */ printf(" Policy for episodes that start with a usable ace\n"); printf(" | 1 2 3 4 5 6 7 8 9 10\n"); printf("-----+-------------------------------\n"); for(i = 21; i >11 ; i--){ printf(" %3d |", i); for(j = 1; j < 11; j++){ printf(" %d", action1[i][j]); } printf("\n"); } printf("\n"); printf(" Policy for episodes that start without a usable ace\n"); printf(" | 1 2 3 4 5 6 7 8 9 10\n"); printf("-----+-------------------------------\n"); for(i = 21; i >11 ; i--){ printf(" %3d |", i); for(j = 1; j < 11; j++){ printf(" %d", action0[i][j]); } printf("\n"); } printf("\n"); }