578 lines
110 KiB
Plaintext
578 lines
110 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Import Packages"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import necessary libraries\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from matplotlib import colors\n",
|
|
"import matplotlib.animation as animation\n",
|
|
"import json\n",
|
|
"import time\n",
|
|
"import threading\n",
|
|
"import tqdm\n",
|
|
"from tqdm import tqdm\n",
|
|
"from tqdm import trange\n",
|
|
"import datetime"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Create Map\n",
|
|
"Create a map for the Q-learning algorithm to try. You can choose any grid size, but the larger the grid, the more compute it will take. I would suggest around an 8x8 to 12x12 grid."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"pygame 2.1.0 (SDL 2.0.16, Python 3.10.14)\n",
|
|
"Hello from the pygame community. https://www.pygame.org/contribute.html\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!./map_generator"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Importing Map Array and Displaying Map\n",
|
|
"<br>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load the saved map\n",
|
|
"with open(\"map_data.json\", \"r\") as f:\n",
|
|
" rewards = np.array(json.load(f))\n",
|
|
"\n",
|
|
"#rewards[rewards == 1000] = 500\n",
|
|
"\n",
|
|
"environment_rows = rewards.shape[0]\n",
|
|
"environment_columns = rewards.shape[1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Define the colormap for the grid values\n",
|
|
"cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n",
|
|
"# Bounds now account for the actual range of values, with small gaps between to handle exact matching\n",
|
|
"bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 2000.5]\n",
|
|
"norm = colors.BoundaryNorm(bounds, cmap.N)\n",
|
|
"\n",
|
|
"# Create the plot\n",
|
|
"plt.imshow(rewards, cmap=cmap, norm=norm)\n",
|
|
"\n",
|
|
"\n",
|
|
"# Display the plot\n",
|
|
"plt.title(\"Map Visualization with Goal\")\n",
|
|
"plt.show()\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Visualization Functions\n",
|
|
"<br>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def graph(q_table, save=False, title=\"\"):\n",
|
|
" # Define the colormap for the grid values\n",
|
|
" #fig, ax = plt.subplots(figsize=(8, 8), dpi=200) # Increased figure size and DPI\n",
|
|
"\n",
|
|
" cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n",
|
|
" # Bounds now account for the actual range of values, with small gaps between to handle exact matching\n",
|
|
" bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 1000.5]\n",
|
|
" norm = colors.BoundaryNorm(bounds, cmap.N)\n",
|
|
"\n",
|
|
" \n",
|
|
" # Create the plot for rewards\n",
|
|
" plt.imshow(rewards, cmap=cmap, norm=norm)\n",
|
|
" \n",
|
|
" # Calculate the optimal direction from Q-table\n",
|
|
" # Directions: up (0), right (1), down (2), left (3)\n",
|
|
" optimal_directions = np.argmax(q_table, axis=2)\n",
|
|
" \n",
|
|
" # Initialize arrays for arrow direction (dx, dy) at each grid point\n",
|
|
" dx = np.zeros_like(optimal_directions, dtype=float)\n",
|
|
" dy = np.zeros_like(optimal_directions, dtype=float)\n",
|
|
" \n",
|
|
" # Define movement deltas for [up, right, down, left]\n",
|
|
" move_map = {\n",
|
|
" 0: (0, -1), # up\n",
|
|
" 1: (1, 0), # right\n",
|
|
" 2: (0, 1), # down\n",
|
|
" 3: (-1, 0), # left\n",
|
|
" }\n",
|
|
"\n",
|
|
" # Fill in dx, dy based on optimal directions, but only if the sum of Q-values is not zero\n",
|
|
" for i in range(optimal_directions.shape[0]):\n",
|
|
" for j in range(optimal_directions.shape[1]):\n",
|
|
" if np.sum(q_table[i, j]) != 0: # Check if the Q-values are non-zero\n",
|
|
" direction = optimal_directions[i, j]\n",
|
|
" dx[i, j], dy[i, j] = move_map[direction]\n",
|
|
" \n",
|
|
" # Create a meshgrid for plotting arrows\n",
|
|
" x, y = np.meshgrid(np.arange(optimal_directions.shape[1]), np.arange(optimal_directions.shape[0]))\n",
|
|
" \n",
|
|
" # Plot arrows using quiver, only for non-zero vectors\n",
|
|
" plt.quiver(x, y, dx, dy, angles='xy', scale_units='xy', scale=1, color='black')\n",
|
|
" plt.title(title)\n",
|
|
"\n",
|
|
" if save:\n",
|
|
" timestamp = datetime.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
|
|
" filename = f\"images/plot_{timestamp}.png\"\n",
|
|
" plt.savefig(filename, format='png')\n",
|
|
" \n",
|
|
" # Display the plot with arrows\n",
|
|
" plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def graph_path(path):\n",
|
|
" # Define the colormap for the grid values\n",
|
|
" cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n",
|
|
" bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 1000.5]\n",
|
|
" norm = colors.BoundaryNorm(bounds, cmap.N)\n",
|
|
"\n",
|
|
" # Create the plot for rewards\n",
|
|
" plt.imshow(rewards, cmap=cmap, norm=norm)\n",
|
|
"\n",
|
|
" move_map = {\n",
|
|
" 0: (0, -1), # up\n",
|
|
" 1: (1, 0), # right\n",
|
|
" 2: (0, 1), # down\n",
|
|
" 3: (-1, 0), # left\n",
|
|
" }\n",
|
|
"\n",
|
|
" # Now plot the path taken by the robot\n",
|
|
" path_x = [pos[1] for pos in path]\n",
|
|
" path_y = [pos[0] for pos in path]\n",
|
|
" \n",
|
|
" # Create arrows for the robot's path\n",
|
|
" for i in range(len(path) - 1):\n",
|
|
" start_x, start_y = path_x[i], path_y[i]\n",
|
|
" end_x, end_y = path_x[i + 1], path_y[i + 1]\n",
|
|
" plt.arrow(start_x, start_y, end_x - start_x, end_y - start_y, color='yellow', head_width=0.2)\n",
|
|
"\n",
|
|
" # Display the plot with arrows\n",
|
|
" plt.show()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Q-Learning helper functions\n",
|
|
"<br>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# define actions\n",
|
|
"# we will use a numeric index to represent each action\n",
|
|
"# 0 = up, 1 = right, 2 = down, 3 = left\n",
|
|
"actions = ['up', 'right', 'down', 'left']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# a state is terminal only when it is the Goal, i.e. the cell holding the maximum reward; reaching it ends the episode\n",
|
|
"def is_terminal_state(current_row_index, current_column_index):\n",
|
|
" if rewards[current_row_index, current_column_index] != np.max(rewards): # not terminal unless this cell holds the maximum reward\n",
|
|
" return False\n",
|
|
" else:\n",
|
|
" return True\n",
|
|
"\n",
|
|
"# the starting location must be a cell whose reward is -1\n",
|
|
"def get_starting_location():\n",
|
|
" current_row_index = np.random.randint(environment_rows) # get a random row index\n",
|
|
" current_column_index = np.random.randint(environment_columns) # get a random column index\n",
|
|
" \n",
|
|
" while rewards[current_row_index, current_column_index] != -1: # keep resampling until the cell's reward is -1\n",
|
|
" current_row_index = np.random.randint(environment_rows) # repeat to get another random row index\n",
|
|
" current_column_index = np.random.randint(environment_columns) # repeat to get another random column index\n",
|
|
" return current_row_index, current_column_index # returns a random starting location whose reward is -1\n",
|
|
"\n",
|
|
"\n",
|
|
"# define an epsilon greedy algorithm for deciding the next action\n",
|
|
"def get_next_action(current_row_index, current_column_index, epsilon):\n",
|
|
" if np.random.random() < epsilon: # explore: choose a random action\n",
|
|
" return np.random.randint(4)\n",
|
|
" else: # exploit: choose the action with the highest q_value\n",
|
|
" return np.argmax(q_values[current_row_index, current_column_index])\n",
|
|
"\n",
|
|
"\n",
|
|
"# define a function that will get the next location based on the chosen action\n",
|
|
"# rows grow downward and columns grow rightward, matching how the board is drawn; a move that would leave the grid keeps the agent in place\n",
|
|
"def get_next_location(current_row_index, current_column_index, action_index):\n",
|
|
" new_row_index = current_row_index\n",
|
|
" new_column_index = current_column_index\n",
|
|
" if actions[action_index] == 'up' and current_row_index > 0:\n",
|
|
" new_row_index -= 1\n",
|
|
" elif actions[action_index] == 'right' and current_column_index < environment_columns - 1:\n",
|
|
" new_column_index += 1\n",
|
|
" elif actions[action_index] == 'down' and current_row_index < environment_rows - 1:\n",
|
|
" new_row_index += 1\n",
|
|
" elif actions[action_index] == 'left' and current_column_index > 0:\n",
|
|
" new_column_index -= 1\n",
|
|
" return new_row_index, new_column_index\n",
|
|
"\n",
|
|
"\n",
|
|
"# Define a function that follows get_next_action from the start until Goal; it returns [] if the start is already Goal and 0 if it gives up after more than 100 steps\n",
|
|
"def get_shortest_path(start_row_index, start_column_index):\n",
|
|
" i = 0\n",
|
|
" if is_terminal_state(start_row_index, start_column_index): # check if it is already on Goal\n",
|
|
" return [] # if yes, there are no available steps\n",
|
|
" \n",
|
|
" else: #if this is a 'legal' starting location\n",
|
|
" current_row_index, current_column_index = start_row_index, start_column_index\n",
|
|
" shortest_path = []\n",
|
|
" shortest_path.append([current_row_index, current_column_index]) # add the current coordinate to the list\n",
|
|
"\n",
|
|
" while not is_terminal_state(current_row_index, current_column_index): # repeat until we reach Goal\n",
|
|
" action_index = get_next_action(current_row_index, current_column_index, 1.) \n",
|
|
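" # get next coordinate (NOTE: get_next_action explores when random() < epsilon, so the epsilon of 1. passed above always yields a random move; pass 0. to follow the learned greedy policy)\n",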
|
|
" \n",
|
|
" current_row_index, current_column_index = get_next_location(current_row_index, current_column_index, action_index)\n",
|
|
" # update that next coordinate as current coordinate\n",
|
|
" \n",
|
|
" shortest_path.append([current_row_index, current_column_index]) \n",
|
|
" # add the current coordinate to the list\n",
|
|
"\n",
|
|
" i += 1\n",
|
|
" if i > 100:\n",
|
|
" return 0;\n",
|
|
" return shortest_path"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def q_learn_single(epsilon = 0.9, discount_factor = 0.9, learning_rate = 0.9, epochs = 1000,):\n",
|
|
" q_values = np.zeros((environment_rows, environment_columns, 4))\n",
|
|
" \n",
|
|
" for episode in tqdm(range(epochs), desc=\"Training Progress\", unit=\"epochs\", ncols=100): # Adjust `ncols` to shorten the bar\n",
|
|
" row_index, column_index = get_starting_location()\n",
|
|
"\n",
|
|
" while not is_terminal_state(row_index, column_index):\n",
|
|
" # choose which action to take (i.e., where to move next)\n",
|
|
" action_index = get_next_action(row_index, column_index, epsilon)\n",
|
|
"\n",
|
|
" # perform the chosen action, and transition to the next state / next location\n",
|
|
" old_row_index, old_column_index = row_index, column_index # store the old row and column indexes\n",
|
|
" row_index, column_index = get_next_location(row_index, column_index, action_index)\n",
|
|
"\n",
|
|
" # receive the reward for moving to the new state, and calculate the temporal difference\n",
|
|
" reward = rewards[row_index, column_index]\n",
|
|
" old_q_value = q_values[old_row_index, old_column_index, action_index]\n",
|
|
" temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n",
|
|
"\n",
|
|
" # update the Q-value for the previous state and action pair\n",
|
|
" new_q_value = old_q_value + (learning_rate * temporal_difference)\n",
|
|
" q_values[old_row_index, old_column_index, action_index] = new_q_value\n",
|
|
"\n",
|
|
" print('Training complete!')\n",
|
|
"\n",
|
|
" return q_values\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def q_learn_single(epsilon = 0.9, discount_factor = 0.9, learning_rate = 0.9, epochs = 1000):\n",
|
|
" # Initialize the Q-table with zeros for each state-action pair\n",
|
|
" # The shape is (environment_rows, environment_columns, 4) \n",
|
|
" # where 4 represents the 4 possible actions (up, right, down, left)\n",
|
|
" q_values = np.zeros((environment_rows, environment_columns, 4))\n",
|
|
" \n",
|
|
" # Iterate through a number of episodes (i.e., learning cycles)\n",
|
|
" for episode in tqdm(range(epochs), desc=\"Training Progress\", unit=\"epochs\", ncols=100):\n",
|
|
" # Start each episode by selecting a random starting location in the environment\n",
|
|
" row_index, column_index = get_starting_location()\n",
|
|
"\n",
|
|
" # Continue taking actions until the agent reaches a terminal state\n",
|
|
" while not is_terminal_state(row_index, column_index):\n",
|
|
" # Choose the next action based on an epsilon-greedy policy\n",
|
|
" # This function should balance exploration (random) vs exploitation (best known action)\n",
|
|
" action_index = get_next_action(row_index, column_index, epsilon)\n",
|
|
"\n",
|
|
" # Save the old position before taking the action\n",
|
|
" old_row_index, old_column_index = row_index, column_index\n",
|
|
" \n",
|
|
" # Move to the new state based on the chosen action\n",
|
|
" row_index, column_index = get_next_location(row_index, column_index, action_index)\n",
|
|
"\n",
|
|
" # Get the reward for the new state the agent has moved to\n",
|
|
" reward = rewards[row_index, column_index]\n",
|
|
" \n",
|
|
" # Retrieve the Q-value of the old state-action pair\n",
|
|
" old_q_value = q_values[old_row_index, old_column_index, action_index]\n",
|
|
"\n",
|
|
" # Calculate the temporal difference: \n",
|
|
" # TD = Reward + Discount * (Max Q-value for the next state) - Old Q-value\n",
|
|
" temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n",
|
|
"\n",
|
|
" # Update the Q-value for the previous state-action pair using the learning rate\n",
|
|
" new_q_value = old_q_value + (learning_rate * temporal_difference)\n",
|
|
" q_values[old_row_index, old_column_index, action_index] = new_q_value # Assign updated value\n",
|
|
"\n",
|
|
" # After all episodes, print a message indicating the training is complete\n",
|
|
" print('Training complete!')\n",
|
|
"\n",
|
|
" # Return the Q-values for further use (e.g., evaluation or exploitation phase)\n",
|
|
" return q_values\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# worker run by each training thread: plays `epochs` episodes and updates the shared global q_values\n",
|
|
"\n",
|
|
"def eposode(epsilon, discount_factor, learning_rate, epochs):\n",
|
|
" for episode in range(epochs):\n",
|
|
" row_index, column_index = get_starting_location()\n",
|
|
" \n",
|
|
" while not is_terminal_state(row_index, column_index):\n",
|
|
" # choose which action to take (i.e., where to move next)\n",
|
|
" action_index = get_next_action(row_index, column_index, epsilon)\n",
|
|
" \n",
|
|
" # perform the chosen action, and transition to the next state / next location\n",
|
|
" old_row_index, old_column_index = row_index, column_index # store the old row and column indexes\n",
|
|
" row_index, column_index = get_next_location(row_index, column_index, action_index)\n",
|
|
" \n",
|
|
" # receive the reward for moving to the new state, and calculate the temporal difference\n",
|
|
" reward = rewards[row_index, column_index]\n",
|
|
" old_q_value = q_values[old_row_index, old_column_index, action_index]\n",
|
|
" temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n",
|
|
" \n",
|
|
" # update the Q-value for the previous state and action pair\n",
|
|
" new_q_value = old_q_value + (learning_rate * temporal_difference)\n",
|
|
" q_values[old_row_index, old_column_index, action_index] = new_q_value\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def q_learn_multi(epsilon=0.9, discount_factor=0.9, learning_rate=0.9, epochs=250, threads = 4):\n",
|
|
" \n",
|
|
" thread_array = []\n",
|
|
"\n",
|
|
" \n",
|
|
" for num in range(threads):\n",
|
|
" thread = threading.Thread(target=eposode, args=(epsilon, discount_factor, learning_rate, epochs))\n",
|
|
" thread_array.append(thread)\n",
|
|
" thread.start()\n",
|
|
"\n",
|
|
" for thread in thread_array:\n",
|
|
" thread.join()\n",
|
|
" print('Training complete!')\n",
|
|
"\n",
|
|
" return q_values\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Q-Learning Multi-threaded\n",
|
|
"<br>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Training complete!\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"q_values = np.zeros((environment_rows, environment_columns, 4))\n",
|
|
"\n",
|
|
"q_values = q_learn_multi(0.7, 0.6, 0.1, 500, 12)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"graph(q_values, save=True, title=\"multi-thread: epsilon=0.7, discount_factor=0.6\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Q-Learning Single Threaded\n",
|
|
"<br>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Training Progress: 100%|████████████████████████████████████| 1000/1000 [00:37<00:00, 26.43epochs/s]"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Training complete!\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"q_values = np.zeros((environment_rows, environment_columns, 4))\n",
|
|
"\n",
|
|
"q_values = q_learn_single(0.9, 0.7, 0.1, 1000)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"graph(q_values, save=True, title=\"single-thread: epsilon=0.9, discount_factor=0.6\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.14"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|