q-learning-terrain-navigator/q-learning-terrain-navigator.ipynb
2024-10-21 22:49:35 -04:00

578 lines
110 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import necessary libraries\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import colors\n",
"import matplotlib.animation as animation\n",
"import json\n",
"import time\n",
"import threading\n",
"import tqdm\n",
"from tqdm import tqdm\n",
"from tqdm import trange\n",
"import datetime"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create Map\n",
"Create a map for the Q-learning agent to navigate. You can choose any grid size, but the larger the grid, the more computation training will take. A grid of around 8x8 to 12x12 is a good starting point."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"pygame 2.1.0 (SDL 2.0.16, Python 3.10.14)\n",
"Hello from the pygame community. https://www.pygame.org/contribute.html\n"
]
}
],
"source": [
"!./map_generator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Importing Map Array and Displaying Map\n",
"<br>"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the saved reward grid from its JSON file into a NumPy array\n",
"with open(\"map_data.json\", \"r\") as f:\n",
"    rewards = np.array(json.load(f))\n",
"\n",
"#rewards[rewards == 1000] = 500\n",
"\n",
"# Grid dimensions (rows, columns) used by the helper functions further down\n",
"environment_rows, environment_columns = rewards.shape[0], rewards.shape[1]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One colour per reward band, listed from the lowest band to the highest\n",
"cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n",
"\n",
"# Seven band edges delimit the six colour bands above\n",
"bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 2000.5]\n",
"norm = colors.BoundaryNorm(boundaries=bounds, ncolors=cmap.N)\n",
"\n",
"# Draw the reward grid, title it and show it\n",
"plt.imshow(rewards, norm=norm, cmap=cmap)\n",
"plt.title(\"Map Visualization with Goal\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualization Functions\n",
"<br>"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def graph(q_table, save=False, title=\"\"):\n",
"    \"\"\"Plot the reward grid with one arrow per cell showing the greedy action.\n",
"\n",
"    q_table: array indexed [row, column, action]; actions are ordered\n",
"        up (0), right (1), down (2), left (3).\n",
"    save: when True, also write the figure to images/plot_<timestamp>.png\n",
"        (the images directory is created if it is missing).\n",
"    title: text drawn above the plot.\n",
"    \"\"\"\n",
"    import os  # local import: only needed for the save branch\n",
"\n",
"    q_table = np.asarray(q_table)\n",
"\n",
"    # Colour bands for the reward values: six colours, seven band edges\n",
"    cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n",
"    bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 1000.5]\n",
"    norm = colors.BoundaryNorm(bounds, cmap.N)\n",
"\n",
"    # Background: the module-level reward grid\n",
"    plt.imshow(rewards, cmap=cmap, norm=norm)\n",
"\n",
"    # Index of the highest-valued action in every cell\n",
"    optimal_directions = np.argmax(q_table, axis=2)\n",
"\n",
"    # Arrow components per action index [up, right, down, left]; with the\n",
"    # default imshow orientation y grows downwards, so 'up' is dy = -1\n",
"    step_dx = np.array([0.0, 1.0, 0.0, -1.0])\n",
"    step_dy = np.array([-1.0, 0.0, 1.0, 0.0])\n",
"\n",
"    # Draw an arrow only where at least one Q-value is non-zero. Testing the\n",
"    # sum instead would hide cells whose non-zero values cancel out.\n",
"    has_values = np.any(q_table != 0, axis=2)\n",
"    dx = np.where(has_values, step_dx[optimal_directions], 0.0)\n",
"    dy = np.where(has_values, step_dy[optimal_directions], 0.0)\n",
"\n",
"    # Cell-centre coordinates for the arrow tails\n",
"    n_rows, n_cols = optimal_directions.shape\n",
"    x, y = np.meshgrid(np.arange(n_cols), np.arange(n_rows))\n",
"\n",
"    plt.quiver(x, y, dx, dy, angles='xy', scale_units='xy', scale=1, color='black')\n",
"    plt.title(title)\n",
"\n",
"    if save:\n",
"        # savefig does not create missing directories, so make sure it exists\n",
"        os.makedirs(\"images\", exist_ok=True)\n",
"        timestamp = datetime.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"        filename = f\"images/plot_{timestamp}.png\"\n",
"        plt.savefig(filename, format='png')\n",
"\n",
"    # Display the plot with arrows\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def graph_path(path):\n",
"    \"\"\"Plot the reward grid and overlay a path as yellow arrows.\n",
"\n",
"    path: sequence of [row, column] positions in visiting order. None, or a\n",
"        falsy scalar such as the 0 that get_shortest_path returns when it\n",
"        gives up, draws the grid with no arrows instead of raising.\n",
"    \"\"\"\n",
"    # Treat 'no path' sentinels as an empty path\n",
"    if path is None or (np.isscalar(path) and not path):\n",
"        path = []\n",
"\n",
"    # Colour bands for the reward values: six colours, seven band edges\n",
"    cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n",
"    bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 1000.5]\n",
"    norm = colors.BoundaryNorm(bounds, cmap.N)\n",
"\n",
"    # Background: the module-level reward grid\n",
"    plt.imshow(rewards, cmap=cmap, norm=norm)\n",
"\n",
"    # Plot coordinates: x is the column index, y is the row index\n",
"    path_x = [pos[1] for pos in path]\n",
"    path_y = [pos[0] for pos in path]\n",
"\n",
"    # One arrow per consecutive pair of positions\n",
"    for start_x, start_y, end_x, end_y in zip(path_x, path_y, path_x[1:], path_y[1:]):\n",
"        plt.arrow(start_x, start_y, end_x - start_x, end_y - start_y, color='yellow', head_width=0.2)\n",
"\n",
"    # Display the plot with arrows\n",
"    plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Q-Learning helper functions\n",
"<br>"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Action names; everywhere else an action is referred to by its index here:\n",
"# 0 = up, 1 = right, 2 = down, 3 = left\n",
"actions = [\n",
"    'up',\n",
"    'right',\n",
"    'down',\n",
"    'left',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# An episode ends once the agent reaches the goal cell\n",
"def is_terminal_state(current_row_index, current_column_index):\n",
"    \"\"\"Return True when the cell holds the largest reward in the grid, else False.\"\"\"\n",
"    cell_reward = rewards[current_row_index, current_column_index]\n",
"    return bool(cell_reward == np.max(rewards))\n",
"\n",
"# Pick a random cell to start an episode from\n",
"def get_starting_location():\n",
"    \"\"\"Return a random (row, column) whose reward is exactly -1.\n",
"\n",
"    Cells are drawn at random until one with reward -1 comes up; the loop\n",
"    has no other exit, so the grid must contain at least one such cell.\n",
"    \"\"\"\n",
"    while True:\n",
"        row = np.random.randint(environment_rows)\n",
"        col = np.random.randint(environment_columns)\n",
"        if rewards[row, col] == -1:\n",
"            return row, col\n",
"\n",
"\n",
"# Epsilon-greedy action selection\n",
"def get_next_action(current_row_index, current_column_index, epsilon):\n",
"    \"\"\"Return an action index: random with probability epsilon, otherwise greedy.\n",
"\n",
"    The greedy choice is the argmax over the module-level q_values table\n",
"    for the given cell.\n",
"    \"\"\"\n",
"    explore = np.random.random() < epsilon\n",
"    if not explore:\n",
"        # exploit: action with the highest Q-value in this cell\n",
"        return np.argmax(q_values[current_row_index, current_column_index])\n",
"    # explore: uniformly random action index in [0, 4)\n",
"    return np.random.randint(4)\n",
"\n",
"\n",
"# Apply an action to a position on the grid\n",
"def get_next_location(current_row_index, current_column_index, action_index):\n",
"    \"\"\"Return the (row, column) reached by taking action_index from the given cell.\n",
"\n",
"    A move that would cross the edge of the grid leaves the position unchanged.\n",
"    \"\"\"\n",
"    move = actions[action_index]\n",
"    row, col = current_row_index, current_column_index\n",
"\n",
"    if move == 'up':\n",
"        if row > 0:\n",
"            row -= 1\n",
"    elif move == 'right':\n",
"        if col < environment_columns - 1:\n",
"            col += 1\n",
"    elif move == 'down':\n",
"        if row < environment_rows - 1:\n",
"            row += 1\n",
"    elif move == 'left':\n",
"        if col > 0:\n",
"            col -= 1\n",
"\n",
"    return row, col\n",
"\n",
"\n",
"# Follow the learned greedy policy from a start cell until a terminal cell is reached\n",
"def get_shortest_path(start_row_index, start_column_index, max_steps=100):\n",
"    \"\"\"Return the cells visited when always taking the highest-valued action.\n",
"\n",
"    start_row_index, start_column_index: cell to start from.\n",
"    max_steps: the walk is abandoned once more than this many moves were taken.\n",
"\n",
"    Returns a list of [row, column] pairs beginning with the start cell,\n",
"    [] when the start cell is already terminal, or 0 when the walk was\n",
"    abandoned because it exceeded max_steps.\n",
"    \"\"\"\n",
"    if is_terminal_state(start_row_index, start_column_index):\n",
"        return []  # already on a terminal cell: there are no steps to take\n",
"\n",
"    current_row_index, current_column_index = start_row_index, start_column_index\n",
"    shortest_path = [[current_row_index, current_column_index]]\n",
"    steps_taken = 0\n",
"\n",
"    while not is_terminal_state(current_row_index, current_column_index):\n",
"        # get_next_action moves randomly with probability epsilon, so epsilon\n",
"        # must be 0 to follow the learned policy (1 would give a random walk)\n",
"        action_index = get_next_action(current_row_index, current_column_index, 0.)\n",
"\n",
"        # move, then record the new cell\n",
"        current_row_index, current_column_index = get_next_location(current_row_index, current_column_index, action_index)\n",
"        shortest_path.append([current_row_index, current_column_index])\n",
"\n",
"        # a greedy policy can cycle forever; give up after max_steps moves\n",
"        steps_taken += 1\n",
"        if steps_taken > max_steps:\n",
"            return 0\n",
"    return shortest_path"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: the next cell defines q_learn_single again; after a top-to-bottom run\n",
"# that later definition is the one in effect.\n",
"def q_learn_single(epsilon = 0.9, discount_factor = 0.9, learning_rate = 0.9, epochs = 1000,):\n",
"    \"\"\"Train a fresh Q-table with single-threaded Q-learning and return it.\n",
"\n",
"    epsilon: probability of taking a random action at each step.\n",
"    discount_factor: weight given to the best Q-value of the next cell.\n",
"    learning_rate: fraction of the temporal difference applied per update.\n",
"    epochs: number of episodes to run.\n",
"\n",
"    Returns an array of shape (environment_rows, environment_columns, 4).\n",
"    \"\"\"\n",
"    q_values = np.zeros((environment_rows, environment_columns, 4))\n",
"\n",
"    for episode in tqdm(range(epochs), desc=\"Training Progress\", unit=\"epochs\", ncols=100):\n",
"        row_index, column_index = get_starting_location()\n",
"\n",
"        while not is_terminal_state(row_index, column_index):\n",
"            # Epsilon-greedy choice against the LOCAL table being trained.\n",
"            # get_next_action reads the module-level q_values instead, so it\n",
"            # would never see the updates made in this loop.\n",
"            if np.random.random() < epsilon:\n",
"                action_index = np.random.randint(4)\n",
"            else:\n",
"                action_index = np.argmax(q_values[row_index, column_index])\n",
"\n",
"            # perform the chosen action, remembering where we came from\n",
"            old_row_index, old_column_index = row_index, column_index\n",
"            row_index, column_index = get_next_location(row_index, column_index, action_index)\n",
"\n",
"            # reward for the new cell and the temporal difference\n",
"            reward = rewards[row_index, column_index]\n",
"            old_q_value = q_values[old_row_index, old_column_index, action_index]\n",
"            temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n",
"\n",
"            # update the Q-value of the previous state-action pair\n",
"            new_q_value = old_q_value + (learning_rate * temporal_difference)\n",
"            q_values[old_row_index, old_column_index, action_index] = new_q_value\n",
"\n",
"    print('Training complete!')\n",
"\n",
"    return q_values\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def q_learn_single(epsilon = 0.9, discount_factor = 0.9, learning_rate = 0.9, epochs = 1000):\n",
"    \"\"\"Train a fresh Q-table with single-threaded Q-learning and return it.\n",
"\n",
"    epsilon: probability of taking a random action at each step.\n",
"    discount_factor: weight given to the best Q-value of the next cell.\n",
"    learning_rate: fraction of the temporal difference applied per update.\n",
"    epochs: number of episodes to run.\n",
"\n",
"    Returns an array of shape (environment_rows, environment_columns, 4),\n",
"    one Q-value per cell and action (up, right, down, left).\n",
"    \"\"\"\n",
"    # Start from an all-zero table local to this call\n",
"    q_values = np.zeros((environment_rows, environment_columns, 4))\n",
"\n",
"    # One pass of the outer loop is one episode\n",
"    for episode in tqdm(range(epochs), desc=\"Training Progress\", unit=\"epochs\", ncols=100):\n",
"        # Each episode begins on a random starting cell\n",
"        row_index, column_index = get_starting_location()\n",
"\n",
"        # Keep moving until a terminal cell is reached\n",
"        while not is_terminal_state(row_index, column_index):\n",
"            # Epsilon-greedy choice against the LOCAL table being trained.\n",
"            # get_next_action reads the module-level q_values instead, so\n",
"            # calling it here would ignore everything learned in this loop.\n",
"            if np.random.random() < epsilon:\n",
"                action_index = np.random.randint(4)  # explore\n",
"            else:\n",
"                action_index = np.argmax(q_values[row_index, column_index])  # exploit\n",
"\n",
"            # Remember the cell we are leaving, then move\n",
"            old_row_index, old_column_index = row_index, column_index\n",
"            row_index, column_index = get_next_location(row_index, column_index, action_index)\n",
"\n",
"            # Reward of the cell we arrived on\n",
"            reward = rewards[row_index, column_index]\n",
"\n",
"            # Current estimate for the state-action pair just taken\n",
"            old_q_value = q_values[old_row_index, old_column_index, action_index]\n",
"\n",
"            # TD = reward + discount * (best Q-value of the next cell) - old Q-value\n",
"            temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n",
"\n",
"            # Move the estimate towards the target by learning_rate\n",
"            new_q_value = old_q_value + (learning_rate * temporal_difference)\n",
"            q_values[old_row_index, old_column_index, action_index] = new_q_value\n",
"\n",
"    print('Training complete!')\n",
"\n",
"    return q_values\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Worker run by each training thread\n",
"\n",
"def eposode(epsilon, discount_factor, learning_rate, epochs):\n",
"    \"\"\"Run `epochs` Q-learning episodes, updating the module-level q_values in place.\n",
"\n",
"    Nothing is returned; the result is the mutated q_values table.\n",
"    \"\"\"\n",
"    for _ in range(epochs):\n",
"        row, col = get_starting_location()\n",
"\n",
"        while not is_terminal_state(row, col):\n",
"            # epsilon-greedy choice of where to move next\n",
"            action = get_next_action(row, col, epsilon)\n",
"\n",
"            # take the step, keeping the cell we came from\n",
"            prev_row, prev_col = row, col\n",
"            row, col = get_next_location(prev_row, prev_col, action)\n",
"\n",
"            # temporal difference for the step just taken\n",
"            reward = rewards[row, col]\n",
"            current_estimate = q_values[prev_row, prev_col, action]\n",
"            td_error = reward + (discount_factor * np.max(q_values[row, col])) - current_estimate\n",
"\n",
"            # move the estimate towards the target by learning_rate\n",
"            q_values[prev_row, prev_col, action] = current_estimate + (learning_rate * td_error)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def q_learn_multi(epsilon=0.9, discount_factor=0.9, learning_rate=0.9, epochs=250, threads = 4):\n",
"    \"\"\"Train with several threads, each running eposode() for `epochs` episodes.\n",
"\n",
"    All workers are started, then joined; afterwards the module-level\n",
"    q_values table (which eposode updates) is returned.\n",
"    \"\"\"\n",
"    worker_args = (epsilon, discount_factor, learning_rate, epochs)\n",
"    workers = []\n",
"\n",
"    # launch the workers\n",
"    for _ in range(threads):\n",
"        worker = threading.Thread(target=eposode, args=worker_args)\n",
"        workers.append(worker)\n",
"        worker.start()\n",
"\n",
"    # wait for every worker to finish\n",
"    for worker in workers:\n",
"        worker.join()\n",
"\n",
"    print('Training complete!')\n",
"    return q_values\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Q-Learning Multi-threaded\n",
"<br>"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training complete!\n"
]
}
],
"source": [
"# Start from an all-zero module-level Q-table, then train with 12 threads\n",
"q_values = np.zeros((environment_rows, environment_columns, 4))\n",
"\n",
"q_values = q_learn_multi(epsilon=0.7, discount_factor=0.6, learning_rate=0.1, epochs=500, threads=12)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plot the arrows for the multi-threaded run and save the figure\n",
"graph(q_values, title=\"multi-thread: epsilon=0.7, discount_factor=0.6\", save=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Q-Learning Single Threaded\n",
"<br>"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Training Progress: 100%|████████████████████████████████████| 1000/1000 [00:37<00:00, 26.43epochs/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training complete!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Reset the module-level Q-table, then train on a single thread\n",
"q_values = np.zeros((environment_rows, environment_columns, 4))\n",
"\n",
"q_values = q_learn_single(epsilon=0.9, discount_factor=0.7, learning_rate=0.1, epochs=1000)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Title matches the parameters of the single-threaded run above (discount_factor is 0.7)\n",
"graph(q_values, save=True, title=\"single-thread: epsilon=0.9, discount_factor=0.7\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 4
}