diff --git a/README.md b/README.md index 7702ee2..a0843de 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,69 @@ -# q-learning-terrain-navigator +# Q-Learning Terrain Navigator +This repository demonstrates the application of the Q-learning algorithm in a Jupyter notebook environment. The Q-learning algorithm is used to navigate a terrain map and learn the optimal path. +## Jupyter Notebook + + + +### Description +This notebook shows how Q-learning works and demonstrates how it can learn to navigate a terrain map and find the optimal path to travel down. This map is made using a pygame program that allows you to interactively draw out the different parts of the terrain. + +### Features +- Create a map for the Q-learning algorithm to try using pygame +- Visualize the map and q-table using matplotlib +- Implement a multi-threaded version of Q-learning +- Compare different hyperparameters of the Q-learning algorithm + +### Requirements +- Python 3.x +- `jupyter lab` or `jupyter notebook` + +```bash +pip install numpy matplotlib tqdm +``` + + + +## Pygame Map Builder + + + +### Description +This Pygame-based map editor allows users to generate, edit, and save custom maps. Users can define grid sizes, place various terrain types, and automatically apply boundary walls around the grid. The map is saved to a JSON file, and it will load from the file if it exists on startup. + +### Features +- **Adjustable Grid Size**: Use the slider on the right to set the grid size from 5x5 to 15x15. +- **Terrain Types**: Place different terrain types using keyboard shortcuts. +- **Boundary Walls**: Automatically creates a boundary with walls (-1000) around the grid. +- **Save/Load Functionality**: The map is saved to and loaded from `map_data.json`. If you have already created a map it will automatically detect that and load it on start, so you can make simple changes without having to rebuild it again. 
+### Controls +- **Slider**: Adjust grid size (5x5 to 15x15) using the slider on the right panel. +- **Mouse Click**: Click inside the grid to select a cell and place terrain based on the active key shortcut. + +### Terrain Shortcuts: +- `G`: Place **Goal** (1000) +- `R`: Place **Road** (-1) +- `C`: Place **Cliff** (-100) +- `V`: Place **River** (-10) +- `M`: Place **Mountain** (-50) + +### How to Use +1. **Start the Program**: Run the Python script to start the map editor. +2. **Set Grid Size**: Use the slider on the right panel to adjust the grid size between 5x5 and 15x15. +3. **Edit the Map**: Click on cells and use the keyboard shortcuts to place terrain types. +4. **Save/Load**: On exiting, the current map is saved to `map_data.json`. The map will automatically load the next time the program is started if the file exists. + +### Requirements +- Python 3.x +- `pygame` library +- `numpy` library + +### Installation +Install dependencies using: +```bash +pip install pygame numpy +``` +### Running the Program +```bash +./map_generator +``` diff --git a/docs/map_generator.png b/docs/map_generator.png new file mode 100644 index 0000000..9df654a Binary files /dev/null and b/docs/map_generator.png differ diff --git a/docs/q-table.png b/docs/q-table.png new file mode 100644 index 0000000..29feeb6 Binary files /dev/null and b/docs/q-table.png differ diff --git a/images/plot_20241021_224820.png b/images/plot_20241021_224820.png new file mode 100644 index 0000000..7ca5da6 Binary files /dev/null and b/images/plot_20241021_224820.png differ diff --git a/images/plot_20241021_224858.png b/images/plot_20241021_224858.png new file mode 100644 index 0000000..240c1f9 Binary files /dev/null and b/images/plot_20241021_224858.png differ diff --git a/map_data.json b/map_data.json new file mode 100644 index 0000000..bcfce64 --- /dev/null +++ b/map_data.json @@ -0,0 +1 @@ +[[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0], 
[-1000.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -50.0, -50.0, -1000.0], [-1000.0, -1.0, -100.0, -1.0, -1.0, -1.0, -1.0, -50.0, -50.0, -1.0, -1.0, -1000.0], [-1000.0, -1.0, -100.0, -1.0, -1.0, -1.0, -50.0, -50.0, -100.0, -1.0, -1.0, -1000.0], [-1000.0, -100.0, -100.0, -1.0, -1.0, -50.0, -50.0, -50.0, -100.0, -1.0, -1.0, -1000.0], [-1000.0, -100.0, -1.0, -1.0, -10.0, -10.0, -50.0, -50.0, -1.0, -1.0, -1.0, -1000.0], [-1000.0, -1.0, -1.0, -100.0, -10.0, -10.0, -10.0, -50.0, -1.0, -1.0, -1.0, -1000.0], [-1000.0, -1.0, -100.0, -100.0, -1.0, -10.0, -10.0, -10.0, -50.0, -1.0, -1.0, -1000.0], [-1000.0, -1.0, -100.0, -1.0, -1.0, -1.0, -10.0, -10.0, -10.0, -50.0, -1.0, -1000.0], [-1000.0, -1.0, -100.0, -1.0, -1.0, -1.0, -1.0, -10.0, -10.0, -10.0, -1.0, -1000.0], [-1000.0, 1000.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -10.0, -10.0, -1.0, -1000.0], [-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0]] \ No newline at end of file diff --git a/map_generator b/map_generator new file mode 100755 index 0000000..21cb746 --- /dev/null +++ b/map_generator @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 + + +import pygame +import numpy as np +import json +import os + +# Pygame setup +pygame.init() +width, height = 800, 600 # Adjust window size to make space for the shortcut panel +rows, cols = 10, 10 # Default size of the grid (can be changed) +cell_size = min(width // (cols + 4), height // rows) # Adjust cell size based on window and grid size + +# Colors +colors = { + 'wall': (0, 0, 0), # Black for wall (boundary) + 'cliff': (255, 0, 0), # Red for cliff + 'road': (128, 128, 128), # Grey for road + 'goal': (0, 255, 0), # Green for goal + 'river': (0, 0, 255), # Blue for river + 'mountain': (139, 69, 19), # Brown for mountain + 'empty': (255, 255, 255) # White for default +} + +# Create initial map array with a dynamic size +def create_map(rows, cols): + # Create a new map array and set boundary values to -1000 + new_map = 
np.full((rows, cols), -1.0) # Default to road (-1) + new_map[0, :] = -1000 # Top boundary + new_map[-1, :] = -1000 # Bottom boundary + new_map[:, 0] = -1000 # Left boundary + new_map[:, -1] = -1000 # Right boundary + return new_map + +# Function to load the map from the JSON file and set the size accordingly +def load_map(): + if os.path.exists("map_data.json"): + with open("map_data.json", "r") as f: + loaded_map = np.array(json.load(f)) + return loaded_map, loaded_map.shape[0], loaded_map.shape[1] # Return map and its dimensions + return create_map(rows, cols), rows, cols # If no file exists, return default map + +# Load the map and set the initial size based on the file +map_array, rows, cols = load_map() +cell_size = min(width // (cols + 4), height // rows) # Adjust cell size based on loaded grid size + +# Create the window +screen = pygame.display.set_mode((width, height)) +pygame.display.set_caption("Map Editor") + +# Slider parameters +slider_x = width - 100 +slider_y = 350 +slider_height = 200 +slider_pos = slider_y + (slider_height // 2) # Initial slider position +min_size = 5 +max_size = 15 +slider_value = rows # Default slider value corresponds to the loaded grid size + +# Function to draw the grid +def draw_grid(): + for row in range(rows): + for col in range(cols): + value = map_array[row, col] + if value == -1000: + color = colors['wall'] # Black for walls + elif value == -100: + color = colors['cliff'] + elif value == -50: + color = colors['mountain'] + elif value == -10: + color = colors['river'] + elif value == -1: + color = colors['road'] + elif value == 1000: + color = colors['goal'] + else: + color = colors['empty'] + pygame.draw.rect(screen, color, (col * cell_size, row * cell_size, cell_size, cell_size)) + pygame.draw.rect(screen, (0, 0, 0), (col * cell_size, row * cell_size, cell_size, cell_size), 1) + +# Function to display shortcut panel and slider +def draw_side_panel(): + font = pygame.font.SysFont(None, 24) + shortcuts = [ + 
"Shortcuts:", + "G: Goal (1000)", + "R: Road (-1)", + "C: Cliff (-100)", + "V: River (-10)", + "M: Mountain (-50)" + ] + for i, text in enumerate(shortcuts): + img = font.render(text, True, (0, 0, 0)) + screen.blit(img, (cols * cell_size + 10, i * 30 + 10)) + + # Draw the slider + pygame.draw.rect(screen, (150, 150, 150), (slider_x, slider_y, 20, slider_height)) # Slider track + pygame.draw.circle(screen, (0, 0, 0), (slider_x + 10, slider_pos), 10) # Slider knob + label = font.render(f"Size: {slider_value}x{slider_value}", True, (0, 0, 0)) + screen.blit(label, (slider_x - 10, slider_y - 30)) # Display current grid size + +# Main loop +running = True +dragging_slider = False + +while running: + screen.fill((255, 255, 255)) # Fill the background + + for event in pygame.event.get(): + if event.type == pygame.QUIT: + # Save map to file on exit + with open("map_data.json", "w") as f: + json.dump(map_array.tolist(), f) + running = False + + # Handle mouse press on slider + if event.type == pygame.MOUSEBUTTONDOWN: + mouse_x, mouse_y = pygame.mouse.get_pos() + if slider_x <= mouse_x <= slider_x + 20 and slider_y <= mouse_y <= slider_y + slider_height: + dragging_slider = True + + # Handle mouse release for slider + if event.type == pygame.MOUSEBUTTONUP: + dragging_slider = False + + # Handle dragging of the slider + if dragging_slider: + mouse_y = pygame.mouse.get_pos()[1] + slider_pos = max(slider_y, min(slider_y + slider_height, mouse_y)) + # Map the slider position to a value between min_size and max_size + slider_value = min_size + (slider_pos - slider_y) * (max_size - min_size) // slider_height + rows, cols = slider_value, slider_value + map_array = create_map(rows, cols) + cell_size = min(width // (cols + 4), height // rows) + + # Handle mouse clicks for grid drawing + if not dragging_slider and pygame.mouse.get_pressed()[0]: + x, y = pygame.mouse.get_pos() + if x < cols * cell_size: # Only allow clicking inside the grid + col, row = x // cell_size, y // cell_size + 
+ # Change the value based on key press + keys = pygame.key.get_pressed() + if keys[pygame.K_g]: # 'g' for Goal + map_array[row, col] = 1000 + elif keys[pygame.K_r]: # 'r' for Road + map_array[row, col] = -1 + elif keys[pygame.K_c]: # 'c' for Cliff + map_array[row, col] = -100 + elif keys[pygame.K_v]: # 'v' for River + map_array[row, col] = -10 + elif keys[pygame.K_m]: # 'm' for Mountain + map_array[row, col] = -50 + + # Redraw grid and side panel + draw_grid() + draw_side_panel() + pygame.display.update() + +pygame.quit() diff --git a/q-learning-terrain-navigator.ipynb b/q-learning-terrain-navigator.ipynb new file mode 100644 index 0000000..19c48ba --- /dev/null +++ b/q-learning-terrain-navigator.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import necessary libraries\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib import colors\n", + "import matplotlib.animation as animation\n", + "import json\n", + "import time\n", + "import threading\n", + "import tqdm\n", + "from tqdm import tqdm\n", + "from tqdm import trange\n", + "import datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Map\n", + "Create a map for the Q-learning algorithm to try. You can choose any grid size, but the larger the grid, the more compute it will take. I would suggest around an 8x8 to 12x12 grid." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pygame 2.1.0 (SDL 2.0.16, Python 3.10.14)\n", + "Hello from the pygame community. 
https://www.pygame.org/contribute.html\n" + ] + } + ], + "source": [ + "!./map_generator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importing Map Array and Displaying Map\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the saved map\n", + "with open(\"map_data.json\", \"r\") as f:\n", + " rewards = np.array(json.load(f))\n", + "\n", + "#rewards[rewards == 1000] = 500\n", + "\n", + "environment_rows = rewards.shape[0]\n", + "environment_columns = rewards.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Define the colormap for the grid values\n", + "cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n", + "# Bounds now account for the actual range of values, with small gaps between to handle exact matching\n", + "bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 2000.5]\n", + "norm = colors.BoundaryNorm(bounds, cmap.N)\n", + "\n", + "# Create the plot\n", + "plt.imshow(rewards, cmap=cmap, norm=norm)\n", + "\n", + "\n", + "# Display the plot\n", + "plt.title(\"Map Visualization with Goal\")\n", + "plt.show()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualization Functions\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def graph(q_table, save=False, title=\"\"):\n", + " # Define the colormap for the grid values\n", + " #fig, ax = plt.subplots(figsize=(8, 8), dpi=200) # Increased figure size and DPI\n", + "\n", + " cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n", + " # Bounds now account for the actual range of values, with small gaps between to handle exact matching\n", + " bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 1000.5]\n", + " norm = colors.BoundaryNorm(bounds, cmap.N)\n", + "\n", + " \n", + " # Create the plot for rewards\n", + " plt.imshow(rewards, cmap=cmap, norm=norm)\n", + " \n", + " # Calculate the optimal direction from Q-table\n", + " # Directions: up (0), right (1), down (2), left (3)\n", + " optimal_directions = np.argmax(q_table, axis=2)\n", + " \n", + " # Initialize arrays for arrow direction (dx, dy) at each grid point\n", + " dx = np.zeros_like(optimal_directions, dtype=float)\n", + " dy = np.zeros_like(optimal_directions, dtype=float)\n", + " \n", + " # Define movement deltas for [up, right, down, left]\n", + " move_map = {\n", + " 0: (0, -1), # up\n", + " 1: (1, 0), # right\n", + " 2: (0, 1), # down\n", + " 3: (-1, 0), # left\n", + " }\n", + "\n", + " # Fill in dx, dy based on optimal directions, but only if the sum of Q-values is not zero\n", + " for i in range(optimal_directions.shape[0]):\n", + " for j in range(optimal_directions.shape[1]):\n", + " if np.sum(q_table[i, j]) != 0: # Check if the Q-values are non-zero\n", + " direction = optimal_directions[i, j]\n", + " dx[i, j], dy[i, j] = move_map[direction]\n", + " \n", + " # Create a meshgrid for plotting arrows\n", + " x, y = np.meshgrid(np.arange(optimal_directions.shape[1]), np.arange(optimal_directions.shape[0]))\n", + " \n", + " # Plot arrows using quiver, only for non-zero vectors\n", + " plt.quiver(x, y, dx, dy, angles='xy', 
scale_units='xy', scale=1, color='black')\n", + " plt.title(title)\n", + "\n", + " if save:\n", + " timestamp = datetime.datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", + " filename = f\"images/plot_{timestamp}.png\"\n", + " plt.savefig(filename, format='png')\n", + " \n", + " # Display the plot with arrows\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def graph_path(path):\n", + " # Define the colormap for the grid values\n", + " cmap = colors.ListedColormap(['black', 'red', (0.5451, 0.2706, 0.0745), 'blue', 'gray', (0,1,0)])\n", + " bounds = [-1000.5, -100.5, -99.5, -49.5, -9, -0.5, 1000.5]\n", + " norm = colors.BoundaryNorm(bounds, cmap.N)\n", + "\n", + " # Create the plot for rewards\n", + " plt.imshow(rewards, cmap=cmap, norm=norm)\n", + "\n", + " move_map = {\n", + " 0: (0, -1), # up\n", + " 1: (1, 0), # right\n", + " 2: (0, 1), # down\n", + " 3: (-1, 0), # left\n", + " }\n", + "\n", + " # Now plot the path taken by the robot\n", + " path_x = [pos[1] for pos in path]\n", + " path_y = [pos[0] for pos in path]\n", + " \n", + " # Create arrows for the robot's path\n", + " for i in range(len(path) - 1):\n", + " start_x, start_y = path_x[i], path_y[i]\n", + " end_x, end_y = path_x[i + 1], path_y[i + 1]\n", + " plt.arrow(start_x, start_y, end_x - start_x, end_y - start_y, color='yellow', head_width=0.2)\n", + "\n", + " # Display the plot with arrows\n", + " plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q-Learning helper functions\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# define actions\n", + "# we will use numeric (index) to represent the actions\n", + "# 0 = up, 1 = right, 2 = down, 3 = left\n", + "actions = ['up', 'right', 'down', 'left']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# because we will end the episode if we reach Goal\n", + "def is_terminal_state(current_row_index, current_column_index):\n", + " if rewards[current_row_index, current_column_index] != np.max(rewards): # it is not terminal if the rewards is -1\n", + " return False\n", + " else:\n", + " return True\n", + "\n", + "# this starting location must not be on the road\n", + "def get_starting_location():\n", + " current_row_index = np.random.randint(environment_rows) # get a random row index\n", + " current_column_index = np.random.randint(environment_columns) # get a random column index\n", + " \n", + " while rewards[current_row_index, current_column_index] != -1: # True if it is terminal\n", + " current_row_index = np.random.randint(environment_rows) # repeat to get another random row index\n", + " current_column_index = np.random.randint(environment_columns) # repeat to get another random row index\n", + " return current_row_index, current_column_index # returns a random starting location that is not terminal\n", + "\n", + "\n", + "# define an epsilon greedy algorithm for deciding the next action\n", + "def get_next_action(current_row_index, current_column_index, epsilon):\n", + " if np.random.random() < epsilon: # choose the action with the highest q_values\n", + " return np.random.randint(4)\n", + " else: # choose a random action\n", + " return np.argmax(q_values[current_row_index, current_column_index])\n", + "\n", + "\n", + "# define a function that will get the next location based on the chosen action\n", + "# refer to how the board is drawn physically, with the rows and 
columns\n", + "def get_next_location(current_row_index, current_column_index, action_index):\n", + " new_row_index = current_row_index\n", + " new_column_index = current_column_index\n", + " if actions[action_index] == 'up' and current_row_index > 0:\n", + " new_row_index -= 1\n", + " elif actions[action_index] == 'right' and current_column_index < environment_columns - 1:\n", + " new_column_index += 1\n", + " elif actions[action_index] == 'down' and current_row_index < environment_rows - 1:\n", + " new_row_index += 1\n", + " elif actions[action_index] == 'left' and current_column_index > 0:\n", + " new_column_index -= 1\n", + " return new_row_index, new_column_index\n", + "\n", + "\n", + "# Define a function that will get the shortest path that is on the white tiles \n", + "def get_shortest_path(start_row_index, start_column_index):\n", + " i = 0\n", + " if is_terminal_state(start_row_index, start_column_index): # check if it is already on the Goal\n", + " return [] # if yes, there are no available steps\n", + " \n", + " else: #if this is a 'legal' starting location\n", + " current_row_index, current_column_index = start_row_index, start_column_index\n", + " shortest_path = []\n", + " shortest_path.append([current_row_index, current_column_index]) # add the current coordinate to the list\n", + "\n", + " while not is_terminal_state(current_row_index, current_column_index): # repeat until we reach the Goal\n", + " action_index = get_next_action(current_row_index, current_column_index, 1.) 
\n", + " # get next coordinate \n", + " \n", + " current_row_index, current_column_index = get_next_location(current_row_index, current_column_index, action_index)\n", + " # update that next coordinate as current coordinate\n", + " \n", + " shortest_path.append([current_row_index, current_column_index]) \n", + " # add the current coordinate to the list\n", + "\n", + " i += 1\n", + " if i > 100:\n", + " return 0;\n", + " return shortest_path" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def q_learn_single(epsilon = 0.9, discount_factor = 0.9, learning_rate = 0.9, epochs = 1000,):\n", + " q_values = np.zeros((environment_rows, environment_columns, 4))\n", + " \n", + " for episode in tqdm(range(epochs), desc=\"Training Progress\", unit=\"epochs\", ncols=100): # Adjust `ncols` to shorten the bar\n", + " row_index, column_index = get_starting_location()\n", + "\n", + " while not is_terminal_state(row_index, column_index):\n", + " # choose which action to take (i.e., where to move next)\n", + " action_index = get_next_action(row_index, column_index, epsilon)\n", + "\n", + " # perform the chosen action, and transition to the next state / next location\n", + " old_row_index, old_column_index = row_index, column_index # store the old row and column indexes\n", + " row_index, column_index = get_next_location(row_index, column_index, action_index)\n", + "\n", + " # receive the reward for moving to the new state, and calculate the temporal difference\n", + " reward = rewards[row_index, column_index]\n", + " old_q_value = q_values[old_row_index, old_column_index, action_index]\n", + " temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n", + "\n", + " # update the Q-value for the previous state and action pair\n", + " new_q_value = old_q_value + (learning_rate * temporal_difference)\n", + " q_values[old_row_index, old_column_index, action_index] = 
new_q_value\n", + "\n", + " print('Training complete!')\n", + "\n", + " return q_values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def q_learn_single(epsilon = 0.9, discount_factor = 0.9, learning_rate = 0.9, epochs = 1000):\n", + " # Initialize the Q-table with zeros for each state-action pair\n", + " # The shape is (environment_rows, environment_columns, 4) \n", + " # where 4 represents 4 possible actions (e.g., up, down, left, right)\n", + " q_values = np.zeros((environment_rows, environment_columns, 4))\n", + " \n", + " # Iterate through a number of episodes (i.e., learning cycles)\n", + " for episode in tqdm(range(epochs), desc=\"Training Progress\", unit=\"epochs\", ncols=100):\n", + " # Start each episode by selecting a random starting location in the environment\n", + " row_index, column_index = get_starting_location()\n", + "\n", + " # Continue taking actions until the agent reaches a terminal state\n", + " while not is_terminal_state(row_index, column_index):\n", + " # Choose the next action based on an epsilon-greedy policy\n", + " # This function should balance exploration (random) vs exploitation (best known action)\n", + " action_index = get_next_action(row_index, column_index, epsilon)\n", + "\n", + " # Save the old position before taking the action\n", + " old_row_index, old_column_index = row_index, column_index\n", + " \n", + " # Move to the new state based on the chosen action\n", + " row_index, column_index = get_next_location(row_index, column_index, action_index)\n", + "\n", + " # Get the reward for the new state the agent has moved to\n", + " reward = rewards[row_index, column_index]\n", + " \n", + " # Retrieve the Q-value of the old state-action pair\n", + " old_q_value = q_values[old_row_index, old_column_index, action_index]\n", + "\n", + " # Calculate the temporal difference: \n", + " # TD = Reward + Discount * (Max Q-value for the next state) - Old Q-value\n", + " 
temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n", + "\n", + " # Update the Q-value for the previous state-action pair using the learning rate\n", + " new_q_value = old_q_value + (learning_rate * temporal_difference)\n", + " q_values[old_row_index, old_column_index, action_index] = new_q_value # Assign updated value\n", + "\n", + " # After all episodes, print a message indicating the training is complete\n", + " print('Training complete!')\n", + "\n", + " # Return the Q-values for further use (e.g., evaluation or exploitation phase)\n", + " return q_values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# single episode\n", + "\n", + "def eposode(epsilon, discount_factor, learning_rate, epochs):\n", + " for episode in range(epochs):\n", + " row_index, column_index = get_starting_location()\n", + " \n", + " while not is_terminal_state(row_index, column_index):\n", + " # choose which action to take (i.e., where to move next)\n", + " action_index = get_next_action(row_index, column_index, epsilon)\n", + " \n", + " # perform the chosen action, and transition to the next state / next location\n", + " old_row_index, old_column_index = row_index, column_index # store the old row and column indexes\n", + " row_index, column_index = get_next_location(row_index, column_index, action_index)\n", + " \n", + " # receive the reward for moving to the new state, and calculate the temporal difference\n", + " reward = rewards[row_index, column_index]\n", + " old_q_value = q_values[old_row_index, old_column_index, action_index]\n", + " temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value\n", + " \n", + " # update the Q-value for the previous state and action pair\n", + " new_q_value = old_q_value + (learning_rate * temporal_difference)\n", + " q_values[old_row_index, old_column_index, action_index] = 
new_q_value\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def q_learn_multi(epsilon=0.9, discount_factor=0.9, learning_rate=0.9, epochs=250, threads = 4):\n", + " \n", + " thread_array = []\n", + "\n", + " \n", + " for num in range(threads):\n", + " thread = threading.Thread(target=eposode, args=(epsilon, discount_factor, learning_rate, epochs))\n", + " thread_array.append(thread)\n", + " thread.start()\n", + "\n", + " for thread in thread_array:\n", + " thread.join()\n", + " print('Training complete!')\n", + "\n", + " return q_values\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q-Learning Multi-threaded\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training complete!\n" + ] + } + ], + "source": [ + "q_values = np.zeros((environment_rows, environment_columns, 4))\n", + "\n", + "q_values = q_learn_multi(0.7, 0.6, 0.1, 500, 12)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "graph(q_values, save=True, title=\"multi-thread: epsilon=0.7, discount_factor=0.6\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q-Learning Single Threaded\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training Progress: 100%|████████████████████████████████████| 1000/1000 [00:37<00:00, 26.43epochs/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training complete!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "q_values = np.zeros((environment_rows, environment_columns, 4))\n", + "\n", + "q_values = q_learn_single(0.9, 0.7, 0.1, 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "graph(q_values, save=True, title=\"single-thread: epsilon=0.9, discount_factor=0.6\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}