{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "2852bf96-652d-498f-9f82-5064859706fd", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "from requests import get\n", "import pandas as pd\n", "from datetime import datetime\n", "import re\n", "\n", "\n", "def load_html(filename):\n", " with open(filename, 'r') as f:\n", " html_doc = f.read()\n", " return html_doc\n", "\n", "def replace_with_newline(match):\n", " return \"\\n\" + match.group(0)[1:]\n", " \n", "\n", "name = \"SPX_long\"\n", "\n", "html_doc = load_html(\"data/\"+name+\".html\")\n", "\n", "soup = BeautifulSoup(html_doc, 'html.parser')\n", "\n", "stock_data = soup.find_all(\"table\")\n", "\n", "for stock in stock_data:\n", " value = stock.text.replace(',','').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ',')\n", " pattern = r\",[A-Z][a-z]{2}\"\n", " new_string = re.sub(pattern, replace_with_newline, value)\n", "\n", " # Remove all llines that do not have 9 commas\n", " lines = new_string.splitlines()\n", " filtered_lines = [line for line in lines if line.count(\",\") == 8]\n", " new_string = \"\\n\".join(filtered_lines)\n", " \n", " label = \"Month,Day,Year,Open,High,Low,Close,Adj Close,Volume\"\n", "\n", " data = label + \"\\n\" + new_string" ] }, { "cell_type": "code", "execution_count": 2, "id": "b283eb21-d682-4b13-8921-18c2aa6e47a1", "metadata": {}, "outputs": [], "source": [ "with open(\"data/\"+name+'.csv', 'w') as f:\n", " f.write(data)\n", "\n", "# Read the CSV file into a DataFrame\n", "df = pd.read_csv(\"data/\"+name+'.csv')\n", "\n", "\n", "\n", "item = []\n", "\n", "for i in range(len(df[\"Day\"])):\n", " value = str(df[\"Month\"].iloc[i]) + \" \" + str(df[\"Day\"].iloc[i]) + \" \" + str(df[\"Year\"].iloc[i])\n", " item.append(datetime.strptime(value, '%b %d %Y').strftime('%y-%m-%d'))\n", " \n", "\n", "df[\"Date\"] = item\n", "\n", "df = df.iloc[::-1]\n", "df = df.drop(columns=['Month','Day','Year'])\n", "\n", "\n", "df.to_csv(\"data/\"+name+'.csv', index=False)\n", "\n", "df = pd.read_csv(\"data/\"+name+'.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "63877830-a456-4bad-bdef-cc704a7c677e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }