{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2852bf96-652d-498f-9f82-5064859706fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import re\n",
    "from datetime import datetime\n",
    "\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from requests import get\n",
    "\n",
    "\n",
    "def load_html(filename):\n",
    "    \"\"\"Return the full text content of the file at ``filename``.\n",
    "\n",
    "    The file is opened in text mode with the default encoding, and is\n",
    "    closed before the content is handed back to the caller.\n",
    "    \"\"\"\n",
    "    with open(filename, 'r') as source_file:\n",
    "        return source_file.read()\n",
    "\n",
    "def replace_with_newline(match):\n",
    "    \"\"\"Regex substitution callback that swaps the first matched character for a newline.\n",
    "\n",
    "    Args:\n",
    "        match: A regex match object; its full matched text is read via ``group(0)``.\n",
    "\n",
    "    Returns:\n",
    "        A newline followed by the matched text without its first character.\n",
    "    \"\"\"\n",
    "    matched_text = match.group(0)\n",
    "    return f\"\\n{matched_text[1:]}\"\n",
    "    \n",
    "\n",
    "name = \"SPX_long\"\n",
    "\n",
    "html_doc = load_html(\"data/\"+name+\".html\")\n",
    "\n",
    "soup = BeautifulSoup(html_doc, 'html.parser')\n",
    "\n",
    "stock_data = soup.find_all(\"table\")\n",
    "\n",
    "# CSV header; every kept row must have the same number of comma-separated fields.\n",
    "label = \"Month,Day,Year,Open,High,Low,Close,Adj Close,Volume\"\n",
    "expected_commas = label.count(\",\")\n",
    "\n",
    "# Gather rows from every table, so rows found in one table are not discarded\n",
    "# when a later table is processed.\n",
    "csv_rows = []\n",
    "for stock in stock_data:\n",
    "    # Drop the commas already in the text, collapse every run of spaces to a\n",
    "    # single space, then turn the remaining spaces into field separators.\n",
    "    value = re.sub(r\" {2,}\", \" \", stock.text.replace(',', '')).replace(' ', ',')\n",
    "\n",
    "    # Start a new line before each \",Xxx\" token (a comma followed by one capital\n",
    "    # and two lowercase letters).\n",
    "    pattern = r\",[A-Z][a-z]{2}\"\n",
    "    split_text = re.sub(pattern, replace_with_newline, value)\n",
    "\n",
    "    # Keep only lines with exactly 8 commas, i.e. 9 fields matching the header.\n",
    "    csv_rows.extend(\n",
    "        line for line in split_text.splitlines() if line.count(\",\") == expected_commas\n",
    "    )\n",
    "\n",
    "# Fail here with a clear message instead of writing a header-only CSV later.\n",
    "if not csv_rows:\n",
    "    raise ValueError(\"no 9-field rows found in any table of data/\" + name + \".html\")\n",
    "\n",
    "new_string = \"\\n\".join(csv_rows)\n",
    "\n",
    "data = label + \"\\n\" + new_string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b283eb21-d682-4b13-8921-18c2aa6e47a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the CSV text built in the previous cell straight from memory, so the\n",
    "# output file is written only once, in its final form.\n",
    "prices_raw = pd.read_csv(io.StringIO(data))\n",
    "\n",
    "# Join the Month/Day/Year columns into one date string per row and convert the\n",
    "# whole column at once instead of looping row by row.\n",
    "date_text = (\n",
    "    prices_raw[\"Month\"].astype(str)\n",
    "    + \" \" + prices_raw[\"Day\"].astype(str)\n",
    "    + \" \" + prices_raw[\"Year\"].astype(str)\n",
    ")\n",
    "# NOTE(review): '%y' writes a two-digit year (e.g. 00-09-29). Kept unchanged in\n",
    "# case other code reads this format; '%Y' would be unambiguous.\n",
    "item = pd.to_datetime(date_text, format='%b %d %Y').dt.strftime('%y-%m-%d')\n",
    "\n",
    "# Add the Date column, reverse the row order, then drop the three date-part\n",
    "# columns that Date replaces.\n",
    "df = (\n",
    "    prices_raw\n",
    "    .assign(Date=item)\n",
    "    .iloc[::-1]\n",
    "    .drop(columns=['Month', 'Day', 'Year'])\n",
    ")\n",
    "\n",
    "df.to_csv(\"data/\"+name+'.csv', index=False)\n",
    "\n",
    "# Reload from disk so df gets a fresh 0..n-1 index and matches the saved file.\n",
    "df = pd.read_csv(\"data/\"+name+'.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "63877830-a456-4bad-bdef-cc704a7c677e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method NDFrame.head of          Open     High      Low    Close  Adj Close      Volume      Date\n",
       "0     1458.29  1458.29  1436.29  1436.51    1436.51  1197100000  00-09-29\n",
       "1     1436.52  1445.60  1429.83  1436.23    1436.23  1051200000  00-10-02\n",
       "2     1436.23  1454.82  1425.28  1426.46    1426.46  1098100000  00-10-03\n",
       "3     1426.46  1439.99  1416.31  1434.32    1434.32  1167400000  00-10-04\n",
       "4     1434.32  1444.17  1431.80  1436.28    1436.28  1176100000  00-10-05\n",
       "...       ...      ...      ...      ...        ...         ...       ...\n",
       "6025  5603.34  5636.27  5601.65  5626.02    5626.02  3500790000  24-09-13\n",
       "6026  5615.21  5636.05  5604.53  5633.09    5633.09  3437070000  24-09-16\n",
       "6027  5655.51  5670.81  5614.05  5634.58    5634.58  3443600000  24-09-17\n",
       "6028  5641.68  5689.75  5615.08  5618.26    5618.26  3691390000  24-09-18\n",
       "6029  5702.63  5733.57  5686.42  5713.64    5713.64  4024530000  24-09-19\n",
       "\n",
       "[6030 rows x 7 columns]>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}