AI-Stock-Predictor/Scrape Stock Data.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2852bf96-652d-498f-9f82-5064859706fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "from requests import get\n",
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import re\n",
    "\n",
    "\n",
    "def load_html(filename, encoding=\"utf-8\"):\n",
    "    \"\"\"Return the full text of an HTML file.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the file to read.\n",
    "        encoding: Text encoding used to decode the file. Defaults to UTF-8 so\n",
    "            the result does not depend on the platform's locale encoding.\n",
    "\n",
    "    Returns:\n",
    "        The file contents as a single string.\n",
    "    \"\"\"\n",
    "    with open(filename, 'r', encoding=encoding) as f:\n",
    "        html_doc = f.read()\n",
    "    return html_doc\n",
    "\n",
    "def replace_with_newline(match):\n",
    "    \"\"\"Regex replacement callback: swap the match's first character for a newline.\n",
    "\n",
    "    Args:\n",
    "        match: The re.Match object handed over by re.sub.\n",
    "\n",
    "    Returns:\n",
    "        A newline followed by the matched text minus its first character.\n",
    "    \"\"\"\n",
    "    matched_text = match.group(0)\n",
    "    remainder = matched_text[1:]\n",
    "    return \"\\n\" + remainder\n",
    "    \n",
    "\n",
    "name = \"SPX_long\"\n",
    "\n",
    "html_doc = load_html(\"data/\"+name+\".html\")\n",
    "\n",
    "soup = BeautifulSoup(html_doc, 'html.parser')\n",
    "\n",
    "stock_data = soup.find_all(\"table\")\n",
    "\n",
    "# Fail early with a clear message; otherwise `data` is never assigned and the\n",
    "# next cell dies with a NameError.\n",
    "if not stock_data:\n",
    "    raise ValueError(\"No <table> element found in data/\"+name+\".html\")\n",
    "\n",
    "# Loop-invariant values, hoisted out of the loop.\n",
    "# A comma followed by a capitalised three-letter token (e.g. ',Sep') is treated\n",
    "# as the start of a new row.\n",
    "pattern = r\",[A-Z][a-z]{2}\"\n",
    "label = \"Month,Day,Year,Open,High,Low,Close,Adj Close,Volume\"\n",
    "\n",
    "# NOTE(review): every table overwrites `data`, so only the last table's rows are\n",
    "# kept -- confirm whether pages with several tables should be merged instead.\n",
    "for stock in stock_data:\n",
    "    # Drop thousands separators, collapse every run of spaces to a single space,\n",
    "    # then turn the remaining spaces into field separators.\n",
    "    value = re.sub(r\" {2,}\", \" \", stock.text.replace(',', '')).replace(' ', ',')\n",
    "    new_string = re.sub(pattern, replace_with_newline, value)\n",
    "\n",
    "    # Keep only complete rows: 9 fields, i.e. exactly 8 commas\n",
    "    lines = new_string.splitlines()\n",
    "    filtered_lines = [line for line in lines if line.count(\",\") == 8]\n",
    "    new_string = \"\\n\".join(filtered_lines)\n",
    "\n",
    "    data = label + \"\\n\" + new_string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b283eb21-d682-4b13-8921-18c2aa6e47a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = \"data/\"+name+'.csv'\n",
    "\n",
    "# Persist the raw scraped rows; UTF-8 matches what pd.read_csv expects by default.\n",
    "with open(csv_path, 'w', encoding='utf-8') as f:\n",
    "    f.write(data)\n",
    "\n",
    "# Read the CSV file into a DataFrame\n",
    "df = pd.read_csv(csv_path)\n",
    "\n",
    "# Build the Date column in one vectorized pass instead of calling strptime per row.\n",
    "date_text = df[\"Month\"].astype(str) + \" \" + df[\"Day\"].astype(str) + \" \" + df[\"Year\"].astype(str)\n",
    "# NOTE(review): '%y' writes a two-digit year (e.g. 00-09-29), kept as-is so existing\n",
    "# readers of this CSV keep working -- confirm whether '%Y' would be safer.\n",
    "df[\"Date\"] = pd.to_datetime(date_text, format='%b %d %Y').dt.strftime('%y-%m-%d')\n",
    "\n",
    "# Reverse the row order and drop the date parts now folded into Date.\n",
    "df = df.iloc[::-1]\n",
    "df = df.drop(columns=['Month','Day','Year'])\n",
    "\n",
    "df.to_csv(csv_path, index=False)\n",
    "\n",
    "# Reload so df carries a fresh 0..n-1 index and mirrors the file on disk.\n",
    "df = pd.read_csv(csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "63877830-a456-4bad-bdef-cc704a7c677e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method NDFrame.head of          Open     High      Low    Close  Adj Close      Volume      Date\n",
       "0     1458.29  1458.29  1436.29  1436.51    1436.51  1197100000  00-09-29\n",
       "1     1436.52  1445.60  1429.83  1436.23    1436.23  1051200000  00-10-02\n",
       "2     1436.23  1454.82  1425.28  1426.46    1426.46  1098100000  00-10-03\n",
       "3     1426.46  1439.99  1416.31  1434.32    1434.32  1167400000  00-10-04\n",
       "4     1434.32  1444.17  1431.80  1436.28    1436.28  1176100000  00-10-05\n",
       "...       ...      ...      ...      ...        ...         ...       ...\n",
       "6025  5603.34  5636.27  5601.65  5626.02    5626.02  3500790000  24-09-13\n",
       "6026  5615.21  5636.05  5604.53  5633.09    5633.09  3437070000  24-09-16\n",
       "6027  5655.51  5670.81  5614.05  5634.58    5634.58  3443600000  24-09-17\n",
       "6028  5641.68  5689.75  5615.08  5618.26    5618.26  3691390000  24-09-18\n",
       "6029  5702.63  5733.57  5686.42  5713.64    5713.64  4024530000  24-09-19\n",
       "\n",
       "[6030 rows x 7 columns]>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows; head must be called -- a bare `df.head` only shows the bound-method repr.\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
added code 2024-10-08 01:24:55 +00:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"id": "2852bf96-652d-498f-9f82-5064859706fd",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from bs4 import BeautifulSoup\n",`
			`"from requests import get\n",`
			`"import pandas as pd\n",`
			`"from datetime import datetime\n",`
			`"import re\n",`
			`"\n",`
			`"\n",`
			`"def load_html(filename):\n",`
			`" with open(filename, 'r') as f:\n",`
			`" html_doc = f.read()\n",`
			`" return html_doc\n",`
			`"\n",`
			`"def replace_with_newline(match):\n",`
			`" return \"\\n\" + match.group(0)[1:]\n",`
			`" \n",`
			`"\n",`
			`"name = \"SPX_long\"\n",`
			`"\n",`
			`"html_doc = load_html(\"data/\"+name+\".html\")\n",`
			`"\n",`
			`"soup = BeautifulSoup(html_doc, 'html.parser')\n",`
			`"\n",`
			`"stock_data = soup.find_all(\"table\")\n",`
			`"\n",`
			`"for stock in stock_data:\n",`
			`" value = stock.text.replace(',','').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ',')\n",`
			`" pattern = r\",[A-Z][a-z]{2}\"\n",`
			`" new_string = re.sub(pattern, replace_with_newline, value)\n",`
			`"\n",`
			`" # Remove all llines that do not have 9 commas\n",`
			`" lines = new_string.splitlines()\n",`
			`" filtered_lines = [line for line in lines if line.count(\",\") == 8]\n",`
			`" new_string = \"\\n\".join(filtered_lines)\n",`
			`" \n",`
			`" label = \"Month,Day,Year,Open,High,Low,Close,Adj Close,Volume\"\n",`
			`"\n",`
			`" data = label + \"\\n\" + new_string"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"id": "b283eb21-d682-4b13-8921-18c2aa6e47a1",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open(\"data/\"+name+'.csv', 'w') as f:\n",`
			`" f.write(data)\n",`
			`"\n",`
			`"# Read the CSV file into a DataFrame\n",`
			`"df = pd.read_csv(\"data/\"+name+'.csv')\n",`
			`"\n",`
			`"\n",`
			`"\n",`
			`"item = []\n",`
			`"\n",`
			`"for i in range(len(df[\"Day\"])):\n",`
			`" value = str(df[\"Month\"].iloc[i]) + \" \" + str(df[\"Day\"].iloc[i]) + \" \" + str(df[\"Year\"].iloc[i])\n",`
			`" item.append(datetime.strptime(value, '%b %d %Y').strftime('%y-%m-%d'))\n",`
			`" \n",`
			`"\n",`
			`"df[\"Date\"] = item\n",`
			`"\n",`
			`"df = df.iloc[::-1]\n",`
			`"df = df.drop(columns=['Month','Day','Year'])\n",`
			`"\n",`
			`"\n",`
			`"df.to_csv(\"data/\"+name+'.csv', index=False)\n",`
			`"\n",`
			`"df = pd.read_csv(\"data/\"+name+'.csv')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"id": "63877830-a456-4bad-bdef-cc704a7c677e",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"<bound method NDFrame.head of Open High Low Close Adj Close Volume Date\n",`
			`"0 1458.29 1458.29 1436.29 1436.51 1436.51 1197100000 00-09-29\n",`
			`"1 1436.52 1445.60 1429.83 1436.23 1436.23 1051200000 00-10-02\n",`
			`"2 1436.23 1454.82 1425.28 1426.46 1426.46 1098100000 00-10-03\n",`
			`"3 1426.46 1439.99 1416.31 1434.32 1434.32 1167400000 00-10-04\n",`
			`"4 1434.32 1444.17 1431.80 1436.28 1436.28 1176100000 00-10-05\n",`
			`"... ... ... ... ... ... ... ...\n",`
			`"6025 5603.34 5636.27 5601.65 5626.02 5626.02 3500790000 24-09-13\n",`
			`"6026 5615.21 5636.05 5604.53 5633.09 5633.09 3437070000 24-09-16\n",`
			`"6027 5655.51 5670.81 5614.05 5634.58 5634.58 3443600000 24-09-17\n",`
			`"6028 5641.68 5689.75 5615.08 5618.26 5618.26 3691390000 24-09-18\n",`
			`"6029 5702.63 5733.57 5686.42 5713.64 5713.64 4024530000 24-09-19\n",`
			`"\n",`
			`"[6030 rows x 7 columns]>"`
			`]`
			`},`
			`"execution_count": 3,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"df.head"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.10.14"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`