AI-Stock-Predictor/Scrape Stock Data.ipynb

139 lines
4.3 KiB
Plaintext
Raw Normal View History

2024-10-08 01:24:55 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2852bf96-652d-498f-9f82-5064859706fd",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from requests import get\n",
"import pandas as pd\n",
"from datetime import datetime\n",
"import re\n",
"\n",
"\n",
"def load_html(filename, encoding=\"utf-8\"):\n",
"    \"\"\"Read a saved HTML page from disk and return its contents as one string.\n",
"\n",
"    filename: path of the file to read.\n",
"    encoding: text encoding used to decode the file. Defaults to UTF-8 so the\n",
"        result does not depend on the platform's locale encoding; pass another\n",
"        value if the page was saved differently.\n",
"\n",
"    Raises OSError if the file cannot be opened and UnicodeDecodeError if its\n",
"    bytes are not valid in `encoding`.\n",
"    \"\"\"\n",
"    with open(filename, \"r\", encoding=encoding) as f:\n",
"        return f.read()\n",
"\n",
"def replace_with_newline(match):\n",
"    \"\"\"Return the matched text with its first character swapped for a newline.\n",
"\n",
"    Written as a replacement callback for `re.sub`: `match` only needs to\n",
"    provide `group(0)`, the full matched string.\n",
"    \"\"\"\n",
"    matched_text = match.group(0)\n",
"    return \"\\n\" + matched_text[1:]\n",
" \n",
"\n",
"name = \"SPX_long\"\n",
"\n",
"html_doc = load_html(\"data/\" + name + \".html\")\n",
"soup = BeautifulSoup(html_doc, \"html.parser\")\n",
"stock_data = soup.find_all(\"table\")\n",
"\n",
"# Fail here with a clear message rather than with a NameError on `data` in a\n",
"# later cell when the page contains no table at all.\n",
"if not stock_data:\n",
"    raise ValueError(\"no table element found in data/\" + name + \".html\")\n",
"\n",
"label = \"Month,Day,Year,Open,High,Low,Close,Adj Close,Volume\"\n",
"# A complete row has exactly as many commas as the header line (8 here).\n",
"expected_commas = label.count(\",\")\n",
"# A comma followed by one capital and two lowercase letters starts a new row\n",
"# (presumably the month abbreviation that opens each date -- confirm).\n",
"pattern = r\",[A-Z][a-z]{2}\"\n",
"\n",
"# `data` is reassigned for every table, so the last table on the page wins.\n",
"for stock in stock_data:\n",
"    # Strip commas already present in the text (they would clash with the CSV\n",
"    # delimiter), then turn every run of spaces into a single comma.\n",
"    # NOTE(review): this replaces a chain of repeated space-for-space\n",
"    # .replace() calls that looks like a bounded space-collapse -- confirm.\n",
"    value = re.sub(r\" +\", \",\", stock.text.replace(\",\", \"\"))\n",
"    new_string = re.sub(pattern, replace_with_newline, value)\n",
"\n",
"    # Keep only the lines that have exactly as many commas as a full row.\n",
"    lines = new_string.splitlines()\n",
"    filtered_lines = [line for line in lines if line.count(\",\") == expected_commas]\n",
"\n",
"    data = label + \"\\n\" + \"\\n\".join(filtered_lines)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b283eb21-d682-4b13-8921-18c2aa6e47a1",
"metadata": {},
"outputs": [],
"source": [
"csv_path = \"data/\" + name + \".csv\"\n",
"\n",
"# Stage the scraped text on disk so pandas can parse it as a CSV file.\n",
"with open(csv_path, \"w\") as f:\n",
"    f.write(data)\n",
"\n",
"raw_df = pd.read_csv(csv_path)\n",
"\n",
"# Join the three date parts into one \"Mon D YYYY\" string per row and convert\n",
"# the whole column in a single vectorised call instead of a Python loop.\n",
"date_text = (\n",
"    raw_df[\"Month\"].astype(str)\n",
"    + \" \"\n",
"    + raw_df[\"Day\"].astype(str)\n",
"    + \" \"\n",
"    + raw_df[\"Year\"].astype(str)\n",
")\n",
"# NOTE: '%y' writes a two-digit year; it is kept because it is the format this\n",
"# notebook has always written to the CSV.\n",
"dates = pd.to_datetime(date_text, format=\"%b %d %Y\").dt.strftime(\"%y-%m-%d\")\n",
"\n",
"# Add the combined date, reverse the row order and drop the separate parts.\n",
"df = (\n",
"    raw_df.assign(Date=dates)\n",
"    .iloc[::-1]\n",
"    .drop(columns=[\"Month\", \"Day\", \"Year\"])\n",
")\n",
"\n",
"# Overwrite the staged file with the final table.\n",
"df.to_csv(csv_path, index=False)\n",
"\n",
"# Reload from disk so `df` mirrors the saved file and gets a fresh 0..n-1 index.\n",
"df = pd.read_csv(csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "63877830-a456-4bad-bdef-cc704a7c677e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method NDFrame.head of Open High Low Close Adj Close Volume Date\n",
"0 1458.29 1458.29 1436.29 1436.51 1436.51 1197100000 00-09-29\n",
"1 1436.52 1445.60 1429.83 1436.23 1436.23 1051200000 00-10-02\n",
"2 1436.23 1454.82 1425.28 1426.46 1426.46 1098100000 00-10-03\n",
"3 1426.46 1439.99 1416.31 1434.32 1434.32 1167400000 00-10-04\n",
"4 1434.32 1444.17 1431.80 1436.28 1436.28 1176100000 00-10-05\n",
"... ... ... ... ... ... ... ...\n",
"6025 5603.34 5636.27 5601.65 5626.02 5626.02 3500790000 24-09-13\n",
"6026 5615.21 5636.05 5604.53 5633.09 5633.09 3437070000 24-09-16\n",
"6027 5655.51 5670.81 5614.05 5634.58 5634.58 3443600000 24-09-17\n",
"6028 5641.68 5689.75 5615.08 5618.26 5618.26 3691390000 24-09-18\n",
"6029 5702.63 5733.57 5686.42 5713.64 5713.64 4024530000 24-09-19\n",
"\n",
"[6030 rows x 7 columns]>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}