139 lines
4.3 KiB
Plaintext
139 lines
4.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "2852bf96-652d-498f-9f82-5064859706fd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from requests import get\n",
|
|
"import pandas as pd\n",
|
|
"from datetime import datetime\n",
|
|
"import re\n",
|
|
"\n",
|
|
"\n",
|
|
"def load_html(filename):\n",
|
|
" with open(filename, 'r') as f:\n",
|
|
" html_doc = f.read()\n",
|
|
" return html_doc\n",
|
|
"\n",
|
|
"def replace_with_newline(match):\n",
|
|
" return \"\\n\" + match.group(0)[1:]\n",
|
|
" \n",
|
|
"\n",
|
|
"name = \"SPX_long\"\n",
|
|
"\n",
|
|
"html_doc = load_html(\"data/\"+name+\".html\")\n",
|
|
"\n",
|
|
"soup = BeautifulSoup(html_doc, 'html.parser')\n",
|
|
"\n",
|
|
"stock_data = soup.find_all(\"table\")\n",
|
|
"\n",
|
|
"for stock in stock_data:\n",
|
|
" value = stock.text.replace(',','').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ',')\n",
|
|
" pattern = r\",[A-Z][a-z]{2}\"\n",
|
|
" new_string = re.sub(pattern, replace_with_newline, value)\n",
|
|
"\n",
|
|
" # Remove all llines that do not have 9 commas\n",
|
|
" lines = new_string.splitlines()\n",
|
|
" filtered_lines = [line for line in lines if line.count(\",\") == 8]\n",
|
|
" new_string = \"\\n\".join(filtered_lines)\n",
|
|
" \n",
|
|
" label = \"Month,Day,Year,Open,High,Low,Close,Adj Close,Volume\"\n",
|
|
"\n",
|
|
" data = label + \"\\n\" + new_string"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "b283eb21-d682-4b13-8921-18c2aa6e47a1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"data/\"+name+'.csv', 'w') as f:\n",
|
|
" f.write(data)\n",
|
|
"\n",
|
|
"# Read the CSV file into a DataFrame\n",
|
|
"df = pd.read_csv(\"data/\"+name+'.csv')\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"item = []\n",
|
|
"\n",
|
|
"for i in range(len(df[\"Day\"])):\n",
|
|
" value = str(df[\"Month\"].iloc[i]) + \" \" + str(df[\"Day\"].iloc[i]) + \" \" + str(df[\"Year\"].iloc[i])\n",
|
|
" item.append(datetime.strptime(value, '%b %d %Y').strftime('%y-%m-%d'))\n",
|
|
" \n",
|
|
"\n",
|
|
"df[\"Date\"] = item\n",
|
|
"\n",
|
|
"df = df.iloc[::-1]\n",
|
|
"df = df.drop(columns=['Month','Day','Year'])\n",
|
|
"\n",
|
|
"\n",
|
|
"df.to_csv(\"data/\"+name+'.csv', index=False)\n",
|
|
"\n",
|
|
"df = pd.read_csv(\"data/\"+name+'.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "63877830-a456-4bad-bdef-cc704a7c677e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<bound method NDFrame.head of Open High Low Close Adj Close Volume Date\n",
|
|
"0 1458.29 1458.29 1436.29 1436.51 1436.51 1197100000 00-09-29\n",
|
|
"1 1436.52 1445.60 1429.83 1436.23 1436.23 1051200000 00-10-02\n",
|
|
"2 1436.23 1454.82 1425.28 1426.46 1426.46 1098100000 00-10-03\n",
|
|
"3 1426.46 1439.99 1416.31 1434.32 1434.32 1167400000 00-10-04\n",
|
|
"4 1434.32 1444.17 1431.80 1436.28 1436.28 1176100000 00-10-05\n",
|
|
"... ... ... ... ... ... ... ...\n",
|
|
"6025 5603.34 5636.27 5601.65 5626.02 5626.02 3500790000 24-09-13\n",
|
|
"6026 5615.21 5636.05 5604.53 5633.09 5633.09 3437070000 24-09-16\n",
|
|
"6027 5655.51 5670.81 5614.05 5634.58 5634.58 3443600000 24-09-17\n",
|
|
"6028 5641.68 5689.75 5615.08 5618.26 5618.26 3691390000 24-09-18\n",
|
|
"6029 5702.63 5733.57 5686.42 5713.64 5713.64 4024530000 24-09-19\n",
|
|
"\n",
|
|
"[6030 rows x 7 columns]>"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.head"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.14"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|