4.3 KiB
4.3 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
from bs4 import BeautifulSoup from requests import get import pandas as pd from datetime import datetime import re def load_html(filename): with open(filename, 'r') as f: html_doc = f.read() return html_doc def replace_with_newline(match): return "\n" + match.group(0)[1:] name = "SPX_long" html_doc = load_html("data/"+name+".html") soup = BeautifulSoup(html_doc, 'html.parser') stock_data = soup.find_all("table") for stock in stock_data: value = stock.text.replace(',','').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ' ').replace(' ', ',') pattern = r",[A-Z][a-z]{2}" new_string = re.sub(pattern, replace_with_newline, value) # Remove all llines that do not have 9 commas lines = new_string.splitlines() filtered_lines = [line for line in lines if line.count(",") == 8] new_string = "\n".join(filtered_lines) label = "Month,Day,Year,Open,High,Low,Close,Adj Close,Volume" data = label + "\n" + new_string
In [2]:
with open("data/"+name+'.csv', 'w') as f: f.write(data) # Read the CSV file into a DataFrame df = pd.read_csv("data/"+name+'.csv') item = [] for i in range(len(df["Day"])): value = str(df["Month"].iloc[i]) + " " + str(df["Day"].iloc[i]) + " " + str(df["Year"].iloc[i]) item.append(datetime.strptime(value, '%b %d %Y').strftime('%y-%m-%d')) df["Date"] = item df = df.iloc[::-1] df = df.drop(columns=['Month','Day','Year']) df.to_csv("data/"+name+'.csv', index=False) df = pd.read_csv("data/"+name+'.csv')
In [3]:
df.head
Out[3]:
<bound method NDFrame.head of Open High Low Close Adj Close Volume Date 0 1458.29 1458.29 1436.29 1436.51 1436.51 1197100000 00-09-29 1 1436.52 1445.60 1429.83 1436.23 1436.23 1051200000 00-10-02 2 1436.23 1454.82 1425.28 1426.46 1426.46 1098100000 00-10-03 3 1426.46 1439.99 1416.31 1434.32 1434.32 1167400000 00-10-04 4 1434.32 1444.17 1431.80 1436.28 1436.28 1176100000 00-10-05 ... ... ... ... ... ... ... ... 6025 5603.34 5636.27 5601.65 5626.02 5626.02 3500790000 24-09-13 6026 5615.21 5636.05 5604.53 5633.09 5633.09 3437070000 24-09-16 6027 5655.51 5670.81 5614.05 5634.58 5634.58 3443600000 24-09-17 6028 5641.68 5689.75 5615.08 5618.26 5618.26 3691390000 24-09-18 6029 5702.63 5733.57 5686.42 5713.64 5713.64 4024530000 24-09-19 [6030 rows x 7 columns]>