AI-Stock-Predictor/Scrape Stock Data.ipynb
2024-10-07 21:24:55 -04:00

4.3 KiB

None <html lang="en"> <head> </head>
In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from datetime import datetime
import re


def load_html(filename):
    with open(filename, 'r') as f:
        html_doc = f.read()
    return html_doc

def replace_with_newline(match):
    return "\n" + match.group(0)[1:]
    

name = "SPX_long"

html_doc = load_html("data/"+name+".html")

soup = BeautifulSoup(html_doc, 'html.parser')

stock_data = soup.find_all("table")

for stock in stock_data:
    value = stock.text.replace(',','').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ').replace(' ', ',')
    pattern = r",[A-Z][a-z]{2}"
    new_string = re.sub(pattern, replace_with_newline, value)

    # Remove all llines that do not have 9 commas
    lines = new_string.splitlines()
    filtered_lines = [line for line in lines if line.count(",") == 8]
    new_string = "\n".join(filtered_lines)
    
    label = "Month,Day,Year,Open,High,Low,Close,Adj Close,Volume"

    data = label + "\n" + new_string
In [2]:
with open("data/"+name+'.csv', 'w') as f:
    f.write(data)

# Read the CSV file into a DataFrame
df = pd.read_csv("data/"+name+'.csv')



item = []

for i in range(len(df["Day"])):
    value = str(df["Month"].iloc[i]) + " " + str(df["Day"].iloc[i]) + " " + str(df["Year"].iloc[i])
    item.append(datetime.strptime(value, '%b %d %Y').strftime('%y-%m-%d'))
    

df["Date"] = item

df = df.iloc[::-1]
df = df.drop(columns=['Month','Day','Year'])


df.to_csv("data/"+name+'.csv', index=False)

df = pd.read_csv("data/"+name+'.csv')
In [3]:
df.head
Out[3]:
<bound method NDFrame.head of          Open     High      Low    Close  Adj Close      Volume      Date
0     1458.29  1458.29  1436.29  1436.51    1436.51  1197100000  00-09-29
1     1436.52  1445.60  1429.83  1436.23    1436.23  1051200000  00-10-02
2     1436.23  1454.82  1425.28  1426.46    1426.46  1098100000  00-10-03
3     1426.46  1439.99  1416.31  1434.32    1434.32  1167400000  00-10-04
4     1434.32  1444.17  1431.80  1436.28    1436.28  1176100000  00-10-05
...       ...      ...      ...      ...        ...         ...       ...
6025  5603.34  5636.27  5601.65  5626.02    5626.02  3500790000  24-09-13
6026  5615.21  5636.05  5604.53  5633.09    5633.09  3437070000  24-09-16
6027  5655.51  5670.81  5614.05  5634.58    5634.58  3443600000  24-09-17
6028  5641.68  5689.75  5615.08  5618.26    5618.26  3691390000  24-09-18
6029  5702.63  5733.57  5686.42  5713.64    5713.64  4024530000  24-09-19

[6030 rows x 7 columns]>
</html>