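# Dependencies (install from a shell before running this script):
#   pip install scikit-learn   # the PyPI package is named "scikit-learn"; "sklearn" is a deprecated alias
#   pip install pandas
#   pip install numpy
#   pip install yfinance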
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import yfinance as yf
import math
from datetime import date, timedelta
# Select the stock to work with (ticker symbol NFLX).
data = yf.Ticker('NFLX')
# Today's date; it is used further down to fetch the most recent prices.
today = date.today()
# Download the price history between a fixed start date and end date.
# Only start/end are passed: yfinance documents `period` and the start/end
# pair as alternatives, so additionally passing period="max" was contradictory.
df = data.history(start="2015-01-01", end="2020-12-01")
# Peek at the first five rows of the collected data
# (the result is only displayed when run interactively, e.g. in a notebook).
df.head()
# The goal is to predict the closing price n days after a given day.
n = 1
# With n = 1, the data of day m is used to predict the close of day m + 1.
# Work on a plain list of the closing prices.
close_actual = df["Close"].copy()
close = list(close_actual)
# Target values: the closing price n days later (the first n entries are dropped).
close_n_days = close[n:]
# Drop the last n rows of the feature data, since no close n days ahead is
# known for them. Placed side by side, the target in row m is then the close
# of day m + n.
# The explicit .copy() makes df an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
df = df[:len(df) - n].copy()
df["Close in n days"] = close_n_days
# The first p percent of the rows form the training data
p = 90
df_percentage = int((len(close_n_days) * p) / 100)
# One boolean flag per row: True for the leading training rows,
# False for the remaining (testing) rows
remaining_rows = len(close_n_days) - df_percentage
training = [True] * df_percentage + [False] * remaining_rows
df['Training Set'] = training
# Look at the rows around the boundary between training and testing data
df[(df_percentage - 2):(df_percentage + 3)]
# Separate the rows into a training dataframe and a testing dataframe,
# based on the boolean 'Training Set' column
training_flag = df['Training Set']
train = df[training_flag.eq(True)]
test = df[training_flag.eq(False)]
# Report the size of each part
print('Number of rows in the training data: ', len(train))
print('Number of rows in the testing data: ', len(test))
# Features used to predict the closing price in n days.
# The columns are named explicitly instead of being taken by position
# (df.columns[:4]), so the selection does not depend on the column order
# of the downloaded data.
features = ['Open', 'High', 'Low', 'Close']
X = train[features]
y = train['Close in n days']
# Random forest regressor with 50 trees, fitted on the training rows only.
regr = RandomForestRegressor(n_estimators=50)
regr.fit(X, y)
# The coefficient of determination (R squared) is used to see how accurate the predictions are. The closer R squared is to 1, the better the predictions.
# R squared of the fitted model on X and y
# (the value is only displayed when run interactively)
regr.score(X, y)
# Run the fitted model on the feature columns of the testing dataframe
test_inputs = test[features]
preds = regr.predict(test_inputs)
print('First five test values: ', preds[:5])
# The real values below can be compared with the predicted values printed above
test['Close in n days'].head()
# Start of the lookup window: two days before today
# (the variable keeps its original name `yesterday`).
yesterday = today - timedelta(days=2)
# Fetch the most recent prices for the window from `yesterday` to `today`.
# `period` is omitted because start/end already define the range.
# NOTE(review): yfinance appears to treat `end` as exclusive - confirm.
pred_data = data.history(start=yesterday, end=today)
if len(pred_data) < 2:
    # Two rows are needed: one to predict from and one to compare against.
    # Non-trading days presumably yield no rows, so the window can come back
    # with fewer; report that instead of failing with an obscure error.
    print('Not enough recent trading data to make and check a prediction.')
else:
    # The first row of the window provides the model input ...
    pred_data_y = pred_data[:1]
    # ... restricted to the same features the model was trained on.
    X = pred_data_y[features]
    # Predict the closing price n days after that first row.
    preds = regr.predict(X)
    print('Today the predicted closing value is: ', preds[0])
    print("Today's actual closing value was: ")
    # The remaining row(s) hold the actual close to compare against.
    pred_data_t = pred_data[1:]
    # .iloc selects by position; a plain [0] on a date-indexed Series relies
    # on the deprecated positional fallback of Series.__getitem__.
    print(pred_data_t["Close"].iloc[0])