pip install pandas
pip install pandas-datareader
First, we need to import some modules.
import numpy as np
import pandas as pd
from pandas_datareader import data
import matplotlib.pyplot as plt
import random
Now, we define a function which sets the company we want to look at and which has as parameters the starting and the ending dates.
def get_data(start_date, end_date, symbol='NFLX'):
    """Download daily stock data from Yahoo Finance via pandas-datareader.

    Parameters
    ----------
    start_date, end_date : str
        Date bounds in 'YYYY-MM-DD' form, passed straight through to
        ``data.get_data_yahoo``.
    symbol : str, optional
        Ticker to fetch. Defaults to 'NFLX' so existing two-argument
        callers behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Daily OHLCV frame indexed by date, as returned by pandas-datareader.
    """
    # The original assigned start_date/end_date to themselves and created an
    # unused ``data_source`` variable; both were dead code and are dropped.
    return data.get_data_yahoo(symbol, start_date, end_date)
Now, we set the starting and ending dates. Since we will define the mini-batches such that every mini-batch contains 16 datapoints, we need to take care that the number of stock prices during that period is divisible by 16. If the number of stock prices is not divisible by 16, we eliminate a few datapoints so that the number of datapoints is divisible by 16. Moreover we convert a NumPy array to a list and do some computations such that we end up with a list of the y-values and a list of the x-values.
# Download roughly one year of NFLX prices and keep the adjusted close column.
df = get_data('2020-01-01','2020-12-01')
ar = np.array(df['Adj Close'])
l1 = ar.tolist()
# Trim the series so its length is an exact multiple of the mini-batch size
# (16).  The original condition ``len(l1) // 16 != 0`` asked "are there at
# least 16 points?" and only trimmed correctly by accident; the right
# question is whether the length leaves a non-zero remainder.  Slicing off
# the tail replaces the original O(n) rebuild comprehension.  As before,
# series shorter than 16 points are left untouched (they simply yield no
# mini-batches later on).
r = len(l1) % 16
if len(l1) >= 16 and r != 0:
    l1 = l1[:-r]
# Map each adjusted-close price to its position (trading-day index) in the
# trimmed series.  NOTE(review): if two days close at exactly the same
# price, the later day silently overwrites the earlier one and a datapoint
# is lost — keying on the price is fragile; confirm prices are unique.
dic = {price: day for day, price in enumerate(l1)}
# Unzip the mapping into regression targets (prices) and inputs (day
# indices); dicts preserve insertion order, so the two lists stay aligned.
Y = list(dic.keys())
X = list(dic.values())
Choosing the starting parameters a and b randomly, as well as the learning rate alpha.
# Random initial guesses for the line y = a*x + b, plus a fixed step size
# for gradient descent.
a = random.randrange(25)    # initial slope, drawn from [0, 25)
b = random.randrange(500)   # initial intercept, drawn from [0, 500)
alpha = 1e-5                # gradient-descent learning rate
print("a = ", a, "b = ", b)
Before constructing the mini-batches, we shuffle the dataset so that the training examples of each mini-batch are randomly chosen. Since our dataset is not very large, we divide it into mini-batches of 16 elements each.
# Shuffle the targets, then cut the shuffled series into mini-batches of 16.
# BUG FIX: the original ``while`` loop incremented ``j`` *before* slicing,
# so it always skipped the first 16 shuffled samples and produced one batch
# fewer than it should.  Since len(Y) was trimmed to a multiple of 16 above,
# stepped slicing covers every sample exactly once.
random.shuffle(Y)
Yi = [Y[k:k + 16] for k in range(0, len(Y), 16)]
# Recover each sample's x value (trading-day index) through the
# price -> index map, so every (x, y) pair survives the shuffle together.
Xi = [[dic[price] for price in batch] for batch in Yi]
Now we calculate for each mini-batch the average of all gradients and update the parameters with this mean gradient. This process is repeated 50000 times, in order to find the best matching parameters a and b.
# Mini-batch gradient descent on the squared error of y = a*x + b:
# for each batch, average the gradients w.r.t. a and b, then step against
# them scaled by the learning rate.
for _ in range(50000):  # number of full passes over the mini-batches
    for xs, ys in zip(Xi, Yi):
        d_a = 0
        d_b = 0
        # Predictions of the current line for this batch.
        Y_pred = [a * x + b for x in xs]
        # Accumulate the per-sample gradient contributions.
        for x_val, pred, y_val in zip(xs, Y_pred, ys):
            d_a += x_val * (pred - y_val)
            d_b += pred - y_val
        # Mean gradient of the squared error (factor 2 from the derivative).
        d_a *= 2 / len(xs)
        d_b *= 2 / len(xs)
        a = a - alpha * d_a
        b = b - alpha * d_b
print(a, b)
print("Best matching linear line after 50000 iterations and learning rate 0,00001: y = {0:.2f}x + {1:.2f}".format(a, b))
Now we associate our linear equation with the different dates of the year (the independent variable) in order to be able to plot it.
# Evaluate the fitted line y = a*x + b at every trading-day index so it can
# be drawn on top of the price series.
x = np.arange(len(df.index))
lx = x.tolist()
eq = [a * day + b for day in lx]
Finally we can plot our data with the appropriate linear regression line.
# Plot the adjusted-close prices as points together with the fitted line.
fig, ax = plt.subplots(figsize=(12, 6))
# ``Axes.plot_date`` was deprecated in matplotlib 3.5 and removed in 3.9;
# a plain ``plot`` with a circle marker draws the same point cloud.  The x
# values here are integer trading-day indices, not real dates, so
# ``plot_date``'s date handling brought nothing anyway.
ax.plot(x, df['Adj Close'], 'o')
fig.suptitle('NFLX')
ax.set_xlabel('Date')
ax.set_ylabel("Adj Close")
ax.plot(x, eq)
# Label the x axis with the actual dates, one tick every 14 trading days.
xt = np.arange(0, len(df.index), step=14)
xl = df.index[xt].date
ax.set_xticks(xt, minor=False)
ax.set_xticklabels(xl, minor=False, rotation=45)
plt.show()