US Stock Market Time Series Analysis
Data as of March 31, 2024
The project encompasses three key components:
- Forecasting the future value of the US corporate profit margin after tax.
- Using the forecasted value to analyse the relationship between the market index and selected major economic indicators.
- Developing a model aimed at evaluating market sentiment dynamics.
This analysis aims to shed light on how the market index interacts with specific economic indicators, blending statistical analysis with fundamental economic principles.
Dependencies
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from itertools import product
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.vector_ar.var_model import VAR
import statsmodels.api as sm
from tabulate import tabulate
from tqdm import tqdm_notebook
from pmdarima import auto_arima
import warnings
warnings.filterwarnings('ignore')
current_directory = os.getcwd()
file_path = os.path.join(current_directory, 'cleaned_data.csv')
df = pd.read_csv(file_path, index_col = 0, parse_dates = True)
df.index.freq = 'QS-OCT'  # quarterly data; quarters start on 1 Jan/Apr/Jul/Oct
Profit Margin Time Series Prediction
The aim of this section is to employ various statistical models, including ARIMA, VAR, linear regression, and random forest, to forecast the profit margin value, and then to identify the most appropriate model for generating the forecast value used in the subsequent market analysis.
Set up
df.tail(5)
| date | market_index | gdp | interest_rate | cpi | profit_margin |
|---|---|---|---|---|---|
| 2023-01-01 | 199.225806 | 26813.601 | 0.045167 | 301.744 | 0.0869 |
| 2023-04-01 | 208.604032 | 27063.012 | 0.049900 | 304.003 | 0.0961 |
| 2023-07-01 | 222.245714 | 27610.128 | 0.052600 | 307.288 | 0.0977 |
| 2023-10-01 | 222.756984 | 27956.998 | 0.053300 | 308.742 | 0.1003 |
| 2024-01-01 | 250.156557 | 28284.498 | 0.053300 | 312.230 | NaN |
The table presented above indicates that the "profit_margin" field is missing the most recent value. Delays in reporting corporate profit margin after tax data often occur due to variances in companies' fiscal reporting periods. However, it is noteworthy that profit margins typically exhibit relatively stable trends over time, rendering them reasonably predictable.
n_forecast = 1 # number of period(s) forecasting ahead.
n_test = 4 # number of testing periods.
df_pm = df[df['profit_margin'].notna()][['profit_margin']]
df_pm
| date | profit_margin |
|---|---|
| 1971-01-01 | 0.0548 |
| 1971-04-01 | 0.0549 |
| 1971-07-01 | 0.0568 |
| 1971-10-01 | 0.0595 |
| 1972-01-01 | 0.0604 |
| ... | ... |
| 2022-10-01 | 0.0937 |
| 2023-01-01 | 0.0869 |
| 2023-04-01 | 0.0961 |
| 2023-07-01 | 0.0977 |
| 2023-10-01 | 0.1003 |

212 rows × 1 columns
df['profit_margin'].plot(figsize=(10,4))
plt.title('Profit Margin')
plt.ylabel('Value')
plt.xlabel('Date')
White noise
Upon visual inspection, it is apparent that there is a trend over time in the dataset. This suggests that the data does not adhere to the characteristics of white noise; consequently, there is potential for making predictions using time series analysis techniques.
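To supplement the visual check, a Ljung-Box test can be applied to the series; a minimal sketch using the df_pm frame defined above (a p-value near zero rejects the white-noise hypothesis):

from statsmodels.stats.diagnostic import acorr_ljungbox

# Null hypothesis: no autocorrelation up to the chosen lag, i.e. white noise.
lb = acorr_ljungbox(df_pm['profit_margin'], lags=[10], return_df=True)
print(lb)  # a tiny p-value supports forecastable structure in the series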
ARIMA
ARMA or ARIMA
Augmented Dickey-Fuller test
def ad_test(dataset):
    # Augmented Dickey-Fuller test; the null hypothesis is that the series has a unit root (non-stationary).
    dftest = adfuller(dataset, autolag='AIC')
    print("1. ADF : ", dftest[0])
    print("2. P-Value : ", dftest[1])
    print("3. Num Of Lags : ", dftest[2])
    print("4. Num Of Observations Used For ADF Regression:", dftest[3])
    print("5. Critical Values :")
    for key, val in dftest[4].items():
        print("\t", key, ": ", val)
ad_test(df_pm)
1. ADF :  -1.3788121097506558
2. P-Value :  0.5923906598883197
3. Num Of Lags :  0
4. Num Of Observations Used For ADF Regression: 211
5. Critical Values :
	 1% :  -3.46172743446274
	 5% :  -2.8753374677799957
	 10% :  -2.574124089081557
The p-value derived from the Augmented Dickey-Fuller (ADF) test surpasses the commonly accepted significance threshold of 5%. As a result, the null hypothesis, suggesting non-stationarity in the profit margin series, cannot be dismissed. Therefore, it is advisable to employ the Autoregressive Integrated Moving Average (ARIMA) model rather than the Autoregressive Moving Average (ARMA) model for conducting comprehensive time series analysis.
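As a quick confirmation that a single round of differencing suffices (i.e. d = 1), the same test can be re-run on the differenced series; a sketch reusing the ad_test helper above:

# ADF test on the first difference; a small p-value indicates the differenced
# series is stationary, supporting an ARIMA specification with d = 1.
ad_test(df_pm.diff().dropna())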
Order selection
stepwise_fit = auto_arima(df_pm, trace = True, suppress_warnings = True)
stepwise_fit.summary()
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-1727.323, Time=0.14 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-1735.323, Time=0.01 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-1733.467, Time=0.02 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-1733.478, Time=0.03 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-1736.686, Time=0.01 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-1731.339, Time=0.04 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]
Total fit time: 0.266 seconds
| Dep. Variable: | y | No. Observations: | 212 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 0) | Log Likelihood | 869.343 |
| Date: | Tue, 07 May 2024 | AIC | -1736.686 |
| Time: | 21:40:20 | BIC | -1733.334 |
| Sample: | 01-01-1971 - 10-01-2023 | HQIC | -1735.331 |
| Covariance Type: | opg | | |

| | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| sigma2 | 1.541e-05 | 8.03e-07 | 19.184 | 0.000 | 1.38e-05 | 1.7e-05 |

| Ljung-Box (L1) (Q): | 0.15 | Jarque-Bera (JB): | 216.00 |
|---|---|---|---|
| Prob(Q): | 0.70 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 4.29 | Skew: | 0.46 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 7.87 |

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ACF & PACF plot
accf_arima = plot_acf(df_pm.diff()[1:])
plt.title('Autocorrelation for Profit Margin')
pacf_arima = plot_pacf(df_pm.diff()[1:])
plt.title('Partial Autocorrelation for Profit Margin')
The ARIMA(0,1,0) model was selected based on its minimal Akaike Information Criterion (AIC), indicating superior fit to the data. Additionally, examination of both the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots revealed the absence of significant lags, further affirming the appropriateness of the selected model.
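As a numeric companion to the plots, the lags whose autocorrelations fall outside the 95% confidence band can be listed directly; a minimal sketch using statsmodels' acf on the differenced series:

from statsmodels.tsa.stattools import acf

# ACF of the differenced series with 95% confidence intervals.
acf_vals, confint = acf(df_pm['profit_margin'].diff()[1:], nlags=20, alpha=0.05)
# A lag is "significant" when its interval excludes zero.
significant_lags = [lag for lag in range(1, 21)
                    if confint[lag, 0] > 0 or confint[lag, 1] < 0]
print(significant_lags)  # expected to be empty or nearly so, consistent with the plots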
Model building
train_arima = df_pm.iloc[:-n_test]
test_arima = df_pm.iloc[-n_test:]
arima_order =(0,1,0)
arimamodel = ARIMA(train_arima,order=arima_order)
arimamodel = arimamodel.fit()
In-sample prediction
start = len(train_arima)
end = len(train_arima) + len(test_arima) - 1
pred_arima = arimamodel.predict(start = start, end = end, typ = 'levels').rename('ARIMA(0,1,0) In sample prediction')
pred_arima.plot(legend = True, figsize = (10,4))
test_arima['profit_margin'].plot(legend = True, figsize = (10,4))
plt.legend(labels=['ARIMA(0,1,0) In sample prediction', 'profit margin'])
plt.ylabel('Value')
plt.xlabel('Date')
plt.title('ARIMA in sample prediction')
plt.show()
By definition, the ARIMA(0,1,0) model means that the series has been differenced once to make it stationary, and that it includes no autoregressive or moving average terms. Intuitively, this model can be used to forecast a time series that exhibits a consistent trend but no seasonality or autocorrelation.
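A quick way to see this in the numbers, assuming the train/test split defined above: the multi-step ARIMA(0,1,0) forecast is flat at the last training value, so it should coincide with a naive last-value rule.

# Naive "last observed value" forecast over the test window; its MAPE should
# match the ARIMA(0,1,0) MAPE reported below.
naive_pred = pd.Series(train_arima['profit_margin'].iloc[-1], index=test_arima.index)
mape_naive = round(mean_absolute_percentage_error(test_arima['profit_margin'], naive_pred) * 100, 2)
print('Naive last-value MAPE : {} %'.format(mape_naive))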
MAPE
mape_arima = round(mean_absolute_percentage_error(test_arima['profit_margin'], pred_arima) * 100, 2)
print('Mean Absolute Percentage Error under ARIMA(0,1,0) is : {} %'.format(mape_arima))
Mean Absolute Percentage Error under ARIMA(0,1,0) is : 5.25 %
Forecast
ts1 = ARIMA(df_pm, order=arima_order)
ts1_fit = ts1.fit()
arima_forecast = ts1_fit.forecast(steps=n_forecast).iloc[0]  # .iloc avoids positional-indexing ambiguity on a date-indexed Series
print("arima forecast:", arima_forecast)
arima forecast: 0.1003
VAR
Adding the interest rate to the VAR model alongside corporate profit margins after tax can improve forecasting by capturing how changes in interest rates affect borrowing costs, consumer spending, and investment decisions, thereby influencing corporate profitability. This inclusion provides insights into the dynamic relationship between interest rate movements and corporate earnings, enhancing the model's ability to anticipate changes in profit margins.
df_pm_ir = df[df['profit_margin'].notna()][['profit_margin', 'interest_rate']]
df_pm_ir
| date | profit_margin | interest_rate |
|---|---|---|
| 1971-01-01 | 0.0548 | 0.038567 |
| 1971-04-01 | 0.0549 | 0.045667 |
| 1971-07-01 | 0.0568 | 0.054767 |
| 1971-10-01 | 0.0595 | 0.047500 |
| 1972-01-01 | 0.0604 | 0.035467 |
| ... | ... | ... |
| 2022-10-01 | 0.0937 | 0.036533 |
| 2023-01-01 | 0.0869 | 0.045167 |
| 2023-04-01 | 0.0961 | 0.049900 |
| 2023-07-01 | 0.0977 | 0.052600 |
| 2023-10-01 | 0.1003 | 0.053300 |

212 rows × 2 columns
plt.figure(figsize=(10, 4))  # set figure size up front (the original plt.plotfigsize assignment had no effect)
plt.plot(df_pm_ir.index, df_pm_ir['interest_rate'], label='Interest Rate')
plt.plot(df_pm_ir.index, df_pm_ir['profit_margin'], label='Profit Margin')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Profit Margin vs Interest Rate')
plt.legend()
plt.show()
Augmented Dickey-Fuller test
ad_fuller_result_1 = adfuller(df_pm_ir['profit_margin'].diff()[1:])
print('Profit margin')
print(f'ADF Statistic: {ad_fuller_result_1[0]}')
print(f'p-value: {ad_fuller_result_1[1]}')
print()
ad_fuller_result_2 = adfuller(df_pm_ir['interest_rate'].diff()[1:])
print('Interest rates')
print(f'ADF Statistic: {ad_fuller_result_2[0]}')
print(f'p-value: {ad_fuller_result_2[1]}')
Profit margin
ADF Statistic: -14.793007270916444
p-value: 2.1567193537527807e-27

Interest rates
ADF Statistic: -6.131977725390502
p-value: 8.359467380348921e-08
There is strong evidence that both the profit margin and the interest rate are stationary after first differencing.
Order Selection
Granger Causality Test
print('Testing whether interest rate Granger-causes profit margin\n')
granger = grangercausalitytests(df_pm_ir, 15, addconst=True, verbose=True)
Testing whether interest rate Granger-causes profit margin

| Lags | ssr F-test p-value | ssr chi2-test p-value | Likelihood ratio p-value | Parameter F-test p-value |
|---|---|---|---|---|
| 1 | 0.0001 | 0.0000 | 0.0000 | 0.0001 |
| 2 | 0.0002 | 0.0001 | 0.0002 | 0.0002 |
| 3 | 0.0008 | 0.0004 | 0.0006 | 0.0008 |
| 4 | 0.0025 | 0.0013 | 0.0018 | 0.0025 |
| 5 | 0.0064 | 0.0034 | 0.0047 | 0.0064 |
| 6 | 0.0086 | 0.0042 | 0.0058 | 0.0086 |
| 7 | 0.0161 | 0.0076 | 0.0105 | 0.0161 |
| 8 | 0.0274 | 0.0128 | 0.0175 | 0.0274 |
| 9 | 0.0506 | 0.0241 | 0.0320 | 0.0506 |
| 10 | 0.0646 | 0.0292 | 0.0393 | 0.0646 |
| 11 | 0.1043 | 0.0485 | 0.0636 | 0.1043 |
| 12 | 0.1352 | 0.0615 | 0.0807 | 0.1352 |
| 13 | 0.2126 | 0.1040 | 0.1311 | 0.2126 |
| 14 | 0.2998 | 0.1558 | 0.1906 | 0.2998 |
| 15 | 0.4253 | 0.2439 | 0.2865 | 0.4253 |
The interest rate's first eight lags are significant at the 5% level (ssr-based F test) in explaining the profit margin's movements.
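For reference, the ssr-based F-test p-values can be pulled out of the returned dictionary programmatically rather than read off the verbose trace; a small sketch using the granger object above:

# grangercausalitytests returns {lag: (test_results, fitted_models)}; collect the
# ssr F-test p-value for each lag and flag the ones below 5%.
pvalues = pd.Series({lag: res[0]['ssr_ftest'][1] for lag, res in granger.items()})
print(pvalues[pvalues < 0.05].index.tolist())  # lags 1 through 8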
Automated Order Selection
train_var = df_pm_ir[:-n_test]
test_var = df_pm_ir[-n_test:]
varmodel = VAR(train_var.diff()[1:])
sorted_order = varmodel.select_order(maxlags = 10)
print(sorted_order.summary())
 VAR Order Selection (* highlights the minimums)
==================================================
       AIC         BIC         FPE        HQIC
--------------------------------------------------
0      -20.45      -20.42*     1.312e-09  -20.44
1      -20.48      -20.38      1.276e-09  -20.44*
2      -20.49      -20.32      1.263e-09  -20.42
3      -20.50      -20.27      1.246e-09  -20.41
4      -20.47      -20.17      1.285e-09  -20.35
5      -20.52*     -20.15      1.226e-09* -20.37
6      -20.50      -20.07      1.248e-09  -20.33
7      -20.51      -20.01      1.234e-09  -20.31
8      -20.49      -19.92      1.263e-09  -20.26
9      -20.46      -19.83      1.299e-09  -20.21
10     -20.44      -19.74      1.327e-09  -20.16
--------------------------------------------------
The fifth-order VAR model demonstrates the lowest AIC value among the considered models. In line with prioritising model simplicity and efficiency, the decision has been made to select the fifth-order VAR model for further analysis and interpretation.
Model Building
var_order = (5,0) # fifth-order VAR model.
var_model = VARMAX(train_var, order = var_order, enforce_stationarity = True)
fitted_model = var_model.fit(disp = False)
print(fitted_model.summary())
                             Statespace Model Results
==============================================================================================
Dep. Variable:     ['profit_margin', 'interest_rate']   No. Observations:                208
Model:                             VAR(5) + intercept   Log Likelihood              1563.460
Date:                                Tue, 07 May 2024   AIC                        -3076.921
Time:                                        21:40:21   BIC                        -2993.482
Sample:                         01-01-1971 - 10-01-2022  HQIC                      -3043.182
Covariance Type:                                  opg
===================================================================================
Ljung-Box (L1) (Q):          0.00, 0.02   Jarque-Bera (JB):      190.65, 2075.76
Prob(Q):                     0.99, 0.89   Prob(JB):                    0.00, 0.00
Heteroskedasticity (H):      3.63, 0.08   Skew:                        0.23, 1.62
Prob(H) (two-sided):         0.00, 0.00   Kurtosis:                   7.67, 18.13

Results for equation profit_margin
====================================================================================
                        coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
intercept             0.0078      0.003      2.318      0.020       0.001       0.014
L1.profit_margin      0.8854      0.068     13.108      0.000       0.753       1.018
L1.interest_rate     -0.0609      0.044     -1.391      0.164      -0.147       0.025
L2.profit_margin     -0.0016      0.079     -0.020      0.984      -0.156       0.153
L2.interest_rate      0.0213      0.082      0.260      0.794      -0.139       0.182
L3.profit_margin      0.1075      0.072      1.490      0.136      -0.034       0.249
L3.interest_rate     -0.0142      0.085     -0.167      0.867      -0.181       0.152
L4.profit_margin     -0.0327      0.102     -0.320      0.749      -0.233       0.168
L4.interest_rate      0.0032      0.069      0.046      0.963      -0.132       0.139
L5.profit_margin     -0.0430      0.074     -0.579      0.563      -0.189       0.103
L5.interest_rate      0.0142      0.050      0.283      0.777      -0.084       0.112

Results for equation interest_rate
====================================================================================
                        coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
intercept             0.0053      0.008      0.699      0.484      -0.010       0.020
L1.profit_margin      0.0339      0.284      0.120      0.905      -0.522       0.590
L1.interest_rate      1.2700      0.080     15.792      0.000       1.112       1.428
L2.profit_margin      0.1679      0.440      0.382      0.703      -0.694       1.030
L2.interest_rate     -0.4922      0.121     -4.061      0.000      -0.730      -0.255
L3.profit_margin     -0.3441      0.485     -0.710      0.478      -1.294       0.606
L3.interest_rate      0.3801      0.107      3.564      0.000       0.171       0.589
L4.profit_margin      0.1001      0.474      0.211      0.833      -0.829       1.029
L4.interest_rate     -0.1644      0.117     -1.406      0.160      -0.394       0.065
L5.profit_margin      0.0010      0.298      0.003      0.997      -0.583       0.585
L5.interest_rate     -0.0408      0.073     -0.559      0.576      -0.184       0.102

Error covariance matrix
========================================================================================================
                                          coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
sqrt.var.profit_margin                  0.0037      0.000     25.187      0.000       0.003       0.004
sqrt.cov.profit_margin.interest_rate    0.0004      0.001      0.289      0.773      -0.002       0.003
sqrt.var.interest_rate                  0.0086      0.000     29.924      0.000       0.008       0.009
========================================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In-sample predictions
predict_var = fitted_model.get_prediction(start = start, end = end)
predictions_var = predict_var.predicted_mean
predictions_var.columns = ['profit_margin_pred','int_pred']
predictions_var = predictions_var.drop('int_pred', axis = 1)
pred_var = pd.concat([test_var, predictions_var], axis = 1)
pred_var[['profit_margin_pred','profit_margin']].plot(figsize = (10,4))
plt.legend(['VAR in sample prediction','Profit margin' ])
plt.ylim(0.08, 0.11)
plt.ylabel('Value')
plt.xlabel('Date')
MAPE
mape_var = round(mean_absolute_percentage_error(test_var['profit_margin'], predictions_var['profit_margin_pred']) * 100, 2)
print('Mean Absolute Percentage Error of Profit Margin under VAR Model is : {} %'.format(mape_var))
Mean Absolute Percentage Error of Profit Margin under VAR Model is : 8.65 %
The Vector Autoregression model (VAR) suggests that an increase in interest rates should lead to a decrease in the corporate profit margin. However, contrary to this expectation, there has been an observed increase in the corporate profit margin. This unexpected outcome may be attributed to recent advancements in generative AI technology.
Forecast
ts2 = VARMAX(df_pm_ir, order = var_order, enforce_stationarity = True)
ts2_fit = ts2.fit(disp = False)
var_forecast = ts2_fit.forecast(steps=n_forecast)['profit_margin'].iloc[0]  # .iloc avoids positional indexing on a date-indexed Series
print("var forecast:",var_forecast)
var forecast: 0.09784981023669143
Linear Regression & Random Forest
Model Setups
# creating auto-lagged data
df_pm_ml = df_pm.copy()  # work on a copy so df_pm itself is not mutated
df_pm_ml['profit_margin-1Q'] = df['profit_margin'].shift(+1)
df_pm_ml['profit_margin-2Q'] = df['profit_margin'].shift(+2)
df_pm_ml['profit_margin-3Q'] = df['profit_margin'].shift(+3)
df_pm_ml = df_pm_ml.dropna()
df_pm_ml
| date | profit_margin | profit_margin-1Q | profit_margin-2Q | profit_margin-3Q |
|---|---|---|---|---|
| 1971-10-01 | 0.0595 | 0.0568 | 0.0549 | 0.0548 |
| 1972-01-01 | 0.0604 | 0.0595 | 0.0568 | 0.0549 |
| 1972-04-01 | 0.0600 | 0.0604 | 0.0595 | 0.0568 |
| 1972-07-01 | 0.0615 | 0.0600 | 0.0604 | 0.0595 |
| 1972-10-01 | 0.0622 | 0.0615 | 0.0600 | 0.0604 |
| ... | ... | ... | ... | ... |
| 2022-10-01 | 0.0937 | 0.0978 | 0.0988 | 0.0949 |
| 2023-01-01 | 0.0869 | 0.0937 | 0.0978 | 0.0988 |
| 2023-04-01 | 0.0961 | 0.0869 | 0.0937 | 0.0978 |
| 2023-07-01 | 0.0977 | 0.0961 | 0.0869 | 0.0937 |
| 2023-10-01 | 0.1003 | 0.0977 | 0.0961 | 0.0869 |

209 rows × 4 columns
lr_ml = LinearRegression()
rf_ml = RandomForestRegressor(n_estimators = 100, max_features = 3, random_state = 1) # The values here can be changed accordingly.
# Concatenating data for in-sample forecasts
x1_ml, x2_ml, x3_ml, y_ml = df_pm_ml['profit_margin-1Q'], df_pm_ml['profit_margin-2Q'], df_pm_ml['profit_margin-3Q'], df_pm_ml['profit_margin']
x1_ml, x2_ml, x3_ml, y_ml = np.array(x1_ml), np.array(x2_ml), np.array(x3_ml), np.array(y_ml)
x1_ml, x2_ml, x3_ml, y_ml = x1_ml.reshape(-1,1), x2_ml.reshape(-1,1), x3_ml.reshape(-1,1), y_ml.reshape(-1,1)
final_x_ml = np.concatenate((x1_ml, x2_ml, x3_ml), axis = 1)
final_x_ml.shape
(209, 3)
# Splitting the data into training and testing groups
x_train_ml, x_test_ml, y_train_ml, y_test_ml = final_x_ml[:-n_test], final_x_ml[-n_test:], y_ml[:-n_test], y_ml[-n_test:]
lr_ml.fit(x_train_ml, y_train_ml)
rf_ml.fit(x_train_ml, y_train_ml.ravel())  # ravel to 1-D to avoid a shape warning
RandomForestRegressor(max_features=3, random_state=1)
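The random forest hyperparameters above were fixed by hand. One way to choose them, sketched below with an illustrative grid (the candidate values are assumptions, not tuned results), is cross-validation with a time-series-aware splitter:

from sklearn.model_selection import TimeSeriesSplit

# Grid search over a small, hypothetical hyperparameter grid using expanding-window
# splits, which respect temporal order (a plain shuffled K-fold would leak the future).
best_score, best_params = -np.inf, None
for n_est, max_feat in product([50, 100, 200], [1, 2, 3]):
    rf_tmp = RandomForestRegressor(n_estimators=n_est, max_features=max_feat, random_state=1)
    score = cross_val_score(rf_tmp, x_train_ml, y_train_ml.ravel(),
                            cv=TimeSeriesSplit(n_splits=5),
                            scoring='neg_root_mean_squared_error').mean()
    if score > best_score:
        best_score, best_params = score, (n_est, max_feat)
print("best (n_estimators, max_features):", best_params)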
In-sample predictions
lin_pred_ml = lr_ml.predict(x_test_ml)
plt.rcParams["figure.figsize"] = (10,4)
plt.plot(lin_pred_ml , label = 'Linear regression in-sample predictions')
plt.plot(y_test_ml, label = 'Profit Margin')
plt.legend(loc = "upper left")
plt.ylim(0.08, 0.11)
plt.ylabel('Value')
plt.xlabel('Forecast periods')
plt.title('Linear Regression Predictions vs. profit margin')
plt.show()
pred_ml = rf_ml.predict(x_test_ml)
plt.rcParams["figure.figsize"] = (10,4)
plt.plot(pred_ml, label = 'Random forest in-sample predictions')
plt.plot(y_test_ml, label = 'Profit Margin')
plt.legend(loc = "upper left")
plt.ylim(0.08, 0.11)
plt.ylabel('Value')
plt.xlabel('Forecast periods')
plt.title('Random Forest Predictions vs. profit margin')
plt.show()
MAPE
mape_lr_ml = round(mean_absolute_percentage_error(y_test_ml, lin_pred_ml) * 100, 2)
print('Mean Absolute Percentage Error under Linear Regression Model is : {} %'.format(mape_lr_ml))
mape_rf_ml = round(mean_absolute_percentage_error(y_test_ml, pred_ml) * 100, 2)
print('Mean Absolute Percentage Error under Random Forest Model is: {} %'.format(mape_rf_ml))
Mean Absolute Percentage Error under Linear Regression Model is : 5.58 %
Mean Absolute Percentage Error under Random Forest Model is: 5.61 %
Both the Linear Regression (LR) and Random Forest (RF) models track fluctuations in the profit margin reasonably closely. It is noteworthy, however, that both models may be susceptible to overfitting, the random forest in particular given its flexibility relative to the short three-lag history it is trained on.
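One hedge against overfitting to this particular four-quarter window is a rolling-origin evaluation, refitting at each step; a sketch for the random forest (the same loop works for the linear model):

# Expanding-window, one-step-ahead evaluation: at each test quarter, refit on all
# data available up to that point and forecast only the next observation.
abs_pct_errors = []
for i in range(len(final_x_ml) - n_test, len(final_x_ml)):
    rf_tmp = RandomForestRegressor(n_estimators=100, max_features=3, random_state=1)
    rf_tmp.fit(final_x_ml[:i], y_ml[:i].ravel())
    pred = rf_tmp.predict(final_x_ml[[i]])[0]
    abs_pct_errors.append(abs(pred - y_ml[i, 0]) / abs(y_ml[i, 0]))
print('Rolling-origin MAPE : {} %'.format(round(100 * np.mean(abs_pct_errors), 2)))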
Forecast
# Predicting one period ahead with linear regression
lr_forecast = lr_ml.predict(final_x_ml[-1].reshape(1, -1))[0][0]
print("Linear Regression Forecast:", lr_forecast)
# Predicting one period ahead with random forest
rf_forecast = rf_ml.predict(final_x_ml[-1].reshape(1, -1))[0]
print("Random Forest Forecast:", rf_forecast)
Linear Regression Forecast: 0.09707552813728464
Random Forest Forecast: 0.09508999999999991
Profit Margin Summary
test_arima['profit_margin'].plot(legend = True, label='Profit Margin')
pred_arima.plot(legend = True, figsize = (10,4), label='ARIMA')
pred_var['profit_margin_pred'].plot(label='VAR')
lin_pred_ml = pd.Series(lr_ml.predict(x_test_ml).flatten(), index=df_pm_ml.index[-n_test:])
lin_pred_ml.plot(label='Linear Regression')
rf_pred_ml = pd.Series(rf_ml.predict(x_test_ml).flatten(), index=df_pm_ml.index[-n_test:])
rf_pred_ml.plot(label='Random Forest')
plt.legend()
plt.ylabel('Value')
plt.xlabel('Date')
plt.title('In Sample Predictions vs Observed Profit Margin Values')
plt.show()
table_data = [
["Model", "Mean Absolute Percentage Error (%)", "Forecast"],
["ARIMA(0,1,0)", mape_arima, round(arima_forecast, 4)],
["VAR Model", mape_var, round(var_forecast, 4)],
["Linear Regression", mape_lr_ml, round(lr_forecast, 4)],
["Random Forest", mape_rf_ml, round(rf_forecast, 4)]
]
print(tabulate(table_data, headers="firstrow", tablefmt="grid"))
+-------------------+--------------------------------------+------------+
| Model             | Mean Absolute Percentage Error (%)   | Forecast   |
+===================+======================================+============+
| ARIMA(0,1,0)      | 5.25                                 | 0.1003     |
+-------------------+--------------------------------------+------------+
| VAR Model         | 8.65                                 | 0.0978     |
+-------------------+--------------------------------------+------------+
| Linear Regression | 5.58                                 | 0.0971     |
+-------------------+--------------------------------------+------------+
| Random Forest     | 5.61                                 | 0.0951     |
+-------------------+--------------------------------------+------------+
After careful analysis, the decision has been made to adopt the ARIMA model for profit margin forecasting, owing to its demonstrated superiority in minimising Mean Absolute Percentage Error (MAPE) compared to alternative models, thereby prioritising forecasting accuracy.
The simplicity and straightforward implementation of the ARIMA model further contribute to its selection for forecasting. Notably, the chosen ARIMA(0,1,0) specification, fitted without an intercept, implies that the profit margin behaves akin to a random walk, y_t = y_{t-1} + e_t, so the best forecast for the next quarter is simply the most recent observation. This aligns with the earlier observation that profit margins evolve gradually and are therefore reasonably predictable.
df_updated = df.copy()
df_updated.loc[df.index[-1], 'profit_margin'] = arima_forecast
Market Index Analysis
df_updated
| date | market_index | gdp | interest_rate | cpi | profit_margin |
|---|---|---|---|---|---|
| 1971-01-01 | 1.080000 | 1135.156 | 0.038567 | 40.000 | 0.0548 |
| 1971-04-01 | 1.136667 | 1156.271 | 0.045667 | 40.500 | 0.0549 |
| 1971-07-01 | 1.113333 | 1177.675 | 0.054767 | 40.800 | 0.0568 |
| 1971-10-01 | 1.113333 | 1190.297 | 0.047500 | 41.100 | 0.0595 |
| 1972-01-01 | 1.240000 | 1230.609 | 0.035467 | 41.400 | 0.0604 |
| ... | ... | ... | ... | ... | ... |
| 2023-01-01 | 199.225806 | 26813.601 | 0.045167 | 301.744 | 0.0869 |
| 2023-04-01 | 208.604032 | 27063.012 | 0.049900 | 304.003 | 0.0961 |
| 2023-07-01 | 222.245714 | 27610.128 | 0.052600 | 307.288 | 0.0977 |
| 2023-10-01 | 222.756984 | 27956.998 | 0.053300 | 308.742 | 0.1003 |
| 2024-01-01 | 250.156557 | 28284.498 | 0.053300 | 312.230 | 0.1003 |

213 rows × 5 columns
These selected metrics are significant economic indicators that have the potential to influence the overall stock market:
- "market_index" denotes the Wilshire5000 index, a comprehensive measure of the total U.S. stock market's performance.
- "gdp" signifies the nominal gross domestic product (GDP) of the United States, a key indicator of the nation's economic health and productivity.
- "interest_rate" represents the US federal funds rate, expressed as a percentage, which serves as a pivotal benchmark for short-term interest rates and business cost of borrowing.
- "cpi" stands for the Consumer Price Index (CPI) for All Urban Consumers: All Items in U.S. City Average, a widely used measure of inflation that reflects changes in the cost of living over time.
- "profit_margin" indicates the US corporate profit after tax rate, expressed as a percentage, offering insights into how technology advancements have increased business's profitability.
The "date" column represents the date beginning in that quarter. For example, "2024-01-01" indicates the date range from January 1, 2024, to March 31, 2024.
df_updated['market_index'].plot(figsize=(10,4))
plt.ylabel('Value')
plt.title('US Market Index')
plt.xlabel('Date')
Market Index Controlling for Inflation
The objective of this study is to investigate the impact of inflation on the market index through the utilisation of a simple regression model. Specifically, the aim is to discern how the market index would fluctuate in the absence of inflationary effects, as represented by the Consumer Price Index (CPI). By controlling for inflation, the study seeks to isolate and analyse the underlying dynamics of the market index, providing valuable insights into its behavior independent of inflationary pressures.
Regression
x_var, y_var = df_updated['cpi'], df_updated['market_index']
x_var_1 = sm.add_constant(x_var)
rg1 = sm.OLS(y_var, x_var_1).fit() # ordinary least square method.
print(rg1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           market_index   R-squared:                       0.716
Model:                            OLS   Adj. R-squared:                  0.715
Method:                 Least Squares   F-statistic:                     532.3
Date:                Tue, 07 May 2024   Prob (F-statistic):           1.30e-59
Time:                        21:40:23   Log-Likelihood:                -1029.3
No. Observations:                 213   AIC:                             2063.
Df Residuals:                     211   BIC:                             2069.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -62.9649      5.102    -12.341      0.000     -73.022     -52.907
cpi            0.6675      0.029     23.072      0.000       0.610       0.725
==============================================================================
Omnibus:                       44.449   Durbin-Watson:                   0.019
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               67.133
Skew:                           1.198   Prob(JB):                     2.64e-15
Kurtosis:                       4.352   Cond. No.                         430.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
- The simple regression analysis conducted indicates a robust relationship between the market index and the Consumer Price Index (CPI). With an R-squared value of 0.716, approximately 71.6% of the variation in the market index can be explained by changes in CPI, suggesting a strong predictive power of CPI on market performance.
- The extremely low probability associated with the F-statistic (1.30e-59) further underscores the significance of the model, implying that CPI is indeed a significant predictor of the market index. Specifically, the coefficient of 0.6675 for CPI suggests that for every one-unit increase in CPI, the market index is expected to increase by 0.6675 units, holding other factors constant. Moreover, the p-value of 0.000 indicates that this relationship is highly unlikely to be due to chance alone.
From an economic standpoint, it aligns with conventional wisdom that changes in the Consumer Price Index, a measure of inflation, can impact market dynamics. For instance, a rising CPI indicates higher inflation, which lifts nominal revenues and shifts investor expectations, both of which can influence stock market performance. The statistical results therefore not only demonstrate a strong empirical relationship but also make economic sense, reflecting the interplay between CPI and market movements.
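As a worked example of reading the fitted line, consider a CPI of 300 (a hypothetical input near the recent data, chosen only to illustrate the arithmetic):

# Fitted line: market_index ≈ -62.9649 + 0.6675 * cpi
# At cpi = 300: -62.9649 + 0.6675 * 300 ≈ 137.3
print(rg1.predict([[1.0, 300.0]]))  # exog columns are [const, cpi]; matches the hand calculation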
Graphs
x_axis, y_axis = np.array(x_var_1), np.array(y_var).reshape(-1,1)
plt.figure(figsize=(10,4))
plt.scatter(x_axis[:, 1], y_axis)
plt.plot(x_axis[:, 1], rg1.predict(x_axis), color = 'r', linewidth = 1)
plt.xlabel('cpi')
plt.ylabel('Market index')
plt.title('Relationship between cpi and the market index')
plt.show()
plt.figure(figsize=(10,4))
pred_simple = rg1.predict(x_var_1)
pred_simple.plot(legend = True)
df['market_index'].plot(legend = True)
plt.ylabel('Value')
plt.xlabel('Date')
plt.legend(labels=['Simple linear regression', 'Market Index'])
plt.title('Simple Linear Regression Model vs Market Index')
The analysis presented above demonstrates that upon incorporating the Consumer Price Index (CPI) as a control variable in the simple regression model, the model exhibits notable improvements in stability and predictability. Specifically, the dispersion between the highest and lowest values diminishes considerably compared to the raw market index data. This suggests that by controlling for CPI, the model achieves a more refined representation of market dynamics, potentially enhancing its reliability for forecasting and decision-making purposes.
Market Framework
The objective is to examine the relationship between the market index and key economic indicators, including GDP, interest rates, CPI, and profit margins, by utilising a multiple regression model.
df_updated.plot(y=['market_index', 'gdp', 'interest_rate', 'cpi', 'profit_margin'], subplots=True, layout=(3, 2), figsize=(15, 10))
plt.suptitle('Economic Indicators Over Time', fontsize=16)
plt.xlabel('Date')
plt.ylabel('Value')
plt.tight_layout()
plt.show()
Ordinary Least Squares Model (encounters multicollinearity)
x_var_2 = sm.add_constant(df_updated[['gdp','interest_rate','cpi','profit_margin']])
rg2 = sm.OLS(y_var, x_var_2).fit()
print(rg2.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           market_index   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     998.7
Date:                Tue, 07 May 2024   Prob (F-statistic):          1.70e-134
Time:                        21:40:23   Log-Likelihood:                -843.28
No. Observations:                 213   AIC:                             1697.
Df Residuals:                     208   BIC:                             1713.
Df Model:                           4
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            50.0683      8.774      5.706      0.000      32.770      67.366
gdp               0.0199      0.001     29.297      0.000       0.019       0.021
interest_rate   166.1826     33.249      4.998      0.000     100.635     231.730
cpi              -1.1010      0.062    -17.729      0.000      -1.223      -0.979
profit_margin  -564.9169    104.610     -5.400      0.000    -771.149    -358.685
==============================================================================
Omnibus:                       59.041   Durbin-Watson:                   0.158
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              203.084
Skew:                           1.096   Prob(JB):                     7.96e-45
Kurtosis:                       7.251   Cond. No.                     1.50e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.5e+06. This might indicate that there are strong multicollinearity or other numerical problems.
vif_data = x_var_2.drop(columns='const')
vif = pd.DataFrame()
vif["Feature"] = vif_data.columns
vif["VIF"] = [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
print(vif)
         Feature        VIF
0            gdp  47.996219
1  interest_rate   3.446659
2            cpi  90.355459
3  profit_margin  25.909254
x_var_3 = x_var_2.drop(columns=['cpi'])
It has been observed that there is potential multicollinearity between GDP and CPI, indicated by their notably high Variance Inflation Factors (VIF). Consequently, CPI is omitted from the analysis to address multicollinearity concerns. This decision is grounded in the recognition that nominal GDP inherently encompasses inflationary effects.
By controlling for nominal GDP, the analysis inherently captures both the economic development of the nation and its inflationary dynamics. Therefore, relying solely on nominal GDP not only resolves the multicollinearity issue but also streamlines the model.
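For context on the VIF numbers above: a feature's VIF is 1/(1 − R²) from regressing that feature on the other explanatory variables, so a VIF near 48 means that regression explains roughly 98% of GDP's variance. A minimal illustration using the vif_data frame defined above (to my understanding, statsmodels' variance_inflation_factor runs this auxiliary regression without adding an intercept):

# VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from regressing feature i on the rest.
r2_gdp = sm.OLS(vif_data['gdp'], vif_data.drop(columns='gdp')).fit().rsquared
print(1 / (1 - r2_gdp))  # ≈ 48.0, matching the gdp VIF reported above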
rg3 = sm.OLS(y_var, x_var_3).fit()
print(rg3.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           market_index   R-squared:                       0.876
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     490.9
Date:                Tue, 07 May 2024   Prob (F-statistic):           2.51e-94
Time:                        21:40:23   Log-Likelihood:                -941.34
No. Observations:                 213   AIC:                             1891.
Df Residuals:                     209   BIC:                             1904.
Df Model:                           3
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const           -50.4561     10.585     -4.767      0.000     -71.324     -29.588
gdp               0.0086      0.000     22.669      0.000       0.008       0.009
interest_rate   295.8238     51.274      5.769      0.000     194.743     396.905
profit_margin  -112.1970    160.370     -0.700      0.485    -428.347     203.953
==============================================================================
Omnibus:                       47.857   Durbin-Watson:                   0.056
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               81.423
Skew:                           1.183   Prob(JB):                     2.09e-18
Kurtosis:                       4.892   Cond. No.                     1.45e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.45e+06. This might indicate that there are strong multicollinearity or other numerical problems.
The model's instability persists, as indicated by the notably high condition number and inflated coefficients. In particular, the profit margin exhibits a negative relationship with the market index, contradicting prevailing economic beliefs.
x_var_3 = x_var_3.drop(columns='const')
correlation_table = x_var_3.corr()
print(correlation_table)
                    gdp  interest_rate  profit_margin
gdp            1.000000      -0.688833       0.851415
interest_rate -0.688833       1.000000      -0.693673
profit_margin  0.851415      -0.693673       1.000000
vif = pd.DataFrame()
vif["Feature"] = x_var_3.columns
vif["VIF"] = [variance_inflation_factor(x_var_3.values, i) for i in range(x_var_3.shape[1])]
print(vif)
         Feature        VIF
0            gdp  10.382391
1  interest_rate   3.142957
2  profit_margin  15.486578
Potential multicollinearity exists between GDP and profit margin, causing model instability.
Partial Least Squares Regression
Although GDP and the profit margin show high correlation, it is crucial to acknowledge their distinct economic significance. To manage potential multicollinearity concerns, the analysis will utilise partial least squares (PLS) regression, allowing for the simultaneous inclusion of both variables.
print(f"dependent variable: {y_var.name}\nindependnet variables: {x_var_3.columns.tolist()}")
dependent variable: market_index independnet variables: ['gdp', 'interest_rate', 'profit_margin']
n_components_range = range(1, 3)  # candidate numbers of latent components: 1 and 2
X_train, X_val, y_train, y_val = train_test_split(x_var_3, y_var, test_size=0.2, random_state=13)
# Cross validation
cv_scores = []
for n_components in n_components_range:
pls = PLSRegression(n_components=n_components)
scores = cross_val_score(pls, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error') # RMSE
cv_scores.append(np.mean(scores))
best_n_components = n_components_range[np.argmax(cv_scores)]
print("Best n_components:", best_n_components)
Best n_components: 2
pls = PLSRegression(n_components= best_n_components)
pls.fit(x_var_3, y_var)
latent_variables = [f"lv{i+1}" for i in range(best_n_components)]
print("latent variable equations:")
for i, lv in enumerate(latent_variables):
print(f"{lv} = ", end="")
for j, original_var in enumerate(x_var_3.columns.tolist()):
coefficient = pls.x_loadings_[j, i]
if coefficient != 0:
print(f"{coefficient:.4f}*{original_var} + ", end="")
print("\b\b ")
latent variable equations:
lv1 = 0.6122*gdp + -0.5351*interest_rate + 0.6051*profit_margin
lv2 = 0.4608*gdp + 0.9344*interest_rate + 0.0876*profit_margin
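Note that these loadings are expressed in standardised units: PLSRegression centres and scales each column by default (scale=True), which is what makes the gdp and interest_rate coefficients comparable despite raw magnitudes that differ by orders of magnitude. A quick consistency check on the fitted scores:

# transform() applies the stored centring/scaling and rotation, so on the
# training data it should reproduce the fitted score matrix exactly.
print(np.allclose(pls.transform(x_var_3), pls.x_scores_))  # expected: True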
x_latent_vars = pls.x_scores_
x_var_4 = pd.DataFrame(x_latent_vars, columns= latent_variables)
x_var_4.index = x_var_3.index
x_var_4 = sm.add_constant(x_var_4)
rg4 = sm.OLS(y_var, x_var_4).fit()
print(rg4.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           market_index   R-squared:                       0.851
Model:                            OLS   Adj. R-squared:                  0.850
Method:                 Least Squares   F-statistic:                     601.2
Date:                Tue, 07 May 2024   Prob (F-statistic):           1.21e-87
Time:                        21:40:23   Log-Likelihood:                -960.43
No. Observations:                 213   AIC:                             1927.
Df Residuals:                     210   BIC:                             1937.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         44.4095      1.517     29.280      0.000      41.420      47.399
lv1           30.8209      0.978     31.515      0.000      28.893      32.749
lv2           39.3655      2.721     14.466      0.000      34.001      44.730
==============================================================================
Omnibus:                       20.863   Durbin-Watson:                   0.082
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               26.529
Skew:                           0.669   Prob(JB):                     1.74e-06
Kurtosis:                       4.094   Cond. No.                         2.78
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Multicollinearity has been effectively mitigated by employing partial least squares regression: the condition number falls to 2.78, compared with roughly 1.5e+06 in the earlier OLS specifications. Also noteworthy is that the R-squared value (0.851) has improved compared to the simple regression model (0.716).
num_iterations = 1000
num_latent_variables = 2
boot_loadings = np.zeros((num_iterations, len(x_var_3.columns.tolist()), num_latent_variables))
for i in range(num_iterations):
bootstrap_indices = np.random.choice(len(x_var_3), size=len(x_var_3), replace=True)
pls.fit(x_var_3.iloc[bootstrap_indices], y_var.iloc[bootstrap_indices])
boot_loadings[i, :, :] = pls.x_loadings_ # 2 latent variables
# 95% confidence interval
lower_bound = np.percentile(boot_loadings, 2.5, axis=0)
upper_bound = np.percentile(boot_loadings, 97.5, axis=0)
for j in range(num_latent_variables):
print(f"Latent Variable {j+1}:")
for i, original_var in enumerate(x_var_3.columns.tolist()):
lower_bound_val = lower_bound[i, j]
upper_bound_val = upper_bound[i, j]
print(f"{original_var}: Lower Bound = {lower_bound_val:.4f}, Upper Bound = {upper_bound_val:.4f} ", end="")
if lower_bound_val > 0 or upper_bound_val < 0:
print(f'"statistically significant"')
else:
print(f'"not statistically significant"')
print()
Latent Variable 1:
gdp: Lower Bound = 0.6027, Upper Bound = 0.6236 "statistically significant"
interest_rate: Lower Bound = -0.5515, Upper Bound = -0.5153 "statistically significant"
profit_margin: Lower Bound = 0.5920, Upper Bound = 0.6181 "statistically significant"

Latent Variable 2:
gdp: Lower Bound = 0.3863, Upper Bound = 0.5540 "statistically significant"
interest_rate: Lower Bound = 0.8803, Upper Bound = 0.9657 "statistically significant"
profit_margin: Lower Bound = -0.0203, Upper Bound = 0.1827 "not statistically significant"
Although the profit margin loading on the second latent variable does not reach statistical significance at the 5% level, its lower bound is only marginally below zero. More importantly, all other loadings in the model are statistically significant.
plt.figure(figsize=(10,4))
market_framework = rg4.predict(x_var_4)
market_framework.plot(legend = True)
df_updated['market_index'].plot(legend = True)
plt.ylabel('Value')
plt.xlabel('Date')
plt.legend(labels=['Market Framework', 'Market index'])
plt.title('Market Framework vs Market Index')
Market Sentiment
The objective of this analysis is to identify the market sentiment by controlling for interest rates and profit margins in relation to the market cap to GDP. By examining the interplay between these factors, the study aims to pinpoint influential drivers behind shifts in market sentiment. Through this investigation, the goal is to provide valuable insights into the dynamics of investor confidence, aiding in the identification of key turning points in the market and facilitating more informed decision-making for stakeholders.
The Market Cap to GDP Ratio, also known as the "Buffett Indicator" is a metric used to gauge whether the stock market is overvalued or undervalued relative to the size of the economy. It's named after Warren Buffett, who popularised its use as a valuation tool. The calculation involves dividing the total market capitalisation of all publicly traded stocks by the Gross Domestic Product (GDP) of a country.
However, contemporary critiques suggest that while the Buffett Indicator offers valuable historical context, its relevance has diminished over time. One notable limitation is its failure to account for evolving business dynamics, including the impact of borrowing costs and technological advancements on corporate profitability. As such, while the indicator remains a useful benchmark, it is increasingly viewed as insufficient for capturing the modern economic landscape.
df_updated['market-gdp'] = df_updated['market_index'] / df_updated['gdp']
df_updated_bi = df_updated[['market-gdp','interest_rate','profit_margin']]
df_updated_bi['market-gdp'].plot(figsize=(10,4))
plt.title('Market capitalisation to GDP')
plt.ylabel('Value')
plt.xlabel('Date')
Please note that the ratio may not conform to commonly known figures due to the lack of normalisation in the values. Nonetheless, the data remain sufficiently accurate to facilitate statistical analysis.
y_var_bi = df_updated_bi['market-gdp']
x_var_5 = df_updated_bi[['interest_rate', 'profit_margin']]
explanatory_variables = np.array(x_var_5)
dependent_variable = np.array(y_var_bi)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('interest rate')
ax.set_ylabel('profit margin')
ax.set_zlabel('Market Cap to GDP')
plt.title('3D relationship between the Market Cap to GDP, interest rate, and profit margin')
ax.scatter(explanatory_variables[:, 0], explanatory_variables[:, 1], dependent_variable)
x_var_5 = sm.add_constant(x_var_5)
rg5 = sm.OLS(y_var_bi, x_var_5).fit()
print(rg5.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:             market-gdp   R-squared:                       0.602
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     159.0
Date:                Tue, 07 May 2024   Prob (F-statistic):           8.88e-43
Time:                        21:40:24   Log-Likelihood:                 1102.3
No. Observations:                 213   AIC:                            -2199.
Df Residuals:                     210   BIC:                            -2189.
Df Model:                           2
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.0025      0.001     -3.744      0.000      -0.004      -0.001
interest_rate    -0.0093      0.003     -2.750      0.006      -0.016      -0.003
profit_margin     0.0821      0.008     10.785      0.000       0.067       0.097
==============================================================================
Omnibus:                       12.317   Durbin-Watson:                   0.069
Prob(Omnibus):                  0.002   Jarque-Bera (JB):               12.989
Skew:                           0.600   Prob(JB):                      0.00151
Kurtosis:                       3.155   Cond. No.                         85.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The regression model demonstrates stability, with both explanatory variables exhibiting significance. The coefficients reveal a negative relationship between the interest rate and the market capitalisation to GDP ratio, and a positive relationship between the profit margin and the ratio. These findings align with established economic beliefs.
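As a quick sanity check on the fitted equation (market-gdp ≈ -0.0025 - 0.0093 × interest_rate + 0.0821 × profit_margin), the model-implied ratio for the most recent quarter can be computed directly:

# Model-implied market-cap-to-GDP ratio for the latest quarter versus the observed value.
print('implied :', rg5.predict(x_var_5.iloc[[-1]]).iloc[0])
print('observed:', y_var_bi.iloc[-1])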
plt.figure(figsize=(10,4))
market_sentiment = rg5.predict(x_var_5)
market_sentiment.plot(legend = True)
df_updated_bi['market-gdp'].plot(legend = True)
plt.ylabel('Value')
plt.xlabel('Date')
plt.legend(labels=['Market Sentiment', 'Market Cap to GDP'])
plt.title('Market Sentiment vs Market Cap to GDP')
Market Analysis Summary
market_sentiment = rg5.predict(x_var_5)
market_sentiment_mean = np.mean(market_sentiment)
market_sentiment_std = np.std(market_sentiment)
fig, ax1 = plt.subplots(figsize=(10, 4))
market_sentiment.plot(legend=True, label='Market Sentiment', ax=ax1, color='red')
y_var_bi.plot(legend=True, label='Market Cap to GDP', ax=ax1, color='green')
ax1.set_ylabel('Market Sentiment & Market Cap to GDP Values')
ax1.set_xlabel('Date')
ax1.set_title('Regression Models')
ax2 = ax1.twinx()
ax2.set_ylabel('Market Index & Market Framework Values')
df_updated['market_index'].plot(legend=True, label='Market Index', ax=ax2, color='blue')
market_framework.plot(legend = True, label='Market Framework', color='orange')
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')
plt.show()
plt.figure(figsize=(10, 4))
market_sentiment.plot(legend=True, label='Market Sentiment', color='red')
plt.axhline(market_sentiment_mean, color='grey', linestyle='-', label='Mean')
plt.axhline(market_sentiment_mean + market_sentiment_std, color='g', linestyle='--', label='1 sd')
plt.axhline(market_sentiment_mean - market_sentiment_std, color='g', linestyle='--')
plt.axhline(market_sentiment_mean + 2*market_sentiment_std, color='royalblue', linestyle='--', label='2 sd')
plt.axhline(market_sentiment_mean - 2*market_sentiment_std, color='royalblue', linestyle='--')
plt.axhline(market_sentiment_mean + 3*market_sentiment_std, color='purple', linestyle='--', label='3 sd')
plt.axhline(market_sentiment_mean - 3*market_sentiment_std, color='purple', linestyle='--')
plt.ylabel('Value')
plt.xlabel('Date')
plt.title('Market Sentiment Variation')
plt.legend()
plt.show()
The standard deviation from the mean within the Market Sentiment model serves as a potential metric for identifying notable shifts in investor sentiment toward the market. Instances where values significantly diverge from the mean signal potential extremes in market perception, indicating conditions that may be characterised as either excessively bullish ("hot") or overly bearish ("cold").
These deviations provide insight into the degree of divergence from typical expectations, thereby facilitating the recognition of significant shifts in market sentiment.
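One way to operationalise this is a simple z-score on the fitted sentiment series, flagging quarters beyond two standard deviations as candidate "hot" or "cold" regimes; a sketch using the variables defined above (the two-sd threshold is an assumption, not a calibrated rule):

# Standardise the fitted sentiment and list quarters beyond +/- 2 standard deviations.
z_scores = (market_sentiment - market_sentiment_mean) / market_sentiment_std
extremes = z_scores[z_scores.abs() > 2]
print(extremes.round(2))  # positive values suggest "hot" markets; negative, "cold"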