diff --git a/ARIMA/ARIMA_V2.ipynb b/ARIMA/ARIMA_V2.ipynb new file mode 100644 index 0000000..bd5fe49 --- /dev/null +++ b/ARIMA/ARIMA_V2.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initial code\n", + "kept for reference purposes" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "# import numpy as np\n", + "# from sklearn.metrics import mean_squared_error\n", + "# from sklearn.preprocessing import StandardScaler\n", + "# from statsmodels.tsa.statespace.sarimax import SARIMAX\n", + "# from statsmodels.tools.sm_exceptions import ConvergenceWarning\n", + "# import warnings\n", + "\n", + "# # Ignore convergence warnings\n", + "# warnings.simplefilter(\"ignore\")\n", + "\n", + "# # Load dataset with parsed dates\n", + "# data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)\n", + "\n", + "# # Set the index to the Date column\n", + "# data.set_index('Date', inplace=True)\n", + "# # data = data.asfreq('D')\n", + "# # Feature Engineering: Add day of week and month\n", + "# data['day_of_week'] = data.index.dayofweek\n", + "# data['month'] = data.index.month\n", + "\n", + "# # Add lagged value of the Close price and moving averages\n", + "# data['lagged_close'] = data['Close'].shift(1) \n", + "# data['moving_avg_3'] = data['Close'].rolling(window=3).mean()\n", + "# data['moving_avg_7'] = data['Close'].rolling(window=7).mean() # New: 7-day moving average for long-term trend\n", + "\n", + "# # Add Volume as a feature (scaling might help)\n", + "# data['volume'] = data['Volume']\n", + "\n", + "# # Drop rows with NaN values\n", + "# data.dropna(inplace=True)\n", + "\n", + "# # Standardize the features (important for scaling)\n", + "# scaler = StandardScaler()\n", + "# exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']\n", + "# data[exog_features] = scaler.fit_transform(data[exog_features])\n", + "\n", + "# # Split the data into training and testing sets\n", + "# train_size = int(len(data) * 0.8)\n", + "# train, test = data.iloc[:train_size], data.iloc[train_size:]\n", + "\n", + "# # Tune SARIMAX hyperparameters (ARIMA order (p, d, q))\n", + "# order = (2, 1, 2) # Consider using AIC/BIC for finding optimal order\n", + "# seasonal_order = (1, 1, 1, 12) # Adding seasonality with monthly frequency\n", + "\n", + "# # Fit the SARIMAX model\n", + "# try:\n", + "# model = SARIMAX(train['Close'], \n", + "# exog=train[exog_features],\n", + "# order=order,\n", + "# seasonal_order=seasonal_order)\n", + "# model_fit = model.fit(disp=False)\n", + "# except ConvergenceWarning as e:\n", + "# print(f\"Convergence warning: {e}\")\n", + "# except Exception as e:\n", + "# print(f\"Error: {e}\")\n", + "\n", + "# # Forecasting\n", + "# forecast = model_fit.forecast(steps=len(test), exog=test[exog_features])\n", + "\n", + "# # Calculate RMSE for forecast\n", + "# rmse_arimax = np.sqrt(mean_squared_error(test['Close'], forecast))\n", + "# print(f\"Improved ARIMAX Model RMSE: {rmse_arimax}\")\n", + "\n", + "# test_prices = [i for i in test['Close']]\n", + "# # Check residuals diagnostics (optional)\n", + "# residuals = test_prices - forecast\n", + "# print(\"Mean of residuals:\", residuals.mean())\n", + "# print(\"Standard deviation of residuals:\", residuals.std())\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing necessary libraries" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pickle\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.preprocessing import StandardScaler\n", + "from statsmodels.tsa.statespace.sarimax import SARIMAX\n", + "from statsmodels.tools.sm_exceptions import ConvergenceWarning\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Udating features to dataset for proper time-series analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Ignore convergence warnings\n", + "warnings.simplefilter(\"ignore\", ConvergenceWarning)\n", + "\n", + "# Load training dataset with parsed dates\n", + "train_data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)\n", + "\n", + "# Set the index to the Date column\n", + "train_data.set_index('Date', inplace=True)\n", + "\n", + "# Feature Engineering: Add day of week and month\n", + "train_data['day_of_week'] = train_data.index.dayofweek\n", + "train_data['month'] = train_data.index.month\n", + "\n", + "# Add lagged value of the Close price and moving averages\n", + "train_data['lagged_close'] = train_data['Close'].shift(1)\n", + "train_data['moving_avg_3'] = train_data['Close'].rolling(window=3).mean()\n", + "train_data['moving_avg_7'] = train_data['Close'].rolling(window=7).mean()\n", + "\n", + "# Add Volume as a feature (scaling might help)\n", + "train_data['volume'] = train_data['Volume']\n", + "\n", + "# Drop rows with NaN values after applying the rolling window and lagging\n", + "train_data.dropna(inplace=True)\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']\n", + "train_data[exog_features] = scaler.fit_transform(train_data[exog_features])\n", + "\n", + "# Split the data into training and testing sets\n", + "train_size = int(len(train_data) * 0.8)\n", + "train, validation = train_data.iloc[:train_size], train_data.iloc[train_size:]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training and savinng model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\statsmodels\\tsa\\base\\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.\n", + " self._init_dates(dates, freq)\n", + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\statsmodels\\tsa\\base\\tsa_model.py:473: ValueWarning: A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.\n", + " self._init_dates(dates, freq)\n", + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\statsmodels\\tsa\\statespace\\sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. 
Using zeros as starting parameters.\n", + " warn('Non-invertible starting MA parameters found.'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model and scaler saved successfully.\n" + ] + } + ], + "source": [ + "# Train the SARIMAX model\n", + "order = (2, 1, 2)\n", + "seasonal_order = (1, 1, 1, 12)\n", + "\n", + "model = SARIMAX(train['Close'], exog=train[exog_features], order=order, seasonal_order=seasonal_order)\n", + "model_fit = model.fit(disp=False)\n", + "\n", + "# Save the model to a file using pickle\n", + "with open('sarimax_model.pkl', 'wb') as f:\n", + " pickle.dump(model_fit, f)\n", + "\n", + "# Optionally save the scaler as well\n", + "with open('scaler.pkl', 'wb') as f:\n", + " pickle.dump(scaler, f)\n", + "\n", + "print(\"Model and scaler saved successfully.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading the saved model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and scaler from the files\n", + "with open('sarimax_model.pkl', 'rb') as f:\n", + " loaded_model = pickle.load(f)\n", + "\n", + "with open('scaler.pkl', 'rb') as f:\n", + " loaded_scaler = pickle.load(f)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading and processing test data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the test dataset\n", + "test_data = pd.read_csv('../Data/SBI Test data.csv', parse_dates=['Date'], dayfirst=True)\n", + "\n", + "# Set the index to the Date column\n", + "test_data.set_index('Date', inplace=True)\n", + "\n", + "# Apply the same feature engineering on the test data\n", + "test_data['day_of_week'] = test_data.index.dayofweek\n", + "test_data['month'] = test_data.index.month\n", + "test_data['lagged_close'] = test_data['Close'].shift(1)\n", + "test_data['moving_avg_3'] = test_data['Close'].rolling(window=3).mean()\n", + "test_data['moving_avg_7'] = test_data['Close'].rolling(window=7).mean()\n", + "\n", + "# Add Volume as a feature\n", + "test_data['volume'] = test_data['Volume']\n", + "\n", + "# Drop rows with NaN values\n", + "test_data.dropna(inplace=True)\n", + "\n", + "# Standardize the features in the test dataset using the loaded scaler\n", + "test_data[exog_features] = loaded_scaler.transform(test_data[exog_features])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Predicting share prices using the loaded model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\statsmodels\\tsa\\base\\tsa_model.py:837: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.\n", + " return get_prediction_index(\n", + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\statsmodels\\tsa\\base\\tsa_model.py:837: FutureWarning: No supported index is available. 
In the next version, calling this method in a model without a supported index will result in an exception.\n", + " return get_prediction_index(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Data RMSE: 4.673693537736142\n", + "Mean of residuals: 0.311316834051805\n", + "Standard deviation of residuals: 4.664969245987366\n" + ] + } + ], + "source": [ + "# Forecasting on the test data using the loaded model\n", + "forecast_test = loaded_model.forecast(steps=len(test_data), exog=test_data[exog_features])\n", + "\n", + "# Calculate RMSE for forecast\n", + "rmse_test = np.sqrt(mean_squared_error(test_data['Close'], forecast_test))\n", + "print(f\"Test Data RMSE: {rmse_test}\")\n", + "\n", + "# Check residuals diagnostics (optional)\n", + "test_prices = test_data['Close'].values\n", + "residuals_test = test_prices - forecast_test\n", + "print(\"Mean of residuals:\", residuals_test.mean())\n", + "print(\"Standard deviation of residuals:\", residuals_test.std())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ARIMA/README.md b/ARIMA/README.md new file mode 100644 index 0000000..1b56446 --- /dev/null +++ b/ARIMA/README.md @@ -0,0 +1,75 @@ +# ARIMA Model Development for Time Series Forecasting + +## Overview + +This directory contains the code and resources for developing an ARIMA model to forecast time series data. The model uses historical data to make predictions on future values, specifically focusing on stock prices. + +## Table of Contents + +- [Dataset Changes](#dataset-changes) +- [ARIMA Model Development Process](#arima-model-development-process) +- [Usage](#usage) + +## Dataset Changes + +To facilitate the development of the ARIMA model, the following changes were made to the dataset: + +1. **New Features Added**: + + - **Day of Week**: Extracted from the date to capture weekly seasonality. + - **Month**: Extracted from the date to capture monthly trends. + - **Lagged Close Price**: Included the previous day's closing price as a feature for better prediction accuracy. + - **Moving Averages**: Added 3-day and 7-day moving averages to smooth the data and identify trends. + - **Volume**: Included volume as a feature to understand its impact on stock prices. + +2. **Data Preprocessing**: + - Rows with missing values were removed after feature engineering to ensure model accuracy. + - Features were standardized using `StandardScaler` to improve model performance. + +## ARIMA Model Development Process + +The following steps outline the process for developing the ARIMA model: + +1. **Data Loading**: + + - Load historical stock price data from a CSV file, parsing dates correctly. + +2. **Feature Engineering**: + + - Extract features like day of the week, month, lagged prices, moving averages, and volume. + - Drop any rows with NaN values resulting from feature engineering. + +3. **Data Splitting**: + + - Split the dataset into training and validation sets (80% training, 20% validation). + +4. **Model Specification**: + + - Specify the order of the ARIMA model (p, d, q) based on prior analysis or domain knowledge. + - Define seasonal orders if applicable. + +5. 
**Model Fitting**: + + - Fit the SARIMAX model using the training dataset, including exogenous features. + +6. **Model Saving**: + + - Save the trained model and scaler using `pickle` for later use in forecasting. + +7. **Model Testing**: + + - Load the saved model and scaler. + - Preprocess the test dataset in the same way as the training dataset. + - Make predictions using the test dataset and evaluate model performance using RMSE (Root Mean Square Error). + +8. **Residual Analysis**: + - Analyze the residuals of the model to check for any patterns that may indicate model inadequacy. + +## Usage + +1. **Install Dependencies**: + Make sure you have the required packages installed. You can do this using `pip`: + + ```bash + pip install pandas numpy scikit-learn statsmodels + ``` diff --git a/ARIMA/hybrid.ipynb b/ARIMA/hybrid.ipynb new file mode 100644 index 0000000..491551d --- /dev/null +++ b/ARIMA/hybrid.ipynb @@ -0,0 +1,1028 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing stepwise search to minimize aic\n", + " ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=30202.237, Time=0.62 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=30233.097, Time=0.05 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=30205.175, Time=0.16 sec\n", + " ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=30203.304, Time=0.22 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] : AIC=30231.827, Time=0.04 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=30196.421, Time=0.84 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=30201.003, Time=0.20 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=30198.399, Time=1.24 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=30198.353, Time=1.94 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=30196.789, Time=1.17 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=30202.291, Time=0.30 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=30199.886, Time=1.15 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] : AIC=30195.213, Time=0.28 sec\n", + " ARIMA(1,1,1)(0,0,0)[0] : AIC=30200.893, Time=0.38 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] : AIC=30199.679, Time=0.09 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] : AIC=30197.192, Time=0.56 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] : AIC=30197.148, Time=0.97 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] : AIC=30203.808, Time=0.06 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] : AIC=30195.578, Time=0.44 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] : AIC=30200.982, Time=0.13 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] : AIC=30198.679, Time=0.51 sec\n", + "\n", + "Best model: ARIMA(2,1,1)(0,0,0)[0] \n", + "Total fit time: 11.362 seconds\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'numpy.ndarray' object has no attribute 'values'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 71\u001b[0m\n\u001b[0;32m 68\u001b[0m custom_data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../Data/SBI Train data.csv\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 69\u001b[0m close_prices \u001b[38;5;241m=\u001b[39m custom_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClose\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m---> 71\u001b[0m prediction \u001b[38;5;241m=\u001b[39m 
\u001b[43mhybrid_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mclose_prices\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHybrid model prediction for next day closing price: $\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mprediction\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m.2f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[1;32mIn[2], line 34\u001b[0m, in \u001b[0;36mhybrid_model\u001b[1;34m(data, time_steps)\u001b[0m\n\u001b[0;32m 31\u001b[0m arima_results \u001b[38;5;241m=\u001b[39m arima_model\u001b[38;5;241m.\u001b[39mfit()\n\u001b[0;32m 33\u001b[0m \u001b[38;5;66;03m# Get ARIMA residuals\u001b[39;00m\n\u001b[1;32m---> 34\u001b[0m arima_residuals \u001b[38;5;241m=\u001b[39m df \u001b[38;5;241m-\u001b[39m \u001b[43marima_results\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfittedvalues\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# Prepare data for LSTM\u001b[39;00m\n\u001b[0;32m 37\u001b[0m scaler \u001b[38;5;241m=\u001b[39m MinMaxScaler()\n", + "\u001b[1;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'values'" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.tsa.arima.model import ARIMA\n", + "from pmdarima import auto_arima\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, LSTM\n", + "\n", + "def prepare_data(data, time_steps):\n", + " X, y = [], []\n", + " for i in range(len(data) - time_steps):\n", + " X.append(data[i:(i + time_steps), 0])\n", + " y.append(data[i + time_steps, 0])\n", + " return np.array(X), np.array(y)\n", + "\n", + "def hybrid_model(data, time_steps=60):\n", + " # Ensure data is numpy array\n", + " if isinstance(data, pd.DataFrame):\n", + " df = data.values\n", + " else:\n", + " df = np.array(data)\n", + " \n", + " df = df.reshape(-1, 1)\n", + "\n", + " # ARIMA model\n", + " model_auto = auto_arima(df, start_p=1, start_q=1, max_p=3, max_q=3, m=1,\n", + " d=None, seasonal=False, start_P=0, D=0, trace=True,\n", + " error_action='ignore', suppress_warnings=True, stepwise=True)\n", + "\n", + " arima_model = ARIMA(df, order=model_auto.order)\n", + " arima_results = arima_model.fit()\n", + "\n", + " # Get ARIMA residuals\n", + " arima_residuals = df - arima_results.fittedvalues.values.reshape(-1, 1)\n", + "\n", + " # Prepare data for LSTM\n", + " scaler = MinMaxScaler()\n", + " residuals_scaled = scaler.fit_transform(arima_residuals)\n", + "\n", + " X, y = prepare_data(residuals_scaled, time_steps)\n", + " X = np.reshape(X, (X.shape[0], X.shape[1], 1))\n", + "\n", + " # LSTM model\n", + " lstm_model = Sequential([\n", + " LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)),\n", + " LSTM(units=50),\n", + " Dense(units=1)\n", + " ])\n", + " lstm_model.compile(optimizer='adam', loss='mean_squared_error')\n", + " lstm_model.fit(X, y, epochs=50, batch_size=32, verbose=0)\n", + "\n", + " # Make hybrid prediction\n", + " last_60_days = residuals_scaled[-60:]\n", + " X_test = np.array([last_60_days])\n", + " X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n", + "\n", + " lstm_prediction = lstm_model.predict(X_test)\n", 
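+ "    # invert the MinMax scaling so the LSTM's residual correction is back in price units before it is added to the ARIMA forecast\n",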
+ " lstm_prediction = scaler.inverse_transform(lstm_prediction)\n", + "\n", + " arima_forecast = arima_results.forecast(steps=1)\n", + "\n", + " hybrid_prediction = arima_forecast + lstm_prediction[0][0]\n", + "\n", + " return hybrid_prediction[0]\n", + "\n", + "# Example usage with custom data\n", + "# Assuming you have a CSV file named 'my_stock_data.csv' with a 'Close' column\n", + "custom_data = pd.read_csv('../Data/SBI Train data.csv')\n", + "close_prices = custom_data['Close']\n", + "\n", + "prediction = hybrid_model(close_prices)\n", + "print(f\"Hybrid model prediction for next day closing price: ${prediction:.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing stepwise search to minimize aic\n", + " ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=30202.237, Time=1.22 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=30233.097, Time=0.09 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=30205.175, Time=0.55 sec\n", + " ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=30203.304, Time=0.46 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] : AIC=30231.827, Time=0.06 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=30196.421, Time=1.17 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=30201.003, Time=0.20 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=30198.399, Time=1.24 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=30198.353, Time=1.94 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=30196.789, Time=1.18 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=30202.291, Time=0.31 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=30199.886, Time=1.17 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] : AIC=30195.213, Time=0.31 sec\n", + " ARIMA(1,1,1)(0,0,0)[0] : AIC=30200.893, Time=0.39 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] : AIC=30199.679, Time=0.08 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] : AIC=30197.192, Time=0.57 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] : AIC=30197.148, Time=0.97 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] : AIC=30203.808, Time=0.07 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] : AIC=30195.578, Time=0.45 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] : AIC=30200.982, Time=0.14 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] : AIC=30198.679, Time=0.55 sec\n", + "\n", + "Best model: ARIMA(2,1,1)(0,0,0)[0] \n", + "Total fit time: 13.125 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\keras\\src\\layers\\rnn\\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. 
When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", + " super().__init__(**kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 148ms/step\n", + "Hybrid model prediction for next day closing price: $245.77\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.tsa.arima.model import ARIMA\n", + "from pmdarima import auto_arima\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, LSTM\n", + "\n", + "def prepare_data(data, time_steps):\n", + " X, y = [], []\n", + " for i in range(len(data) - time_steps):\n", + " X.append(data[i:(i + time_steps), 0])\n", + " y.append(data[i + time_steps, 0])\n", + " return np.array(X), np.array(y)\n", + "\n", + "def hybrid_model(data, time_steps=60):\n", + " # Ensure data is numpy array\n", + " if isinstance(data, pd.Series):\n", + " df = data.values\n", + " elif isinstance(data, pd.DataFrame):\n", + " df = data.values\n", + " else:\n", + " df = np.array(data)\n", + " \n", + " df = df.reshape(-1, 1)\n", + "\n", + " # ARIMA model\n", + " model_auto = auto_arima(df, start_p=1, start_q=1, max_p=3, max_q=3, m=1,\n", + " d=None, seasonal=False, start_P=0, D=0, trace=True,\n", + " error_action='ignore', suppress_warnings=True, stepwise=True)\n", + "\n", + " arima_model = ARIMA(df, order=model_auto.order)\n", + " arima_results = arima_model.fit()\n", + "\n", + " # Get ARIMA residuals\n", + " arima_residuals = df - arima_results.fittedvalues.reshape(-1, 1)\n", + "\n", + " # Prepare data for LSTM\n", + " scaler = MinMaxScaler()\n", + " residuals_scaled = scaler.fit_transform(arima_residuals)\n", + "\n", + " X, y = prepare_data(residuals_scaled, time_steps)\n", + " X = np.reshape(X, (X.shape[0], X.shape[1], 1))\n", + "\n", + " # LSTM model\n", + " lstm_model = Sequential([\n", + " LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)),\n", + " LSTM(units=50),\n", + " Dense(units=1)\n", + " ])\n", + " lstm_model.compile(optimizer='adam', loss='mean_squared_error')\n", + " lstm_model.fit(X, y, epochs=50, batch_size=32, verbose=0)\n", + "\n", + " # Make hybrid prediction\n", + " last_60_days = residuals_scaled[-60:]\n", + " X_test = np.array([last_60_days])\n", + " X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n", + "\n", + " lstm_prediction = lstm_model.predict(X_test)\n", + " lstm_prediction = scaler.inverse_transform(lstm_prediction)\n", + "\n", + " arima_forecast = arima_results.forecast(steps=1)\n", + "\n", + " hybrid_prediction = arima_forecast + lstm_prediction[0][0]\n", + "\n", + " return hybrid_prediction[0]\n", + "\n", + "# Example usage with custom data\n", + "custom_data = pd.read_csv('../Data/SBI Train data.csv')\n", + "close_prices = custom_data['Close']\n", + "\n", + "prediction = hybrid_model(close_prices)\n", + "print(f\"Hybrid model prediction for next day closing price: ${prediction:.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing stepwise search to minimize aic\n", + " ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=30202.237, Time=0.62 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=30233.097, Time=0.05 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] intercept : 
AIC=30205.175, Time=0.15 sec\n", + " ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=30203.304, Time=0.21 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] : AIC=30231.827, Time=0.03 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=30196.421, Time=0.85 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=30201.003, Time=0.21 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=30198.399, Time=1.25 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=30198.353, Time=1.96 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=30196.789, Time=1.21 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=30202.291, Time=0.32 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=30199.886, Time=1.15 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] : AIC=30195.213, Time=0.29 sec\n", + " ARIMA(1,1,1)(0,0,0)[0] : AIC=30200.893, Time=0.43 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] : AIC=30199.679, Time=0.09 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] : AIC=30197.192, Time=0.58 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] : AIC=30197.148, Time=1.01 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] : AIC=30203.808, Time=0.06 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] : AIC=30195.578, Time=0.44 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] : AIC=30200.982, Time=0.15 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] : AIC=30198.679, Time=0.52 sec\n", + "\n", + "Best model: ARIMA(2,1,1)(0,0,0)[0] \n", + "Total fit time: 11.613 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\keras\\src\\layers\\rnn\\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", + " super().__init__(**kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 140ms/step\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'ARIMA' object has no attribute 'append'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[4], line 88\u001b[0m\n\u001b[0;32m 85\u001b[0m test_close_prices \u001b[38;5;241m=\u001b[39m test_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClose\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 87\u001b[0m \u001b[38;5;66;03m# Make predictions\u001b[39;00m\n\u001b[1;32m---> 88\u001b[0m predictions \u001b[38;5;241m=\u001b[39m \u001b[43mhybrid_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_close_prices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_close_prices\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 90\u001b[0m \u001b[38;5;66;03m# Calculate accuracy metrics\u001b[39;00m\n\u001b[0;32m 91\u001b[0m mae \u001b[38;5;241m=\u001b[39m mean_absolute_error(test_close_prices, predictions)\n", + "Cell \u001b[1;32mIn[4], line 76\u001b[0m, in \u001b[0;36mhybrid_model\u001b[1;34m(train_data, test_data, time_steps)\u001b[0m\n\u001b[0;32m 73\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(hybrid_prediction[\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m 75\u001b[0m \u001b[38;5;66;03m# Update ARIMA model\u001b[39;00m\n\u001b[1;32m---> 76\u001b[0m arima_results \u001b[38;5;241m=\u001b[39m \u001b[43marima_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mappend\u001b[49m(test_data[i])\u001b[38;5;241m.\u001b[39mfit()\n\u001b[0;32m 78\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39marray(predictions)\n", + "\u001b[1;31mAttributeError\u001b[0m: 'ARIMA' object has no attribute 'append'" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.tsa.arima.model import ARIMA\n", + "from pmdarima import auto_arima\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, LSTM\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", + "\n", + "def prepare_data(data, time_steps):\n", + " X, y = [], []\n", + " for i in range(len(data) - time_steps):\n", + " X.append(data[i:(i + time_steps), 0])\n", + " y.append(data[i + time_steps, 0])\n", + " return np.array(X), np.array(y)\n", + "\n", + "def hybrid_model(train_data, test_data, time_steps=60):\n", + " # Ensure data is numpy array\n", + " if isinstance(train_data, pd.Series):\n", + " train_df = train_data.values\n", + " elif isinstance(train_data, pd.DataFrame):\n", + " train_df = train_data.values\n", + " else:\n", + " train_df = np.array(train_data)\n", + " \n", + " train_df = train_df.reshape(-1, 1)\n", + "\n", + " # ARIMA model\n", + " model_auto = auto_arima(train_df, start_p=1, start_q=1, max_p=3, max_q=3, m=1,\n", + " d=None, seasonal=False, start_P=0, D=0, trace=True,\n", + " error_action='ignore', suppress_warnings=True, stepwise=True)\n", + "\n", + " arima_model = ARIMA(train_df, order=model_auto.order)\n", + " arima_results = arima_model.fit()\n", + "\n", + " # Get ARIMA residuals\n", + " arima_residuals = train_df - arima_results.fittedvalues.reshape(-1, 1)\n", + "\n", + " # Prepare data for LSTM\n", + " scaler = MinMaxScaler()\n", + " residuals_scaled = scaler.fit_transform(arima_residuals)\n", + "\n", + " X, y = prepare_data(residuals_scaled, time_steps)\n", + " X = np.reshape(X, (X.shape[0], X.shape[1], 1))\n", + "\n", + " # LSTM model\n", + " lstm_model = Sequential([\n", + " LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)),\n", + " LSTM(units=50),\n", + " Dense(units=1)\n", + " ])\n", + " lstm_model.compile(optimizer='adam', loss='mean_squared_error')\n", + " lstm_model.fit(X, y, epochs=50, batch_size=32, verbose=0)\n", + "\n", + " # Make predictions for test data\n", + " predictions = []\n", + " test_data = np.array(test_data).reshape(-1, 1)\n", + " combined_data = np.vstack((train_df, test_data))\n", + "\n", + " for i in range(len(test_data)):\n", + " # ARIMA prediction\n", + " arima_forecast = arima_results.forecast(steps=1)\n", + "\n", + " # LSTM prediction\n", + " last_60_days = scaler.transform(combined_data[-(time_steps+1):-1])\n", + " X_test = np.array([last_60_days])\n", + " X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n", + " lstm_prediction = lstm_model.predict(X_test)\n", + " lstm_prediction = scaler.inverse_transform(lstm_prediction)\n", + "\n", + " # Combine predictions\n", + " hybrid_prediction = arima_forecast + lstm_prediction[0][0]\n", + " predictions.append(hybrid_prediction[0])\n", + "\n", + " # Update ARIMA model\n", + " arima_results = arima_model.append(test_data[i]).fit()\n", + "\n", + " return np.array(predictions)\n", + "\n", + "# Load and prepare data\n", + "train_data = pd.read_csv('../Data/SBI Train data.csv')\n", + "test_data = pd.read_csv('../Data/SBI Test data.csv')\n", + "\n", + "train_close_prices = train_data['Close']\n", + "test_close_prices = test_data['Close']\n", + "\n", + "# Make predictions\n", + 
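"# NOTE: this cell fails with AttributeError (see traceback above) -- statsmodels exposes append() on the fitted results object (ARIMAResults.append), not on the ARIMA model itself; a later cell switches to arima_results.append\n", + 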
"predictions = hybrid_model(train_close_prices, test_close_prices)\n", + "\n", + "# Calculate accuracy metrics\n", + "mae = mean_absolute_error(test_close_prices, predictions)\n", + "rmse = np.sqrt(mean_squared_error(test_close_prices, predictions))\n", + "\n", + "print(f\"Mean Absolute Error: ${mae:.2f}\")\n", + "print(f\"Root Mean Squared Error: ${rmse:.2f}\")\n", + "\n", + "# You can also calculate percentage error\n", + "mape = np.mean(np.abs((test_close_prices - predictions) / test_close_prices)) * 100\n", + "print(f\"Mean Absolute Percentage Error: {mape:.2f}%\")\n", + "\n", + "# Plot actual vs predicted prices\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(12,6))\n", + "plt.plot(test_data['Date'], test_close_prices, label='Actual Prices')\n", + "plt.plot(test_data['Date'], predictions, label='Predicted Prices')\n", + "plt.title('Actual vs Predicted Stock Prices')\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Price')\n", + "plt.legend()\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing stepwise search to minimize aic\n", + " ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=30202.237, Time=0.63 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=30233.097, Time=0.05 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=30205.175, Time=0.17 sec\n", + " ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=30203.304, Time=0.24 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] : AIC=30231.827, Time=0.04 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=30196.421, Time=0.93 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=30201.003, Time=0.23 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=30198.399, Time=1.30 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=30198.353, Time=1.94 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=30196.789, Time=1.22 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=30202.291, Time=0.30 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=30199.886, Time=1.15 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] : AIC=30195.213, Time=0.31 sec\n", + " ARIMA(1,1,1)(0,0,0)[0] : AIC=30200.893, Time=0.42 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] : AIC=30199.679, Time=0.10 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] : AIC=30197.192, Time=0.61 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] : AIC=30197.148, Time=1.00 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] : AIC=30203.808, Time=0.05 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] : AIC=30195.578, Time=0.45 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] : AIC=30200.982, Time=0.14 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] : AIC=30198.679, Time=0.54 sec\n", + "\n", + "Best model: ARIMA(2,1,1)(0,0,0)[0] \n", + "Total fit time: 11.839 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\keras\\src\\layers\\rnn\\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. 
When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", + " super().__init__(**kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 134ms/step\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'ARIMA' object has no attribute 'append'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[5], line 88\u001b[0m\n\u001b[0;32m 85\u001b[0m test_close_prices \u001b[38;5;241m=\u001b[39m test_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClose\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 87\u001b[0m \u001b[38;5;66;03m# Make predictions\u001b[39;00m\n\u001b[1;32m---> 88\u001b[0m predictions \u001b[38;5;241m=\u001b[39m \u001b[43mhybrid_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_close_prices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_close_prices\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 90\u001b[0m \u001b[38;5;66;03m# Calculate accuracy metrics\u001b[39;00m\n\u001b[0;32m 91\u001b[0m mae \u001b[38;5;241m=\u001b[39m mean_absolute_error(test_close_prices, predictions)\n", + "Cell \u001b[1;32mIn[5], line 76\u001b[0m, in \u001b[0;36mhybrid_model\u001b[1;34m(train_data, test_data, time_steps)\u001b[0m\n\u001b[0;32m 73\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(hybrid_prediction[\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m 75\u001b[0m \u001b[38;5;66;03m# Update ARIMA model\u001b[39;00m\n\u001b[1;32m---> 76\u001b[0m arima_results \u001b[38;5;241m=\u001b[39m \u001b[43marima_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mappend\u001b[49m(test_data[i])\u001b[38;5;241m.\u001b[39mfit()\n\u001b[0;32m 78\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m np\u001b[38;5;241m.\u001b[39marray(predictions)\n", + "\u001b[1;31mAttributeError\u001b[0m: 'ARIMA' object has no attribute 'append'" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.tsa.arima.model import ARIMA\n", + "from pmdarima import auto_arima\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, LSTM\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", + "\n", + "def prepare_data(data, time_steps):\n", + " X, y = [], []\n", + " for i in range(len(data) - time_steps):\n", + " X.append(data[i:(i + time_steps), 0])\n", + " y.append(data[i + time_steps, 0])\n", + " return np.array(X), np.array(y)\n", + "\n", + "def hybrid_model(train_data, test_data, time_steps=60):\n", + " # Ensure data is numpy array\n", + " if isinstance(train_data, pd.Series):\n", + " train_df = train_data.values\n", + " elif isinstance(train_data, pd.DataFrame):\n", + " train_df = train_data.values\n", + " else:\n", + " train_df = np.array(train_data)\n", + " \n", + " train_df = train_df.reshape(-1, 1)\n", + "\n", + " # ARIMA model\n", + " model_auto = auto_arima(train_df, start_p=1, start_q=1, max_p=3, max_q=3, m=1,\n", + " d=None, seasonal=False, start_P=0, D=0, trace=True,\n", + " error_action='ignore', suppress_warnings=True, stepwise=True)\n", + "\n", + " arima_model = ARIMA(train_df, order=model_auto.order)\n", + " arima_results = arima_model.fit()\n", + "\n", + " 
# Get ARIMA residuals\n", + " arima_residuals = train_df - arima_results.fittedvalues.reshape(-1, 1)\n", + "\n", + " # Prepare data for LSTM\n", + " scaler = MinMaxScaler()\n", + " residuals_scaled = scaler.fit_transform(arima_residuals)\n", + "\n", + " X, y = prepare_data(residuals_scaled, time_steps)\n", + " X = np.reshape(X, (X.shape[0], X.shape[1], 1))\n", + "\n", + " # LSTM model\n", + " lstm_model = Sequential([\n", + " LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)),\n", + " LSTM(units=50),\n", + " Dense(units=1)\n", + " ])\n", + " lstm_model.compile(optimizer='adam', loss='mean_squared_error')\n", + " lstm_model.fit(X, y, epochs=50, batch_size=32, verbose=0)\n", + "\n", + " # Make predictions for test data\n", + " predictions = []\n", + " test_data = np.array(test_data).reshape(-1, 1)\n", + " combined_data = np.vstack((train_df, test_data))\n", + "\n", + " for i in range(len(test_data)):\n", + " # ARIMA prediction\n", + " arima_forecast = arima_results.forecast(steps=1)\n", + "\n", + " # LSTM prediction\n", + " last_60_days = scaler.transform(combined_data[-(time_steps+1):-1])\n", + " X_test = np.array([last_60_days])\n", + " X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n", + " lstm_prediction = lstm_model.predict(X_test)\n", + " lstm_prediction = scaler.inverse_transform(lstm_prediction)\n", + "\n", + " # Combine predictions\n", + " hybrid_prediction = arima_forecast + lstm_prediction[0][0]\n", + " predictions.append(hybrid_prediction[0])\n", + "\n", + " # Update ARIMA model\n", + " arima_results = arima_model.append(test_data[i]).fit()\n", + "\n", + " return np.array(predictions)\n", + "\n", + "# Load and prepare data\n", + "train_data = pd.read_csv('../Data/SBI Train data.csv')\n", + "test_data = pd.read_csv('../Data/SBI Test data.csv')\n", + "\n", + "train_close_prices = train_data['Close']\n", + "test_close_prices = test_data['Close']\n", + "\n", + "# Make predictions\n", + "predictions = hybrid_model(train_close_prices, test_close_prices)\n", + "\n", + "# Calculate accuracy metrics\n", + "mae = mean_absolute_error(test_close_prices, predictions)\n", + "rmse = np.sqrt(mean_squared_error(test_close_prices, predictions))\n", + "\n", + "print(f\"Mean Absolute Error: ${mae:.2f}\")\n", + "print(f\"Root Mean Squared Error: ${rmse:.2f}\")\n", + "\n", + "# You can also calculate percentage error\n", + "mape = np.mean(np.abs((test_close_prices - predictions) / test_close_prices)) * 100\n", + "print(f\"Mean Absolute Percentage Error: {mape:.2f}%\")\n", + "\n", + "# Plot actual vs predicted prices\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(12,6))\n", + "plt.plot(test_data['Date'], test_close_prices, label='Actual Prices')\n", + "plt.plot(test_data['Date'], predictions, label='Predicted Prices')\n", + "plt.title('Actual vs Predicted Stock Prices')\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Price')\n", + "plt.legend()\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performing stepwise search to minimize aic\n", + " ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=30202.237, Time=0.59 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=30233.097, Time=0.06 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=30205.175, Time=0.16 sec\n", + " ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=30203.304, Time=0.25 sec\n", + " ARIMA(0,1,0)(0,0,0)[0] : 
AIC=30231.827, Time=0.04 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=30196.421, Time=0.85 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=30201.003, Time=0.20 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=30198.399, Time=1.28 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=30198.353, Time=2.09 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=30196.789, Time=1.29 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=30202.291, Time=0.33 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=30199.886, Time=1.23 sec\n", + " ARIMA(2,1,1)(0,0,0)[0] : AIC=30195.213, Time=0.30 sec\n", + " ARIMA(1,1,1)(0,0,0)[0] : AIC=30200.893, Time=0.38 sec\n", + " ARIMA(2,1,0)(0,0,0)[0] : AIC=30199.679, Time=0.09 sec\n", + " ARIMA(3,1,1)(0,0,0)[0] : AIC=30197.192, Time=0.57 sec\n", + " ARIMA(2,1,2)(0,0,0)[0] : AIC=30197.148, Time=0.98 sec\n", + " ARIMA(1,1,0)(0,0,0)[0] : AIC=30203.808, Time=0.06 sec\n", + " ARIMA(1,1,2)(0,0,0)[0] : AIC=30195.578, Time=0.42 sec\n", + " ARIMA(3,1,0)(0,0,0)[0] : AIC=30200.982, Time=0.14 sec\n", + " ARIMA(3,1,2)(0,0,0)[0] : AIC=30198.679, Time=0.50 sec\n", + "\n", + "Best model: ARIMA(2,1,1)(0,0,0)[0] \n", + "Total fit time: 11.823 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\agraw\\AppData\\Roaming\\Python\\Python311\\site-packages\\keras\\src\\layers\\rnn\\rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", + " super().__init__(**kwargs)\n", + "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'saved_model\\\\lstm_model.h5\\\\arima_model.pkl'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 122\u001b[0m\n\u001b[0;32m 119\u001b[0m save_model(arima_results, lstm_model, scaler, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msaved_model\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 121\u001b[0m \u001b[38;5;66;03m# Later, load the model and make predictions\u001b[39;00m\n\u001b[1;32m--> 122\u001b[0m loaded_arima, loaded_lstm, loaded_scaler \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msaved_model\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 123\u001b[0m predictions \u001b[38;5;241m=\u001b[39m make_predictions(loaded_arima, loaded_lstm, loaded_scaler, test_close_prices)\n\u001b[0;32m 125\u001b[0m \u001b[38;5;66;03m# Calculate accuracy metrics\u001b[39;00m\n", + "Cell \u001b[1;32mIn[6], line 77\u001b[0m, in \u001b[0;36mload_model\u001b[1;34m(folder_path)\u001b[0m\n\u001b[0;32m 74\u001b[0m arima_results \u001b[38;5;241m=\u001b[39m joblib\u001b[38;5;241m.\u001b[39mload(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(folder_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124marima_model.pkl\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 76\u001b[0m \u001b[38;5;66;03m# Load LSTM model\u001b[39;00m\n\u001b[1;32m---> 77\u001b[0m lstm_model \u001b[38;5;241m=\u001b[39m \u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfolder_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mlstm_model.h5\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 79\u001b[0m \u001b[38;5;66;03m# Load scaler\u001b[39;00m\n\u001b[0;32m 80\u001b[0m scaler \u001b[38;5;241m=\u001b[39m joblib\u001b[38;5;241m.\u001b[39mload(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(folder_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mscaler.pkl\u001b[39m\u001b[38;5;124m'\u001b[39m))\n", + "Cell \u001b[1;32mIn[6], line 74\u001b[0m, in \u001b[0;36mload_model\u001b[1;34m(folder_path)\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_model\u001b[39m(folder_path):\n\u001b[0;32m 73\u001b[0m \u001b[38;5;66;03m# Load ARIMA model\u001b[39;00m\n\u001b[1;32m---> 74\u001b[0m arima_results \u001b[38;5;241m=\u001b[39m \u001b[43mjoblib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfolder_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43marima_model.pkl\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 76\u001b[0m \u001b[38;5;66;03m# Load LSTM model\u001b[39;00m\n\u001b[0;32m 77\u001b[0m lstm_model \u001b[38;5;241m=\u001b[39m 
load_model(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(folder_path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlstm_model.h5\u001b[39m\u001b[38;5;124m'\u001b[39m))\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\joblib\\numpy_pickle.py:579\u001b[0m, in \u001b[0;36mload\u001b[1;34m(filename, mmap_mode)\u001b[0m\n\u001b[0;32m 577\u001b[0m obj \u001b[38;5;241m=\u001b[39m _unpickle(fobj)\n\u001b[0;32m 578\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 579\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m 580\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _read_fileobject(f, filename, mmap_mode) \u001b[38;5;28;01mas\u001b[39;00m fobj:\n\u001b[0;32m 581\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(fobj, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m 582\u001b[0m \u001b[38;5;66;03m# if the returned file object is a string, this means we\u001b[39;00m\n\u001b[0;32m 583\u001b[0m \u001b[38;5;66;03m# try to load a pickle file generated with an version of\u001b[39;00m\n\u001b[0;32m 584\u001b[0m \u001b[38;5;66;03m# Joblib so we load it with joblib compatibility function.\u001b[39;00m\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'saved_model\\\\lstm_model.h5\\\\arima_model.pkl'" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from statsmodels.tsa.arima.model import ARIMA\n", + "from pmdarima import auto_arima\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from tensorflow.keras.models import Sequential, load_model\n", + "from tensorflow.keras.layers import Dense, LSTM\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", + "import joblib\n", + "import os\n", + "\n", + "def prepare_data(data, time_steps):\n", + " X, y = [], []\n", + " for i in range(len(data) - time_steps):\n", + " X.append(data[i:(i + time_steps), 0])\n", + " y.append(data[i + time_steps, 0])\n", + " return np.array(X), np.array(y)\n", + "\n", + "def create_hybrid_model(train_data, time_steps=60):\n", + " # Ensure data is numpy array\n", + " if isinstance(train_data, pd.Series):\n", + " train_df = train_data.values\n", + " elif isinstance(train_data, pd.DataFrame):\n", + " train_df = train_data.values\n", + " else:\n", + " train_df = np.array(train_data)\n", + " \n", + " train_df = train_df.reshape(-1, 1)\n", + "\n", + " # ARIMA model\n", + " model_auto = auto_arima(train_df, start_p=1, start_q=1, max_p=3, max_q=3, m=1,\n", + " d=None, seasonal=False, start_P=0, D=0, trace=True,\n", + " error_action='ignore', suppress_warnings=True, stepwise=True)\n", + "\n", + " arima_model = ARIMA(train_df, order=model_auto.order)\n", + " arima_results = arima_model.fit()\n", + "\n", + " # Get ARIMA residuals\n", + " arima_residuals = train_df - arima_results.fittedvalues.reshape(-1, 1)\n", + "\n", + " # Prepare data for LSTM\n", + " scaler = MinMaxScaler()\n", + " residuals_scaled = scaler.fit_transform(arima_residuals)\n", + "\n", + " X, y = prepare_data(residuals_scaled, time_steps)\n", + " X = np.reshape(X, (X.shape[0], X.shape[1], 1))\n", + "\n", + " # LSTM model\n", + " lstm_model = Sequential([\n", + " LSTM(units=50, return_sequences=True, 
input_shape=(X.shape[1], 1)),\n", + " LSTM(units=50),\n", + " Dense(units=1)\n", + " ])\n", + " lstm_model.compile(optimizer='adam', loss='mean_squared_error')\n", + " lstm_model.fit(X, y, epochs=50, batch_size=32, verbose=0)\n", + "\n", + " return arima_results, lstm_model, scaler\n", + "\n", + "def save_model(arima_results, lstm_model, scaler, folder_path):\n", + " if not os.path.exists(folder_path):\n", + " os.makedirs(folder_path)\n", + " \n", + " # Save ARIMA model\n", + " joblib.dump(arima_results, os.path.join(folder_path, 'arima_model.pkl'))\n", + " \n", + " # Save LSTM model\n", + " lstm_model.save(os.path.join(folder_path, 'lstm_model.h5'))\n", + " \n", + " # Save scaler\n", + " joblib.dump(scaler, os.path.join(folder_path, 'scaler.pkl'))\n", + "\n", + "def load_model(folder_path):\n", + " # Load ARIMA model\n", + " arima_results = joblib.load(os.path.join(folder_path, 'arima_model.pkl'))\n", + " \n", + " # Load LSTM model\n", + " lstm_model = load_model(os.path.join(folder_path, 'lstm_model.h5'))\n", + " \n", + " # Load scaler\n", + " scaler = joblib.load(os.path.join(folder_path, 'scaler.pkl'))\n", + " \n", + " return arima_results, lstm_model, scaler\n", + "\n", + "def make_predictions(arima_results, lstm_model, scaler, test_data, time_steps=60):\n", + " predictions = []\n", + " test_data = np.array(test_data).reshape(-1, 1)\n", + "\n", + " for i in range(len(test_data)):\n", + " # ARIMA prediction\n", + " arima_forecast = arima_results.forecast(steps=1)\n", + "\n", + " # LSTM prediction\n", + " last_60_days = scaler.transform(test_data[i:i+time_steps])\n", + " X_test = np.array([last_60_days])\n", + " X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n", + " lstm_prediction = lstm_model.predict(X_test)\n", + " lstm_prediction = scaler.inverse_transform(lstm_prediction)\n", + "\n", + " # Combine predictions\n", + " hybrid_prediction = arima_forecast + lstm_prediction[0][0]\n", + " predictions.append(hybrid_prediction[0])\n", + "\n", + " # Update ARIMA model\n", + " arima_results = arima_results.append(test_data[i])\n", + "\n", + " return np.array(predictions)\n", + "\n", + "# Example usage\n", + "if __name__ == \"__main__\":\n", + " # Load data\n", + " train_data = pd.read_csv('../Data/SBI Train data.csv')\n", + " test_data = pd.read_csv('../Data/SBI Test data.csv')\n", + "\n", + " train_close_prices = train_data['Close']\n", + " test_close_prices = test_data['Close']\n", + "\n", + " # Create and save the model\n", + " arima_results, lstm_model, scaler = create_hybrid_model(train_close_prices)\n", + " save_model(arima_results, lstm_model, scaler, 'saved_model')\n", + "\n", + " # Later, load the model and make predictions\n", + " loaded_arima, loaded_lstm, loaded_scaler = load_model('saved_model')\n", + " predictions = make_predictions(loaded_arima, loaded_lstm, loaded_scaler, test_close_prices)\n", + "\n", + " # Calculate accuracy metrics\n", + " mae = mean_absolute_error(test_close_prices, predictions)\n", + " rmse = np.sqrt(mean_squared_error(test_close_prices, predictions))\n", + " mape = np.mean(np.abs((test_close_prices - predictions) / test_close_prices)) * 100\n", + "\n", + " print(f\"Mean Absolute Error: ${mae:.2f}\")\n", + " print(f\"Root Mean Squared Error: ${rmse:.2f}\")\n", + " print(f\"Mean Absolute Percentage Error: {mape:.2f}%\")\n", + "\n", + " # Plot results\n", + " import matplotlib.pyplot as plt\n", + "\n", + " plt.figure(figsize=(12,6))\n", + " plt.plot(test_data['Date'], test_close_prices, label='Actual Prices')\n", + " 
plt.plot(test_data['Date'], predictions, label='Predicted Prices')\n", + " plt.title('Actual vs Predicted Stock Prices')\n", + " plt.xlabel('Date')\n", + " plt.ylabel('Price')\n", + " plt.legend()\n", + " plt.xticks(rotation=45)\n", + " plt.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:5 out of the last 5 calls to .one_step_on_data_distributed at 0x000002DD2831BCE0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:5 out of the last 5 calls to .one_step_on_data_distributed at 0x000002DD2831BCE0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 146ms/step\n" + ] + }, + { + "ename": "UnboundLocalError", + "evalue": "cannot access local variable 'arima_model' where it is not associated with a value", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[9], line 107\u001b[0m\n\u001b[0;32m 104\u001b[0m test_close_prices \u001b[38;5;241m=\u001b[39m test_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mClose\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 106\u001b[0m \u001b[38;5;66;03m# Make predictions\u001b[39;00m\n\u001b[1;32m--> 107\u001b[0m predictions \u001b[38;5;241m=\u001b[39m \u001b[43mhybrid_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_close_prices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_close_prices\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 109\u001b[0m \u001b[38;5;66;03m# Calculate accuracy metrics\u001b[39;00m\n\u001b[0;32m 110\u001b[0m mae \u001b[38;5;241m=\u001b[39m mean_absolute_error(test_close_prices, predictions)\n", + "Cell \u001b[1;32mIn[9], line 95\u001b[0m, in \u001b[0;36mhybrid_model\u001b[1;34m(train_data, test_data, time_steps, model_dir)\u001b[0m\n\u001b[0;32m 92\u001b[0m predictions\u001b[38;5;241m.\u001b[39mappend(hybrid_prediction[\u001b[38;5;241m0\u001b[39m])\n\u001b[0;32m 94\u001b[0m \u001b[38;5;66;03m# Update ARIMA model with test data\u001b[39;00m\n\u001b[1;32m---> 95\u001b[0m 
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from statsmodels.tsa.arima.model import ARIMA\n",
+    "from pmdarima import auto_arima\n",
+    "from sklearn.preprocessing import MinMaxScaler\n",
+    "from tensorflow.keras.models import Sequential, load_model\n",
+    "from tensorflow.keras.layers import Dense, LSTM\n",
+    "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
+    "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
+    "import joblib  # For saving the ARIMA model\n",
+    "import os\n",
+    "\n",
+    "# Data preparation\n",
+    "def prepare_data(data, time_steps):\n",
+    "    X, y = [], []\n",
+    "    for i in range(len(data) - time_steps):\n",
+    "        X.append(data[i:(i + time_steps), 0])\n",
+    "        y.append(data[i + time_steps, 0])\n",
+    "    return np.array(X), np.array(y)\n",
+    "\n",
+    "# Hybrid ARIMA-LSTM Model\n",
+    "def hybrid_model(train_data, test_data, time_steps=60, model_dir='./model'):\n",
+    "    # Ensure data is a numpy column vector (renamed from train_df: it is an\n",
+    "    # array, not a DataFrame)\n",
+    "    train_arr = np.array(train_data).reshape(-1, 1)\n",
+    "\n",
+    "    # Create a directory to save models if it doesn't exist\n",
+    "    if not os.path.exists(model_dir):\n",
+    "        os.makedirs(model_dir)\n",
+    "\n",
+    "    # ARIMA Model\n",
+    "    arima_model_path = os.path.join(model_dir, 'arima_model.pkl')\n",
+    "    if not os.path.exists(arima_model_path):\n",
+    "        model_auto = auto_arima(train_arr, start_p=1, start_q=1, max_p=3, max_q=3, m=1,\n",
+    "                                d=None, seasonal=False, start_P=0, D=0, trace=True,\n",
+    "                                error_action='ignore', suppress_warnings=True, stepwise=True)\n",
+    "\n",
+    "        arima_model = ARIMA(train_arr, order=model_auto.order)\n",
+    "        arima_results = arima_model.fit()\n",
+    "        # Save fitted ARIMA results\n",
+    "        joblib.dump(arima_results, arima_model_path)\n",
+    "    else:\n",
+    "        arima_results = joblib.load(arima_model_path)\n",
+    "\n",
+    "    # Get ARIMA residuals\n",
+    "    arima_residuals = train_arr - arima_results.fittedvalues.reshape(-1, 1)\n",
+    "\n",
+    "    # Prepare data for LSTM\n",
+    "    scaler = MinMaxScaler()\n",
+    "    residuals_scaled = scaler.fit_transform(arima_residuals)\n",
+    "\n",
+    "    X, y = prepare_data(residuals_scaled, time_steps)\n",
+    "    X = np.reshape(X, (X.shape[0], X.shape[1], 1))\n",
+    "\n",
+    "    # LSTM Model\n",
+    "    lstm_model_path = os.path.join(model_dir, 'lstm_model.keras')  # native Keras format\n",
+    "    if not os.path.exists(lstm_model_path):\n",
+    "        lstm_model = Sequential([\n",
+    "            LSTM(units=50, return_sequences=True, input_shape=(X.shape[1], 1)),\n",
+    "            LSTM(units=50),\n",
+    "            Dense(units=1)\n",
+    "        ])\n",
+    "        lstm_model.compile(optimizer='adam', loss='mean_squared_error')\n",
+    "\n",
+    "        # Early stopping and model checkpoint\n",
+    "        early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)\n",
+    "        model_checkpoint = ModelCheckpoint(lstm_model_path, save_best_only=True, monitor='loss')\n",
+    "\n",
+    "        # Train LSTM model\n",
+    "        lstm_model.fit(X, y, epochs=50, batch_size=32, verbose=1, callbacks=[early_stopping, model_checkpoint])\n",
+    "    else:\n",
+    "        lstm_model = load_model(lstm_model_path)\n",
+    "\n",
+    "    # Make predictions for test data\n",
+    "    predictions = []\n",
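+    "    # Walk-forward loop (sketch of the scheme): take the one-step ARIMA\n",
+    "    # forecast, add the LSTM's residual estimate, then extend the ARIMA sample\n",
+    "    # with the observed value via ARIMAResults.append(..., refit=False), which\n",
+    "    # updates the filter state without re-estimating parameters. Note the LSTM\n",
+    "    # window below slices the raw price series; feeding residual windows would\n",
+    "    # match how the network was trained more closely.\n",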
" test_data = np.array(test_data).reshape(-1, 1)\n", + " combined_data = np.vstack((train_df, test_data))\n", + "\n", + " for i in range(len(test_data)):\n", + " # ARIMA prediction\n", + " arima_forecast = arima_results.forecast(steps=1)\n", + "\n", + " # LSTM prediction\n", + " last_60_days = scaler.transform(combined_data[-(time_steps+1):-1])\n", + " X_test = np.array([last_60_days])\n", + " X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))\n", + " lstm_prediction = lstm_model.predict(X_test)\n", + " lstm_prediction = scaler.inverse_transform(lstm_prediction)\n", + "\n", + " # Combine predictions\n", + " hybrid_prediction = arima_forecast + lstm_prediction[0][0]\n", + " predictions.append(hybrid_prediction[0])\n", + "\n", + " # Update ARIMA model with test data\n", + " arima_results = arima_model.append(test_data[i]).fit()\n", + "\n", + " return np.array(predictions)\n", + "\n", + "# Load and prepare data\n", + "train_data = pd.read_csv('../Data/SBI Train data.csv')\n", + "test_data = pd.read_csv('../Data/SBI Test data.csv')\n", + "\n", + "train_close_prices = train_data['Close']\n", + "test_close_prices = test_data['Close']\n", + "\n", + "# Make predictions\n", + "predictions = hybrid_model(train_close_prices, test_close_prices)\n", + "\n", + "# Calculate accuracy metrics\n", + "mae = mean_absolute_error(test_close_prices, predictions)\n", + "rmse = np.sqrt(mean_squared_error(test_close_prices, predictions))\n", + "\n", + "print(f\"Mean Absolute Error: ${mae:.2f}\")\n", + "print(f\"Root Mean Squared Error: ${rmse:.2f}\")\n", + "\n", + "# Calculate Mean Absolute Percentage Error (MAPE)\n", + "mape = np.mean(np.abs((test_close_prices - predictions) / test_close_prices)) * 100\n", + "print(f\"Mean Absolute Percentage Error: {mape:.2f}%\")\n", + "\n", + "# Plot actual vs predicted prices\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(12,6))\n", + "plt.plot(test_data['Date'], test_close_prices, label='Actual Prices')\n", + "plt.plot(test_data['Date'], predictions, label='Predicted Prices')\n", + "plt.title('Actual vs Predicted Stock Prices')\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Price')\n", + "plt.legend()\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..30db39a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pandas==2.1.1 +numpy==1.26.0 +scikit-learn==1.3.0 +statsmodels==0.14.0 +pickle-mixin==1.0.2 \ No newline at end of file