set data index frequency and non-invertible MA features

rohitinu6 · Oct 22, 2024 · 2829e9f · 2829e9f
1 parent 52bacc2
commit 2829e9f
Show file tree

Hide file tree

Showing 2 changed files with 405 additions and 42 deletions.
diff --git a/ARIMA/.ipynb_checkpoints/ARIMA_V2-checkpoint.ipynb b/ARIMA/.ipynb_checkpoints/ARIMA_V2-checkpoint.ipynb
@@ -0,0 +1,350 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initial code\n",
+    "for reference purposes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pandas as pd\n",
+    "# import numpy as np\n",
+    "# from sklearn.metrics import mean_squared_error\n",
+    "# from sklearn.preprocessing import StandardScaler\n",
+    "# from statsmodels.tsa.statespace.sarimax import SARIMAX\n",
+    "# from statsmodels.tools.sm_exceptions import ConvergenceWarning\n",
+    "# import warnings\n",
+    "\n",
+    "# # Ignore convergence warnings\n",
+    "# warnings.simplefilter(\"ignore\")\n",
+    "\n",
+    "# # Load dataset with parsed dates\n",
+    "# data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)\n",
+    "\n",
+    "# # Set the index to the Date column\n",
+    "# data.set_index('Date', inplace=True)\n",
+    "# # data = data.asfreq('D')\n",
+    "# # Feature Engineering: Add day of week and month\n",
+    "# data['day_of_week'] = data.index.dayofweek\n",
+    "# data['month'] = data.index.month\n",
+    "\n",
+    "# # Add lagged value of the Close price and moving averages\n",
+    "# data['lagged_close'] = data['Close'].shift(1)  \n",
+    "# data['moving_avg_3'] = data['Close'].rolling(window=3).mean()\n",
+    "# data['moving_avg_7'] = data['Close'].rolling(window=7).mean()  # New: 7-day moving average for long-term trend\n",
+    "\n",
+    "# # Add Volume as a feature (scaling might help)\n",
+    "# data['volume'] = data['Volume']\n",
+    "\n",
+    "# # Drop rows with NaN values\n",
+    "# data.dropna(inplace=True)\n",
+    "\n",
+    "# # Standardize the features (important for scaling)\n",
+    "# scaler = StandardScaler()\n",
+    "# exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']\n",
+    "# data[exog_features] = scaler.fit_transform(data[exog_features])\n",
+    "\n",
+    "# # Split the data into training and testing sets\n",
+    "# train_size = int(len(data) * 0.8)\n",
+    "# train, test = data.iloc[:train_size], data.iloc[train_size:]\n",
+    "\n",
+    "# # Tune SARIMAX hyperparameters (ARIMA order (p, d, q))\n",
+    "# order = (2, 1, 2)  # Consider using AIC/BIC for finding optimal order\n",
+    "# seasonal_order = (1, 1, 1, 12)  # Adding seasonality with monthly frequency\n",
+    "\n",
+    "# # Fit the SARIMAX model\n",
+    "# try:\n",
+    "#     model = SARIMAX(train['Close'], \n",
+    "#                     exog=train[exog_features],\n",
+    "#                     order=order,\n",
+    "#                     seasonal_order=seasonal_order)\n",
+    "#     model_fit = model.fit(disp=False)\n",
+    "# except ConvergenceWarning as e:\n",
+    "#     print(f\"Convergence warning: {e}\")\n",
+    "# except Exception as e:\n",
+    "#     print(f\"Error: {e}\")\n",
+    "\n",
+    "# # Forecasting\n",
+    "# forecast = model_fit.forecast(steps=len(test), exog=test[exog_features])\n",
+    "\n",
+    "# # Calculate RMSE for forecast\n",
+    "# rmse_arimax = np.sqrt(mean_squared_error(test['Close'], forecast))\n",
+    "# print(f\"Improved ARIMAX Model RMSE: {rmse_arimax}\")\n",
+    "\n",
+    "# test_prices = [i for i in test['Close']]\n",
+    "# # Check residuals diagnostics (optional)\n",
+    "# residuals = test_prices - forecast\n",
+    "# print(\"Mean of residuals:\", residuals.mean())\n",
+    "# print(\"Standard deviation of residuals:\", residuals.std())\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Importing necessary libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: kaggle in d:\\anaconda\\lib\\site-packages (1.6.17)\n",
+      "Requirement already satisfied: six>=1.10 in d:\\anaconda\\lib\\site-packages (from kaggle) (1.16.0)\n",
+      "Requirement already satisfied: certifi>=2023.7.22 in d:\\anaconda\\lib\\site-packages (from kaggle) (2024.8.30)\n",
+      "Requirement already satisfied: python-dateutil in d:\\anaconda\\lib\\site-packages (from kaggle) (2.9.0.post0)\n",
+      "Requirement already satisfied: requests in d:\\anaconda\\lib\\site-packages (from kaggle) (2.32.2)\n",
+      "Requirement already satisfied: tqdm in d:\\anaconda\\lib\\site-packages (from kaggle) (4.66.4)\n",
+      "Requirement already satisfied: python-slugify in d:\\anaconda\\lib\\site-packages (from kaggle) (5.0.2)\n",
+      "Requirement already satisfied: urllib3 in d:\\anaconda\\lib\\site-packages (from kaggle) (2.2.2)\n",
+      "Requirement already satisfied: bleach in d:\\anaconda\\lib\\site-packages (from kaggle) (4.1.0)\n",
+      "Requirement already satisfied: packaging in d:\\anaconda\\lib\\site-packages (from bleach->kaggle) (23.2)\n",
+      "Requirement already satisfied: webencodings in d:\\anaconda\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n",
+      "Requirement already satisfied: text-unidecode>=1.3 in d:\\anaconda\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in d:\\anaconda\\lib\\site-packages (from requests->kaggle) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in d:\\anaconda\\lib\\site-packages (from requests->kaggle) (3.7)\n",
+      "Requirement already satisfied: colorama in c:\\users\\shristi\\appdata\\roaming\\python\\python312\\site-packages (from tqdm->kaggle) (0.4.6)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install kaggle\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import pickle\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from statsmodels.tsa.statespace.sarimax import SARIMAX\n",
+    "from statsmodels.tools.sm_exceptions import ConvergenceWarning\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Adding features to the dataset for proper time-series analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Ignore convergence warnings\n",
+    "warnings.simplefilter(\"ignore\", ConvergenceWarning)\n",
+    "\n",
+    "# Load training dataset with parsed dates\n",
+    "train_data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)\n",
+    "\n",
+    "# Index the frame by the parsed Date column, the same way the test data is indexed.\n",
+    "# Do NOT build the index from train_data.index: right after read_csv that is a RangeIndex,\n",
+    "# and pd.DatetimeIndex would read the row numbers as nanosecond offsets from 1970-01-01,\n",
+    "# putting every row in period 1970-01 and making day_of_week / month constant.\n",
+    "# NOTE(review): a DatetimeIndex of trading days carries no frequency, so statsmodels may warn\n",
+    "# that frequency information is missing when the model is fitted - confirm this is acceptable.\n",
+    "train_data.set_index('Date', inplace=True)\n",
+    "\n",
+    "# Feature Engineering: Add day of week and month\n",
+    "train_data['day_of_week'] = train_data.index.dayofweek\n",
+    "train_data['month'] = train_data.index.month\n",
+    "\n",
+    "# Add lagged value of the Close price and moving averages\n",
+    "train_data['lagged_close'] = train_data['Close'].shift(1)\n",
+    "train_data['moving_avg_3'] = train_data['Close'].rolling(window=3).mean()\n",
+    "train_data['moving_avg_7'] = train_data['Close'].rolling(window=7).mean()\n",
+    "\n",
+    "# Add Volume as a feature (scaling might help)\n",
+    "train_data['volume'] = train_data['Volume']\n",
+    "\n",
+    "# Drop rows with NaN values after applying the rolling window and lagging\n",
+    "train_data.dropna(inplace=True)\n",
+    "\n",
+    "# Standardize the features; this fitted scaler is reused later for the test data\n",
+    "scaler = StandardScaler()\n",
+    "exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']\n",
+    "train_data[exog_features] = scaler.fit_transform(train_data[exog_features])\n",
+    "\n",
+    "# Split chronologically: first 80% of rows for training, remaining 20% for validation\n",
+    "train_size = int(len(train_data) * 0.8)\n",
+    "train, validation = train_data.iloc[:train_size], train_data.iloc[train_size:]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Training and saving model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model and scaler saved successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# SARIMAX hyperparameters: non-seasonal (p, d, q) and seasonal (P, D, Q, s)\n",
+    "order = (2, 1, 2)\n",
+    "seasonal_order = (1, 1, 1, 12)\n",
+    "\n",
+    "# Fit on the training split with the engineered features as exogenous regressors.\n",
+    "# enforce_invertibility=False lets the optimizer keep non-invertible MA parameters.\n",
+    "model = SARIMAX(\n",
+    "    train['Close'],\n",
+    "    exog=train[exog_features],\n",
+    "    order=order,\n",
+    "    seasonal_order=seasonal_order,\n",
+    "    enforce_invertibility=False,\n",
+    ")\n",
+    "model_fit = model.fit(disp=False)\n",
+    "\n",
+    "# Replace .kaggle in the commands below with the folder the datasets should be downloaded to\n",
+    "for download_cmd in (\n",
+    "    'kaggle datasets download -d shristirwt/sarimax-model -p/.kaggle',\n",
+    "    'kaggle datasets download -d shristirwt/scaler-model -p/.kaggle',\n",
+    "):\n",
+    "    os.system(download_cmd)\n",
+    "\n",
+    "# Pickle the fitted model first, then the scaler, so both can be reloaded later\n",
+    "for artifact, target_path in (\n",
+    "    (model_fit, r'C:\\Users\\SHRISTI\\.kaggle\\sarimax_model.pkl'),\n",
+    "    (scaler, r'C:\\Users\\SHRISTI\\.kaggle\\scaler.pkl'),\n",
+    "):\n",
+    "    with open(target_path, 'wb') as f:\n",
+    "        pickle.dump(artifact, f)\n",
+    "\n",
+    "print(\"Model and scaler saved successfully.\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Loading saved model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Restore the pickled model results and the fitted scaler from disk.\n",
+    "# Caution: pickle.load can execute arbitrary code, so only open files you created yourself.\n",
+    "with open(r'C:\\Users\\SHRISTI\\.kaggle\\sarimax_model.pkl', 'rb') as model_file:\n",
+    "    loaded_model = pickle.load(model_file)\n",
+    "\n",
+    "with open(r'C:\\Users\\SHRISTI\\.kaggle\\scaler.pkl', 'rb') as scaler_file:\n",
+    "    loaded_scaler = pickle.load(scaler_file)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Loading and processing Test data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the held-out test set and index it by the parsed Date column\n",
+    "test_data = pd.read_csv('../Data/SBI Test data.csv', parse_dates=['Date'], dayfirst=True)\n",
+    "test_data.set_index('Date', inplace=True)\n",
+    "\n",
+    "# Calendar features taken from the date index\n",
+    "test_data['day_of_week'] = test_data.index.dayofweek\n",
+    "test_data['month'] = test_data.index.month\n",
+    "\n",
+    "# Price-history features: previous close plus 3- and 7-row moving averages\n",
+    "test_close = test_data['Close']\n",
+    "test_data['lagged_close'] = test_close.shift(1)\n",
+    "test_data['moving_avg_3'] = test_close.rolling(window=3).mean()\n",
+    "test_data['moving_avg_7'] = test_close.rolling(window=7).mean()\n",
+    "\n",
+    "# Volume exposed under the lowercase name used in exog_features\n",
+    "test_data['volume'] = test_data['Volume']\n",
+    "\n",
+    "# The lag and rolling windows leave NaNs in the leading rows; remove them\n",
+    "test_data.dropna(inplace=True)\n",
+    "\n",
+    "# Scale with the loaded scaler (transform only; it is not re-fitted on the test data)\n",
+    "test_data[exog_features] = loaded_scaler.transform(test_data[exog_features])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Predicting share prices using model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test Data RMSE: 4.883649507349637\n",
+      "Mean of residuals: 0.06489726947015648\n",
+      "Standard deviation of residuals: 4.8849520783077764\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Forecast one step per test row, passing the scaled test features as exogenous inputs.\n",
+    "# NOTE(review): forecast() appears to continue from the end of the sample the model was\n",
+    "# fitted on, so predictions are paired with test rows by position - confirm this is intended.\n",
+    "n_steps = len(test_data)\n",
+    "forecast_test = loaded_model.forecast(steps=n_steps, exog=test_data[exog_features])\n",
+    "\n",
+    "# Residuals (actual minus predicted) for a quick diagnostic\n",
+    "test_prices = test_data['Close'].values\n",
+    "residuals_test = test_prices - forecast_test\n",
+    "\n",
+    "# Root-mean-squared error of the forecast against the actual closing prices\n",
+    "rmse_test = np.sqrt(mean_squared_error(test_data['Close'], forecast_test))\n",
+    "\n",
+    "print(f\"Test Data RMSE: {rmse_test}\")\n",
+    "print(\"Mean of residuals:\", residuals_test.mean())\n",
+    "print(\"Standard deviation of residuals:\", residuals_test.std())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}