-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
set data index frequency and non-invertible MA features
- Loading branch information
1 parent
52bacc2
commit 2829e9f
Showing
2 changed files
with
405 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,350 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Initial code\n", | ||
"for reference purposes" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# import pandas as pd\n", | ||
"# import numpy as np\n", | ||
"# from sklearn.metrics import mean_squared_error\n", | ||
"# from sklearn.preprocessing import StandardScaler\n", | ||
"# from statsmodels.tsa.statespace.sarimax import SARIMAX\n", | ||
"# from statsmodels.tools.sm_exceptions import ConvergenceWarning\n", | ||
"# import warnings\n", | ||
"\n", | ||
"# # Ignore convergence warnings\n", | ||
"# warnings.simplefilter(\"ignore\")\n", | ||
"\n", | ||
"# # Load dataset with parsed dates\n", | ||
"# data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)\n", | ||
"\n", | ||
"# # Set the index to the Date column\n", | ||
"# data.set_index('Date', inplace=True)\n", | ||
"# # data = data.asfreq('D')\n", | ||
"# # Feature Engineering: Add day of week and month\n", | ||
"# data['day_of_week'] = data.index.dayofweek\n", | ||
"# data['month'] = data.index.month\n", | ||
"\n", | ||
"# # Add lagged value of the Close price and moving averages\n", | ||
"# data['lagged_close'] = data['Close'].shift(1) \n", | ||
"# data['moving_avg_3'] = data['Close'].rolling(window=3).mean()\n", | ||
"# data['moving_avg_7'] = data['Close'].rolling(window=7).mean() # New: 7-day moving average for long-term trend\n", | ||
"\n", | ||
"# # Add Volume as a feature (scaling might help)\n", | ||
"# data['volume'] = data['Volume']\n", | ||
"\n", | ||
"# # Drop rows with NaN values\n", | ||
"# data.dropna(inplace=True)\n", | ||
"\n", | ||
"# # Standardize the features (important for scaling)\n", | ||
"# scaler = StandardScaler()\n", | ||
"# exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']\n", | ||
"# data[exog_features] = scaler.fit_transform(data[exog_features])\n", | ||
"\n", | ||
"# # Split the data into training and testing sets\n", | ||
"# train_size = int(len(data) * 0.8)\n", | ||
"# train, test = data.iloc[:train_size], data.iloc[train_size:]\n", | ||
"\n", | ||
"# # Tune SARIMAX hyperparameters (ARIMA order (p, d, q))\n", | ||
"# order = (2, 1, 2) # Consider using AIC/BIC for finding optimal order\n", | ||
"# seasonal_order = (1, 1, 1, 12) # Adding seasonality with monthly frequency\n", | ||
"\n", | ||
"# # Fit the SARIMAX model\n", | ||
"# try:\n", | ||
"# model = SARIMAX(train['Close'], \n", | ||
"# exog=train[exog_features],\n", | ||
"# order=order,\n", | ||
"# seasonal_order=seasonal_order)\n", | ||
"# model_fit = model.fit(disp=False)\n", | ||
"# except ConvergenceWarning as e:\n", | ||
"# print(f\"Convergence warning: {e}\")\n", | ||
"# except Exception as e:\n", | ||
"# print(f\"Error: {e}\")\n", | ||
"\n", | ||
"# # Forecasting\n", | ||
"# forecast = model_fit.forecast(steps=len(test), exog=test[exog_features])\n", | ||
"\n", | ||
"# # Calculate RMSE for forecast\n", | ||
"# rmse_arimax = np.sqrt(mean_squared_error(test['Close'], forecast))\n", | ||
"# print(f\"Improved ARIMAX Model RMSE: {rmse_arimax}\")\n", | ||
"\n", | ||
"# test_prices = [i for i in test['Close']]\n", | ||
"# # Check residuals diagnostics (optional)\n", | ||
"# residuals = test_prices - forecast\n", | ||
"# print(\"Mean of residuals:\", residuals.mean())\n", | ||
"# print(\"Standard deviation of residuals:\", residuals.std())\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Importing necessary libraries" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: kaggle in d:\\anaconda\\lib\\site-packages (1.6.17)\n", | ||
"Requirement already satisfied: six>=1.10 in d:\\anaconda\\lib\\site-packages (from kaggle) (1.16.0)\n", | ||
"Requirement already satisfied: certifi>=2023.7.22 in d:\\anaconda\\lib\\site-packages (from kaggle) (2024.8.30)\n", | ||
"Requirement already satisfied: python-dateutil in d:\\anaconda\\lib\\site-packages (from kaggle) (2.9.0.post0)\n", | ||
"Requirement already satisfied: requests in d:\\anaconda\\lib\\site-packages (from kaggle) (2.32.2)\n", | ||
"Requirement already satisfied: tqdm in d:\\anaconda\\lib\\site-packages (from kaggle) (4.66.4)\n", | ||
"Requirement already satisfied: python-slugify in d:\\anaconda\\lib\\site-packages (from kaggle) (5.0.2)\n", | ||
"Requirement already satisfied: urllib3 in d:\\anaconda\\lib\\site-packages (from kaggle) (2.2.2)\n", | ||
"Requirement already satisfied: bleach in d:\\anaconda\\lib\\site-packages (from kaggle) (4.1.0)\n", | ||
"Requirement already satisfied: packaging in d:\\anaconda\\lib\\site-packages (from bleach->kaggle) (23.2)\n", | ||
"Requirement already satisfied: webencodings in d:\\anaconda\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n", | ||
"Requirement already satisfied: text-unidecode>=1.3 in d:\\anaconda\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n", | ||
"Requirement already satisfied: charset-normalizer<4,>=2 in d:\\anaconda\\lib\\site-packages (from requests->kaggle) (2.0.4)\n", | ||
"Requirement already satisfied: idna<4,>=2.5 in d:\\anaconda\\lib\\site-packages (from requests->kaggle) (3.7)\n", | ||
"Requirement already satisfied: colorama in c:\\users\\shristi\\appdata\\roaming\\python\\python312\\site-packages (from tqdm->kaggle) (0.4.6)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!pip install kaggle\n", | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import pickle\n", | ||
"from sklearn.metrics import mean_squared_error\n", | ||
"from sklearn.preprocessing import StandardScaler\n", | ||
"from statsmodels.tsa.statespace.sarimax import SARIMAX\n", | ||
"from statsmodels.tools.sm_exceptions import ConvergenceWarning\n", | ||
"import warnings" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Updating dataset features for proper time-series analysis" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"# Silence only ConvergenceWarning; every other warning category stays visible\n", | ||
"warnings.simplefilter(\"ignore\", ConvergenceWarning)\n", | ||
"\n", | ||
"# Load the training dataset, parsing the Date column day-first\n", | ||
"train_data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)\n", | ||
"\n", | ||
"# Index the rows by their actual dates. read_csv leaves a default integer index, and\n", | ||
"# wrapping those integers in pd.DatetimeIndex reads them as nanoseconds since the epoch,\n", | ||
"# so converting that index to monthly periods placed every row in January 1970 and made\n", | ||
"# day_of_week and month constant columns.\n", | ||
"# NOTE(review): no frequency is attached to the index because the dates presumably have\n", | ||
"# gaps (non-trading days); statsmodels may warn about a date index without frequency -\n", | ||
"# confirm that is acceptable.\n", | ||
"train_data.set_index('Date', inplace=True)\n", | ||
"\n", | ||
"# Calendar features taken from the date index\n", | ||
"train_data['day_of_week'] = train_data.index.dayofweek\n", | ||
"train_data['month'] = train_data.index.month\n", | ||
"\n", | ||
"# Previous row's Close plus 3-row and 7-row rolling means of Close\n", | ||
"# NOTE(review): the rolling windows include the current row's Close, which is also the\n", | ||
"# series the model is fitted on - confirm this is intended.\n", | ||
"train_data['lagged_close'] = train_data['Close'].shift(1)\n", | ||
"train_data['moving_avg_3'] = train_data['Close'].rolling(window=3).mean()\n", | ||
"train_data['moving_avg_7'] = train_data['Close'].rolling(window=7).mean()\n", | ||
"\n", | ||
"# Expose Volume under the lowercase name listed in exog_features\n", | ||
"train_data['volume'] = train_data['Volume']\n", | ||
"\n", | ||
"# shift/rolling leave NaN in the leading rows; drop every row that contains NaN\n", | ||
"train_data.dropna(inplace=True)\n", | ||
"\n", | ||
"# Fit the scaler on the exogenous columns and overwrite them with the scaled values\n", | ||
"scaler = StandardScaler()\n", | ||
"exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']\n", | ||
"train_data[exog_features] = scaler.fit_transform(train_data[exog_features])\n", | ||
"\n", | ||
"# Positional split: first 80% of the rows for fitting, the remainder for validation\n", | ||
"train_size = int(len(train_data) * 0.8)\n", | ||
"train, validation = train_data.iloc[:train_size], train_data.iloc[train_size:]\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Training and saving the model" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Model and scaler saved successfully.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# SARIMAX orders: non-seasonal (p, d, q) and seasonal (P, D, Q, s)\n", | ||
"order = (2, 1, 2)\n", | ||
"seasonal_order = (1, 1, 1, 12)\n", | ||
"\n", | ||
"# Fit Close on the scaled exogenous features; the invertibility constraint on the\n", | ||
"# MA terms is switched off via enforce_invertibility=False\n", | ||
"model = SARIMAX(train['Close'], exog=train[exog_features], order=order, seasonal_order=seasonal_order,enforce_invertibility=False)\n", | ||
"model_fit = model.fit(disp=False)\n", | ||
"\n", | ||
"# Download the two Kaggle datasets with the kaggle CLI.\n", | ||
"# Replace /.kaggle after -p with the folder the datasets should be downloaded to.\n", | ||
"# os.system returns the command's exit status; report a failure instead of ignoring it.\n", | ||
"# NOTE(review): the download target (/.kaggle) differs from the folder the pickles are\n", | ||
"# written to below - confirm which location is intended.\n", | ||
"for dataset in ('shristirwt/sarimax-model', 'shristirwt/scaler-model'):\n", | ||
"    exit_status = os.system(f'kaggle datasets download -d {dataset} -p/.kaggle')\n", | ||
"    if exit_status != 0:\n", | ||
"        print(f'kaggle download of {dataset} failed with exit status {exit_status}')\n", | ||
"\n", | ||
"# Persist the fitted model with pickle\n", | ||
"# NOTE(review): absolute, user-specific path - adjust before running on another machine\n", | ||
"with open(r'C:\\Users\\SHRISTI\\.kaggle\\sarimax_model.pkl', 'wb') as f:\n", | ||
"    pickle.dump(model_fit, f)\n", | ||
"\n", | ||
"# Persist the fitted scaler so the same scaling can be applied to the test data\n", | ||
"with open(r'C:\\Users\\SHRISTI\\.kaggle\\scaler.pkl', 'wb') as f:\n", | ||
"    pickle.dump(scaler, f)\n", | ||
"\n", | ||
"print(\"Model and scaler saved successfully.\")\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Loading saved model" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Locations of the pickled artifacts\n", | ||
"model_path = r'C:\\Users\\SHRISTI\\.kaggle\\sarimax_model.pkl'\n", | ||
"scaler_path = r'C:\\Users\\SHRISTI\\.kaggle\\scaler.pkl'\n", | ||
"\n", | ||
"# pickle.load can execute code stored in the file - only load artifacts you trust\n", | ||
"with open(model_path, 'rb') as model_file:\n", | ||
"    loaded_model = pickle.load(model_file)\n", | ||
"\n", | ||
"with open(scaler_path, 'rb') as scaler_file:\n", | ||
"    loaded_scaler = pickle.load(scaler_file)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Loading and processing Test data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Read the test CSV, index it by Date and derive the same features used for training:\n", | ||
"# calendar fields from the index, the previous Close, 3- and 7-row rolling means of\n", | ||
"# Close, and Volume under its lowercase name.\n", | ||
"test_data = (\n", | ||
"    pd.read_csv('../Data/SBI Test data.csv', parse_dates=['Date'], dayfirst=True)\n", | ||
"    .set_index('Date')\n", | ||
"    .assign(\n", | ||
"        day_of_week=lambda frame: frame.index.dayofweek,\n", | ||
"        month=lambda frame: frame.index.month,\n", | ||
"        lagged_close=lambda frame: frame['Close'].shift(1),\n", | ||
"        moving_avg_3=lambda frame: frame['Close'].rolling(window=3).mean(),\n", | ||
"        moving_avg_7=lambda frame: frame['Close'].rolling(window=7).mean(),\n", | ||
"        volume=lambda frame: frame['Volume'],\n", | ||
"    )\n", | ||
")\n", | ||
"\n", | ||
"# Remove every row that contains NaN (the leading rows after lagging / rolling)\n", | ||
"test_data.dropna(inplace=True)\n", | ||
"\n", | ||
"# Scale the exogenous columns with the scaler loaded from disk (transform only, no refit)\n", | ||
"test_data[exog_features] = loaded_scaler.transform(test_data[exog_features])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Predicting share prices using model" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Test Data RMSE: 4.883649507349637\n", | ||
"Mean of residuals: 0.06489726947015648\n", | ||
"Standard deviation of residuals: 4.8849520783077764\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Observed closing prices of the test set as a plain array\n", | ||
"test_prices = test_data['Close'].values\n", | ||
"\n", | ||
"# Forecast one step per test row with the loaded model, driven by the scaled exogenous columns\n", | ||
"forecast_test = loaded_model.forecast(steps=len(test_data), exog=test_data[exog_features])\n", | ||
"\n", | ||
"# Root mean squared error between observed Close and the forecast\n", | ||
"mse_test = mean_squared_error(test_data['Close'], forecast_test)\n", | ||
"rmse_test = np.sqrt(mse_test)\n", | ||
"print(f\"Test Data RMSE: {rmse_test}\")\n", | ||
"\n", | ||
"# Residual diagnostics: observed minus forecast, then its mean and standard deviation\n", | ||
"residuals_test = test_prices - forecast_test\n", | ||
"residual_mean = residuals_test.mean()\n", | ||
"residual_std = residuals_test.std()\n", | ||
"print(\"Mean of residuals:\", residual_mean)\n", | ||
"print(\"Standard deviation of residuals:\", residual_std)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Oops, something went wrong.