diff --git a/stock_market(complete).ipynb b/stock_market(complete).ipynb new file mode 100644 index 0000000..52417b4 --- /dev/null +++ b/stock_market(complete).ipynb @@ -0,0 +1,2509 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "id": "Rqr4Dq5vWXmV" + }, + "outputs": [], + "source": [ + "# importing modules\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, accuracy_score, precision_score, confusion_matrix, recall_score, f1_score\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "from sklearn.svm import SVR\n", + "from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense,LSTM\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BigUM4vtWZdc" + }, + "outputs": [], + "source": [ + "# Load data\n", + "df = pd.read_csv(\"D:\\Pankaj\\GSSOC\\Stock Market\\SBIN.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": { + "id": "RqOe6KPNWs8Q" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vAcMar5WXELR" + }, + "source": [ + "## Data Analysis and Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": 
"vkDxEavVXHqv", + "outputId": "279f15e7-d2d2-442a-c162-e7db9780f2c7" + }, + "outputs": [], + "source": [ + "# Display the first 5 rows of the dataframe\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vx3DYtfdXJTU", + "outputId": "18c93f2d-5232-4e80-dfcf-e7d748901c37" + }, + "outputs": [], + "source": [ + "# Display information about the dataframe\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "SXdvitYIXKKL", + "outputId": "1a2c42d0-4f6e-4180-8ffe-7a40ef4ddabf" + }, + "outputs": [], + "source": [ + "# Display summary statistics of the dataframe\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 303 + }, + "id": "8rddiK_nXLNi", + "outputId": "4b65dd80-db84-45c8-ef16-2133bc25c68b" + }, + "outputs": [], + "source": [ + "# Display the number of missing values in each column\n", + "df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I1khhoMECh4y" + }, + "source": [ + "### Detailed" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "id": "HpS7l8oSCf6y" + }, + "outputs": [], + "source": [ + "# Function to create scatter plots\n", + "def create_scatter_plot(x_data, y_data, size_data=None, title=None, x_label=None, y_label=None, figsize=(15,7)):\n", + " plt.figure(figsize=figsize)\n", + " sns.scatterplot(x=x_data, y=y_data, size=size_data)\n", + " if title:\n", + " plt.title(title)\n", + " if x_label:\n", + " plt.xlabel(x_label)\n", + " if y_label:\n", + " plt.ylabel(y_label)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "id": "oVEjj8ZMCnDx" + }, + "outputs": [], + "source": [ + "# Function to create 
line plots\n", + "def create_line_plot(x_data, y_data, title=None, x_label=None, y_label=None, figsize=(15,7)):\n", + " plt.figure(figsize=figsize)\n", + " sns.lineplot(x=x_data, y=y_data)\n", + " if title:\n", + " plt.title(title)\n", + " if x_label:\n", + " plt.xlabel(x_label)\n", + " if y_label:\n", + " plt.ylabel(y_label)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Date' and 'Volume'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 769 + }, + "id": "-n8vOGhdCmer", + "outputId": "ea5057f1-9493-43de-ec65-7a017134d171" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Date' and 'Volume'\n", + "sns.scatterplot(x = df['Date'], y = df['Volume'])\n", + "sns.kdeplot(x = df['Date'], y = df['Volume'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FYvz9YaZDeDp" + }, + "source": [ + "Early 1990s spike: There is a high trading volume around the mid-1990s, peaking at over 4 × 10^8 (400 million trades).\n", + "Post-1996 to 2020: The volume significantly decreases, showing lower levels of activity with some fluctuations and minor peaks, particularly after 2016.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the price variation over time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "q3QfzWquCmjk" + }, + "outputs": [], + "source": [ + "# Plot the price variation over time\n", + "create_line_plot(df['Date'], df['High'] - df['Low'],\n", + " title='Price Variation Over Time', x_label='Date', y_label='Price Variation')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mYGFClShDgsj" + }, + "source": [ + "#### Stable period until 2004:\n", + "The price variation was relatively low and stable until around 2003-2004, generally staying under 10 units.\n", + 
"#### 2004-2008 increase:\n", + "Starting from 2004, price variations gradually began to increase, peaking sharply just before the 2008 financial crisis. Some spikes went beyond 40 units.\n", + "#### 2008-2020 fluctuations:\n", + "After the 2008 peak, price variation showed continuous fluctuations with noticeable peaks, though they were more frequent post-2016.\n", + "#### 2020 onward:\n", + "The recent period (2020-2024) shows significant and frequent fluctuations, with peaks reaching 60+ units, possibly influenced by events like the COVID-19 pandemic and other global factors." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Adj Close' and 'Volume'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XKsek--aCmoC" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Adj Close' and 'Volume'\n", + "create_line_plot(df['Adj Close'], df['Volume'],\n", + " title='Adjusted Close vs Volume',\n", + " x_label='Adjusted Close',\n", + " y_label='Volume')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "omVoWiC-DjJX" + }, + "source": [ + "The overall trend seems to be somewhat volatile, with periods of upward and downward movement." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Open' and 'Adj Close'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W5TY59ikCv78" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Open' and 'Adj Close'\n", + "create_scatter_plot(df['Open'], df['Adj Close'], size_data=df['Volume'],\n", + " title='Open vs Adjusted Close', x_label='Open', y_label='Adj Close')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q_BiuXvnDlfJ" + }, + "source": [ + "The scatter points generally show an upward trend, indicating a positive correlation between the opening and closing prices. 
This means that, in general, when the stock opens higher, it tends to close higher as well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Close' and 'Adj Close'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CD0LiK3VCwiU" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Close' and 'Adj Close'\n", + "create_scatter_plot(df['Close'], df['Adj Close'], size_data=df['Volume'],\n", + " title='Close vs Adjusted Close', x_label='Close', y_label='Adj Close')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3TgT8XWtDnpy" + }, + "source": [ + "The scatter points show a very strong upward trend, indicating a strong positive correlation between the closing price and the adjusted closing price. This means that, in general, when the stock closes higher, the adjusted closing price is also higher." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Open' and 'Close'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bgq_AQ-nCwnA" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Open' and 'Close'\n", + "create_scatter_plot(df['Open'], df['Close'], size_data=df['Volume'],\n", + " title='Open vs Close', x_label='Open', y_label='Close')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Open - Close' and 'High - Low'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Zqsy16wWCwrC" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Open - Close' and 'High - Low'\n", + "create_line_plot(df['Open'] - df['Close'], df['High'] - df['Low'],\n", + " title='Stock Variation vs Price Variation',\n", + " x_label='Stock Variation (Open - Close)',\n", + " y_label='Price Variation (High - Low)')" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "PbUZdfH4Dru0" + }, + "source": [ + "#### This suggests that trading volume is higher when the price difference is small or stable\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the effect of price differences on volume" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NAT3Sy3hC2a3" + }, + "outputs": [], + "source": [ + "# Plot the effect of price differences on volume\n", + "plt.figure(figsize = (20,10))\n", + "sns.scatterplot(x=df['Open'] - df['Close'], y=df['High'] - df['Low'], hue=df['Volume'], palette='viridis', size=df['Volume'], sizes=(20, 200), alpha=0.6)\n", + "plt.title('Effect of Price Differences on Volume')\n", + "plt.xlabel('Open - Close')\n", + "plt.ylabel('High - Low')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the effect of price differences on adjusted close" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B8zeVZt7C2fT" + }, + "outputs": [], + "source": [ + "# Plot the effect of price differences on adjusted close\n", + "plt.figure(figsize = (20,10))\n", + "sns.lineplot(x = df['Open']-df['Close'], y = df['High'] - df['Low'], hue = df['Adj Close'])\n", + "plt.title('Effect of Price Differences on Adjusted Close')\n", + "plt.xlabel('Open - Close')\n", + "plt.ylabel('High - Low')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot the relationship between 'Open - Close' and 'Adj Close'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EpjnJ3bJC2jq" + }, + "outputs": [], + "source": [ + "# Plot the relationship between 'Open - Close' and 'Adj Close'\n", + "plt.figure(figsize = (20,10))\n", + "sns.lineplot(x = df['Open']-df['Close'], y = df['Adj Close'])\n", + "plt.xlabel('Open - Close')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "Create a pair plot of stock features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "omOa3x81C2uS" + }, + "outputs": [], + "source": [ + "# Create a pair plot of stock features\n", + "subset = df[['Open', 'Close', 'High', 'Low', 'Volume']]\n", + "sns.pairplot(subset)\n", + "plt.suptitle('Pair Plot of Stock Features', y=1.02)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a 3D scatter plot of 'Open', 'Close', and 'Volume'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gNjDWptzC2zP" + }, + "outputs": [], + "source": [ + "# Create a 3D scatter plot of 'Open', 'Close', and 'Volume'\n", + "from mpl_toolkits.mplot3d import Axes3D\n", + "\n", + "fig = plt.figure(figsize=(12, 8))\n", + "ax = fig.add_subplot(111, projection='3d')\n", + "ax.scatter(df['Open']- df['Close'], df['High'] - df['Low'],df['Volume'], c='r', marker='o')\n", + "ax.set_xlabel('Open Price')\n", + "ax.set_ylabel('Close Price')\n", + "ax.set_zlabel('Volume')\n", + "plt.title('3D Scatter Plot of Open, Close, and Volume')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculate the correlation matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Soovqm1IC24P" + }, + "outputs": [], + "source": [ + "# Calculate the correlation matrix\n", + "correlation_matrix = df.corr(numeric_only = True)\n", + "plt.figure(figsize=(12, 8))\n", + "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True, cbar_kws={\"shrink\": .8})\n", + "\n", + "plt.title('Correlation Heatmap')\n", + "plt.show()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d2g_ypkeDvdO" + }, + "source": [ + "The high correlation between 'Open,' 'High,' 'Low,' 'Close,' and 'Adj Close' shows these features are highly interdependent and tend to move 
together in the same direction.\n", + "The negative correlation of 'Volume' with price-related features suggests that increased trading volume does not necessarily coincide with an increase in stock prices." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### More Charts" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "from plotly.offline import plot\n", + "import plotly.graph_objs as go" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A line chart is a simple and effective way to visualize the closing price of the stock over time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the closing price over time\n", + "plt.plot(df['Date'], df['Close'])\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Closing Price')\n", + "plt.title('SBIN Stock Price Over Time')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A candlestick chart is a more detailed way to visualize the stock price, showing the high, low, open, and close prices for each day" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the candlestick chart\n", + "fig = go.Figure(data=[go.Candlestick(x=df['Date'],\n", + " open=df['Open'],\n", + " high=df['High'],\n", + " low=df['Low'],\n", + " close=df['Close'])])\n", + "plot(fig, filename='candlestick_chart')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "C:\\Users\\Admin\\AppData\\Roaming\\Python\\Python312\\site-packages\\plotly\\offline\\offline.py:557: UserWarning:\n", + "\n", + "Your filename `candlestick_chart` didn't end with .html. 
Adding .html to the end of your file.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'candlestick_chart.html'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the daily volume\n", + "plt.bar(df['Date'], df['Volume'])\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Volume')\n", + "plt.title('SBIN Daily Volume')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the scatter plot of closing price vs volume\n", + "sns.scatterplot(x=df['Close'], y=df['Volume'])\n", + "plt.xlabel('Closing Price')\n", + "plt.ylabel('Volume')\n", + "plt.title('Relationship between SBIN Closing Price and Volume')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the histogram of closing prices\n", + "plt.hist(df['Close'], bins=50)\n", + "plt.xlabel('Closing Price')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Distribution of SBIN Closing Prices')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A moving average chart is a type of chart that shows the average value of a stock's price over a certain period of time. It is used to smooth out the fluctuations in the price and to identify trends." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the moving average chart\n", + "df['MA_50'] = df['Close'].rolling(window=50).mean()\n", + "df['MA_200'] = df['Close'].rolling(window=200).mean()\n", + "\n", + "plt.plot(df['Date'], df['Close'], label='Close')\n", + "plt.plot(df['Date'], df['MA_50'], label='MA 50')\n", + "plt.plot(df['Date'], df['MA_200'], label='MA 200')\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Price')\n", + "plt.title('SBIN Stock Price with Moving Averages')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This chart shows the closing price of the stock (blue line) along with its Bollinger Bands: the upper band (orange line) and the lower band (green line), plotted two standard deviations above and below the 20-day moving average. The moving average itself is computed but not drawn on the chart. The Bollinger Bands are used to identify volatility in the stock price." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the Bollinger Bands chart\n", + "df['MA_20'] = df['Close'].rolling(window=20).mean()\n", + "df['Upper_BB'] = df['MA_20'] + 2*df['Close'].rolling(window=20).std()\n", + "df['Lower_BB'] = df['MA_20'] - 2*df['Close'].rolling(window=20).std()\n", + "\n", + "plt.plot(df['Date'], df['Close'], label='Close')\n", + "plt.plot(df['Date'], df['Upper_BB'], label='Upper BB')\n", + "plt.plot(df['Date'], df['Lower_BB'], label='Lower BB')\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Price')\n", + "plt.title('SBIN Stock Price with Bollinger Bands')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " This chart shows the RSI of the stock (blue line) along with two horizontal lines at 30 (red line) and 70 (green line). 
The RSI is used to identify overbought (above 70) and oversold (below 30) conditions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the RSI chart\n", + "delta = df['Close'].diff(1)\n", + "up, down = delta.copy(), delta.copy()\n", + "up[up < 0] = 0\n", + "down[down > 0] = 0\n", + "roll_up = up.rolling(window=14).mean()\n", + "roll_down = down.rolling(window=14).mean().abs()\n", + "RS = roll_up / roll_down\n", + "RSI = 100.0 - (100.0 / (1.0 + RS))\n", + "\n", + "plt.plot(df['Date'], RSI, label='RSI')\n", + "plt.axhline(y=30, color='red', linestyle='--')\n", + "plt.axhline(y=70, color='green', linestyle='--')\n", + "plt.xlabel('Date')\n", + "plt.ylabel('RSI')\n", + "plt.title('SBIN Stock Price with RSI')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This chart shows the correlation between the open, high, low, close, and volume of the stock. The correlation is measured on a scale of -1 to 1, where 1 means perfect positive correlation and -1 means perfect negative correlation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the heatmap\n", + "corr_matrix = df[['Open', 'High', 'Low', 'Close', 'Volume']].corr()\n", + "plt.imshow(corr_matrix, cmap='hot', interpolation='nearest')\n", + "plt.title('Correlation Matrix')\n", + "plt.colorbar()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D6ssCbaNXem6" + }, + "source": [ + "## Feature Engineering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WWb3qGu1Xa30" + }, + "source": [ + "### Handle missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "id": "yi9UnG0BXtGG" + }, + "outputs": [], + "source": [ + "# Drop unnecessary columns('Date' and 'Adj Close')\n", + "df.drop(['Date', 'Adj Close'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "id": "_QwN3imwXZGZ" + }, + "outputs": [], + "source": [ + "imputer = SimpleImputer(strategy='mean')\n", + "df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": { + "id": "WYCTE93pXhyC" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cYIyc2QpXvoM" + }, + "source": [ + "### Adding Indicators" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IUH2Nt70Xy-X" + }, + "source": [ + "#### SMA" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PzQGu8JnX8Bk" + }, + "source": [ + "It's the average of the stock price over a specific time period\n", + "\n", + "SMA = (sum of closing prices of the past n days) / n\n", + "\n", + "It helps identify trends by filtering out short-term fluctuations\n", + "\n", + "A price above the SMA indicates an uptrend, and a price below the SMA indicates a downtrend" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "id": "CFGQWjNTX4Mf" + }, + "outputs": [], + 
"source": [ + "# Calculate the Simple Moving Average (SMA)\n", + "df[\"SMA_10\"] = df[\"Close\"].rolling(window=10).mean()\n", + "df[\"SMA_50\"] = df[\"Close\"].rolling(window=50).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LoX_CWbbX-tN" + }, + "outputs": [], + "source": [ + "# Drop rows with missing values\n", + "df.dropna(subset=['SMA_10', 'SMA_50'], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jaU2IGH_YcIe" + }, + "source": [ + "##### SMA Graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the SMA graph\n", + "fig, ax1 = plt.subplots(figsize=(10, 6))\n", + "days = int(input(\"of how many past days you want to see graph: \"))\n", + "ax1.plot(df[\"SMA_10\"][-days:], label = \"SMA_10\", color='Red', linewidth=1, alpha=0.8)\n", + "ax1.plot(df[\"SMA_50\"][-days:], label = \"SMA_50\", color='Green', linewidth=1, alpha=0.8)\n", + "ax1.plot(df[\"Close\"][-days:], label = \"Close\", color='Blue', linewidth=2)\n", + "\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qCxKwtnTX1Ck" + }, + "source": [ + "#### RSI" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bbNsRzqmYiJi" + }, + "source": [ + "It is a momentum indicator used to measure the speed and change of price movements. It ranges from 0 to 100 and helps identify whether a stock is overbought or oversold. 
\n", + "\n", + "RSI > 70: Overbought \n", + "RSI < 30: Oversold" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "id": "lTkIdld1YaxN" + }, + "outputs": [], + "source": [ + "# Calculate the Relative Strength Index (RSI)\n", + "delta = df['Close'].diff(1)\n", + "\n", + "gain = delta.where(delta > 0, 0)\n", + "loss = -delta.where(delta < 0, 0)\n", + "\n", + "avg_gain = gain.rolling(window=14).mean()\n", + "avg_loss = loss.rolling(window=14).mean()\n", + "\n", + "rs = avg_gain / avg_loss # Relative Strength\n", + "df['RSI'] = 100 - (100 / (1 + rs))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop rows with missing values\n", + "df.dropna(subset=['RSI'], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bW10tgm0YlSM" + }, + "source": [ + "##### RSI Graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 807 + }, + "id": "dqR2l5r-YngD", + "outputId": "07d7a94a-4fd6-4f87-cf37-54b6980fa5cb" + }, + "outputs": [], + "source": [ + "# Plot the RSI graph\n", + "\n", + "# Create subplots\n", + "fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), gridspec_kw={'height_ratios': [3, 1]})\n", + "\n", + "days = int(input(\"of how many past days you want to see graph: \"))\n", + "# price graph\n", + "ax1.plot(df['Close'][-days:], label='Close Price', color='blue')\n", + "ax1.set_title('Stock Price')\n", + "ax1.set_ylabel('Price')\n", + "ax1.legend()\n", + "\n", + "# rsi graph\n", + "ax2.plot(df['RSI'][-days:], label='RSI', color='orange')\n", + "ax2.axhline(70, color='red', linestyle='--') # Overbought line\n", + "ax2.axhline(30, color='green', linestyle='--') # Oversold line\n", + "ax2.set_title('Relative Strength Index (RSI)')\n", + "ax2.set_ylabel('RSI')\n", + "ax2.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "LaU9_eV9X2Zc" + }, + "source": [ + "#### MACD" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "id": "4j9soKAXXvZX" + }, + "outputs": [], + "source": [ + "# Calculate the Moving Average Convergence Divergence (MACD)\n", + "# Calculate the short-term and long-term EMAs\n", + "df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()\n", + "df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()\n", + "\n", + "# Calculate MACD and Signal line\n", + "df['MACD'] = df['EMA_12'] - df['EMA_26'] # MACD line\n", + "df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean() # Signal line" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vJU2F-TiYrSB" + }, + "source": [ + "##### MACD Graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 807 + }, + "id": "BV8z1ge1Ysl3", + "outputId": "9ddf6a13-ee48-45ea-f6f4-e361d1decc11" + }, + "outputs": [], + "source": [ + "# Plot the MACD\n", + "fig, ax = plt.subplots(2, 1, figsize=(10, 8))\n", + "\n", + "\n", + "days = int(input(\"of how many past days you want to see graph: \"))\n", + "# Plot stock price on first subplot (bold)\n", + "ax[0].plot( df['Close'][-days:], label='Close Price', color='blue', linewidth=3)\n", + "ax[0].set_title('Stock Price')\n", + "ax[0].set_ylabel('Price')\n", + "\n", + "# Plot MACD and Signal line on second subplot\n", + "ax[1].plot( df['MACD'][-days:], label='MACD', color='green', linewidth=2)\n", + "ax[1].plot( df['Signal_Line'][-days:], label='Signal Line', color='red', linestyle='--', alpha=0.7)\n", + "ax[1].set_title('MACD')\n", + "ax[1].set_ylabel('MACD Value')\n", + "\n", + "# Show legends\n", + "ax[0].legend()\n", + "ax[1].legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VC-x8kPuY_FU" + }, + "source": [ + "#### 
Correlations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 425 + }, + "id": "I7pi-rfLZBGf", + "outputId": "549af121-3ef6-476f-e359-2e3ce2cbb166" + }, + "outputs": [], + "source": [ + "corr = df.corr()\n", + "corr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 746 + }, + "id": "1OGAC3uxZCxG", + "outputId": "9d7ad372-d66d-4c2a-b72a-717179294401" + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.heatmap(corr, annot=True, linewidths=0.5)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": { + "id": "GvIj_HpsZJ26" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v8aLHujGZQu_" + }, + "source": [ + "## Model Training Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "id": "FV2-jJCSZQaA" + }, + "outputs": [], + "source": [ + "# Select features and target variable\n", + "X = df[['Open', 'High', 'Low', 'Volume']]\n", + "y = df['Close']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "MStjwvxqZV1e" + }, + "outputs": [], + "source": [ + "# Split the data into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "adMP-Zn1ZW7s" + }, + "outputs": [], + "source": [ + "# Scale the features using Min-Max scaling\n", + "scaler = MinMaxScaler()\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7nfzuX5hZZtj", + "outputId": 
"6e0f44fa-598b-482f-db74-449179691b30" + }, + "outputs": [], + "source": [ + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "avggYFQCZiF8" + }, + "source": [ + "## Model Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i63wO9AVZkjf" + }, + "source": [ + "#### 1. LINEAR REGRESSION" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": { + "id": "9AJm7PhjZZ9p" + }, + "outputs": [], + "source": [ + "# Create a linear regression model\n", + "model1 = LinearRegression()\n", + "\n", + "# Train the model\n", + "model1.fit(X_train, y_train)\n", + "\n", + "# Make predictions on the test set\n", + "pred1 = model1.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": { + "id": "lYpraDj7Zwqr" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse1 = np.sqrt(mean_squared_error(y_test, pred1))\n", + "mae1 = mean_absolute_error(y_test, pred1)\n", + "mape1 = mean_absolute_percentage_error(y_test, pred1)\n", + "accuracy1 = accuracy_score(y_test > pred1, y_test > pred1.round())\n", + "precision1 = precision_score(y_test > pred1, y_test > pred1.round())\n", + "confusion1 = confusion_matrix(y_test > pred1, y_test > pred1.round())\n", + "recall1 = recall_score(y_test > pred1, y_test > pred1.round())\n", + "f11 = f1_score(y_test > pred1, y_test > pred1.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xOAnEaKoZxuY", + "outputId": "202a1add-cae6-4637-8106-735d3f958007" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse1)\n", + "print(\"MAE:\", mae1)\n", + "print(\"MAPE:\", mape1)\n", + "print(\"Accuracy:\", accuracy1)\n", + "print(\"Precision:\", precision1)\n", + "print(\"Confusion Matrix:\\n\", confusion1)\n", + "print(\"Recall:\", recall1)\n", + "print(\"F1 Score:\", f11)" + 
] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HxCNbk0JafKE" + }, + "source": [ + "#### 2. SVR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### With Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "id": "XxOnG95gZ0NR" + }, + "outputs": [], + "source": [ + "# Create an SVR model\n", + "model2 = SVR()\n", + "param_grid = {'C':[0.1, 1], 'epsilon':[0.01, 0.1, 0.5], 'kernel':['sigmoid']}\n", + "GV_SVR = GridSearchCV(model2, param_grid = param_grid, scoring = 'accuracy', n_jobs = -1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "heZ6MwXaalFT", + "outputId": "732d3ac8-014f-47fb-f1af-e3f30afbb037" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "GV_SVR.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred2 = GV_SVR.predict(X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "984H1ej2apPP" + }, + "outputs": [], + "source": [ + "GV_SVR.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Without Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#fitting without grid search\n", + "model2.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred2_1 = model2.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "id": "ErnLBNW6apM4" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse2 = np.sqrt(mean_squared_error(y_test, pred2))\n", + "mae2 = mean_absolute_error(y_test, 
pred2)\n", + "mape2 = mean_absolute_percentage_error(y_test, pred2)\n", + "accuracy2 = accuracy_score(y_test > pred2, y_test > pred2.round())\n", + "precision2 = precision_score(y_test > pred2, y_test > pred2.round())\n", + "confusion2 = confusion_matrix(y_test > pred2, y_test > pred2.round())\n", + "recall2 = recall_score(y_test > pred2, y_test > pred2.round())\n", + "f12 = f1_score(y_test > pred2, y_test > pred2.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "82Gd1kSwapKR", + "outputId": "1dc4a6a9-f0ad-4b96-eeaa-9682f4912767" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse2)\n", + "print(\"MAE:\", mae2)\n", + "print(\"MAPE:\", mape2)\n", + "print(\"Accuracy:\", accuracy2)\n", + "print(\"Precision:\", precision2)\n", + "print(\"Confusion Matrix:\\n\", confusion2)\n", + "print(\"Recall:\", recall2)\n", + "print(\"F1 Score:\", f12)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GsVk_xFZauR6" + }, + "source": [ + "#### 3. 
Random Forest" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": { + "id": "YwerJPNxapGt" + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "# Create a Random Forest model\n", + "model3 = RandomForestRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "fspXTQOXapBl", + "outputId": "16e51cc3-390d-4dad-8a04-68b48104ac0b" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model3.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": { + "id": "Hu0iAgE9ao-_" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred3 = model3.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": { + "id": "WTu2YAoQao6Q" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse3 = np.sqrt(mean_squared_error(y_test, pred3))\n", + "mae3 = mean_absolute_error(y_test, pred3)\n", + "mape3 = mean_absolute_percentage_error(y_test, pred3)\n", + "accuracy3 = accuracy_score(y_test > pred3, y_test > pred3.round())\n", + "precision3 = precision_score(y_test > pred3, y_test > pred3.round())\n", + "confusion3 = confusion_matrix(y_test > pred3, y_test > pred3.round())\n", + "recall3 = recall_score(y_test > pred3, y_test > pred3.round())\n", + "f13 = f1_score(y_test > pred3, y_test > pred3.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9_7z9JcRao4C", + "outputId": "9b6dbe5a-de58-4519-d12b-afd1783b5024" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse3)\n", + "print(\"MAE:\", mae3)\n", + "print(\"MAPE:\", mape3)\n", + "print(\"Accuracy:\", accuracy3)\n", + "print(\"Precision:\", precision3)\n", + 
"print(\"Confusion Matrix:\\n\", confusion3)\n", + "print(\"Recall:\", recall3)\n", + "print(\"F1 Score:\", f13)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kg80FkpDa4Am" + }, + "source": [ + "#### 4. Gradient Boosting Models (GBM)" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": { + "id": "D75bIlqqao1t" + }, + "outputs": [], + "source": [ + "model4 = GradientBoostingRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "uUHqBGsTa9TV", + "outputId": "4694a027-32b2-4c7b-c9cc-ee84e6c4b425" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model4.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": { + "id": "T7XyO3qIa9QU" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred4 = model4.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": { + "id": "j2DCsz23a9M3" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse4 = np.sqrt(mean_squared_error(y_test, pred4))\n", + "mae4 = mean_absolute_error(y_test, pred4)\n", + "mape4 = mean_absolute_percentage_error(y_test, pred4)\n", + "accuracy4 = accuracy_score(y_test > pred4, y_test > pred4.round())\n", + "precision4 = precision_score(y_test > pred4, y_test > pred4.round())\n", + "confusion4 = confusion_matrix(y_test > pred4, y_test > pred4.round())\n", + "recall4 = recall_score(y_test > pred4, y_test > pred4.round())\n", + "f14 = f1_score(y_test > pred4, y_test > pred4.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tl4jTkC9a9Je", + "outputId": "4d2f1ee3-2886-4176-e9dc-2062b3890377" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse4)\n", + 
"print(\"MAE:\", mae4)\n", + "print(\"MAPE:\", mape4)\n", + "print(\"Accuracy:\", accuracy4)\n", + "print(\"Precision:\", precision4)\n", + "print(\"Confusion Matrix:\\n\", confusion4)\n", + "print(\"Recall:\", recall4)\n", + "print(\"F1 Score:\", f14)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aDoDcLUdbCq6" + }, + "source": [ + "#### 5. Extreme Gradient Boosting (XGBoost)" + ] + }, + { + "cell_type": "code", + "execution_count": 227, + "metadata": { + "id": "rmgm3VADa9Gb" + }, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "# Create an XGBoost model\n", + "model5 = xgb.XGBRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + }, + "id": "txadd-U3a9AP", + "outputId": "0a5d9230-a87f-486b-cda4-197face55486" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model5.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "metadata": { + "id": "xYWVMZJ5a89t" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred5 = model5.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 230, + "metadata": { + "id": "qeDxXt9pa87W" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse5 = np.sqrt(mean_squared_error(y_test, pred5))\n", + "mae5 = mean_absolute_error(y_test, pred5)\n", + "mape5 = mean_absolute_percentage_error(y_test, pred5)\n", + "accuracy5 = accuracy_score(y_test > pred5, y_test > pred5.round())\n", + "precision5 = precision_score(y_test > pred5, y_test > pred5.round())\n", + "confusion5 = confusion_matrix(y_test > pred5, y_test > pred5.round())\n", + "recall5 = recall_score(y_test > pred5, y_test > pred5.round())\n", + "f15 = f1_score(y_test > pred5, y_test > pred5.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "TdY1rRy_a83-", + "outputId": "d43b90b2-d098-402f-fbf7-46130f5310e3" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse5)\n", + "print(\"MAE:\", mae5)\n", + "print(\"MAPE:\", mape5)\n", + "print(\"Accuracy:\", accuracy5)\n", + "print(\"Precision:\", precision5)\n", + "print(\"Confusion Matrix:\\n\", confusion5)\n", + "print(\"Recall:\", recall5)\n", + "print(\"F1 Score:\", f15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EbxPon-VbMGE" + }, + "source": [ + "#### 6. AdaBoostRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "metadata": { + "id": "Xze9G-tUa802" + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import AdaBoostRegressor\n", + "# Create an AdaBoost model\n", + "model6 = AdaBoostRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "DEmXpAV8a8wC", + "outputId": "5cadcb29-13ce-45f0-cfe1-f9e21a24866b" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model6.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "metadata": { + "id": "KvI77shea8tk" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred6 = model6.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": { + "id": "catCNPwla8rL" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse6 = np.sqrt(mean_squared_error(y_test, pred6))\n", + "mae6 = mean_absolute_error(y_test, pred6)\n", + "mape6 = mean_absolute_percentage_error(y_test, pred6)\n", + "accuracy6 = accuracy_score(y_test > pred6, y_test > pred6.round())\n", + "precision6 = precision_score(y_test > pred6, y_test > pred6.round())\n", + "confusion6 = confusion_matrix(y_test > pred6, y_test > pred6.round())\n", + "recall6 = recall_score(y_test > 
pred6, y_test > pred6.round())\n", + "f16 = f1_score(y_test > pred6, y_test > pred6.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wpUqeumZa8oZ", + "outputId": "db2f5904-a96b-46d9-fbc3-ec519d5b7c7a" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse6)\n", + "print(\"MAE:\", mae6)\n", + "print(\"MAPE:\", mape6)\n", + "print(\"Accuracy:\", accuracy6)\n", + "print(\"Precision:\", precision6)\n", + "print(\"Confusion Matrix:\\n\", confusion6)\n", + "print(\"Recall:\", recall6)\n", + "print(\"F1 Score:\", f16)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubml65zDbUZu" + }, + "source": [ + "#### 7. Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "metadata": { + "id": "qCvEszOKbVGV" + }, + "outputs": [], + "source": [ + "from sklearn.tree import DecisionTreeRegressor\n", + "# Create a Decision Tree model\n", + "model7 = DecisionTreeRegressor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "5lYl0STGbWw8", + "outputId": "9aabd48a-73aa-4737-e243-ae4eb5899765" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model7.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "metadata": { + "id": "Wu34WtF-bXy0" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred7 = model7.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": { + "id": "H0NdEFHXbYuI" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse7 = np.sqrt(mean_squared_error(y_test, pred7))\n", + "mae7 = mean_absolute_error(y_test, pred7)\n", + "mape7 = mean_absolute_percentage_error(y_test, pred7)\n", + "accuracy7 = accuracy_score(y_test > pred7, y_test > 
pred7.round())\n", + "precision7 = precision_score(y_test > pred7, y_test > pred7.round())\n", + "confusion7 = confusion_matrix(y_test > pred7, y_test > pred7.round())\n", + "recall7 = recall_score(y_test > pred7, y_test > pred7.round())\n", + "f17 = f1_score(y_test > pred7, y_test > pred7.round())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Wq6aws7ZbZft", + "outputId": "3f0b27ba-3257-458b-b5d3-abba03fa5b47" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse7)\n", + "print(\"MAE:\", mae7)\n", + "print(\"MAPE:\", mape7)\n", + "print(\"Accuracy:\", accuracy7)\n", + "print(\"Precision:\", precision7)\n", + "print(\"Confusion Matrix:\\n\", confusion7)\n", + "print(\"Recall:\", recall7)\n", + "print(\"F1 Score:\", f17)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5OX62RD6bb-V" + }, + "source": [ + "#### 8. KNeighborsRegressor(KNN)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "id": "5kz2DGKMbaem" + }, + "outputs": [], + "source": [ + "# Create a KNN model\n", + "model8 = KNeighborsRegressor()\n", + "param_grid = {'n_neighbors':[3, 5, 7, 9, 11, 15, 20, 23, 25, 30, 60, 70, 150]}\n", + "GV_KNN = GridSearchCV(model8, param_grid, cv=5, scoring='neg_mean_squared_error')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "PFdqC4oXbeg9", + "outputId": "b06c9cd8-c45c-4a8b-bec3-17fe23c6dc6d" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model8.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "id": "_Rv2V1W8bfhp" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred8 = model8.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": 
{ + "id": "K_QgGMDybgj8" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse8 = np.sqrt(mean_squared_error(y_test, pred8))\n", + "mae8 = mean_absolute_error(y_test, pred8)\n", + "mape8 = mean_absolute_percentage_error(y_test, pred8)\n", + "accuracy8 = accuracy_score(y_test > pred8, y_test > pred8.round())\n", + "precision8 = precision_score(y_test > pred8, y_test > pred8.round())\n", + "confusion8 = confusion_matrix(y_test > pred8, y_test > pred8.round())\n", + "recall8 = recall_score(y_test > pred8, y_test > pred8.round())\n", + "f18 = f1_score(y_test > pred8, y_test > pred8.round())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PKv6I2oCbha3", + "outputId": "ef250676-44d3-47ff-f7bd-ade1697c4789" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse8)\n", + "print(\"MAE:\", mae8)\n", + "print(\"MAPE:\", mape8)\n", + "print(\"Accuracy:\", accuracy8)\n", + "print(\"Precision:\", precision8)\n", + "print(\"Confusion Matrix:\\n\", confusion8)\n", + "print(\"Recall:\", recall8)\n", + "print(\"F1 Score:\", f18)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Grid Search parameters\n", + "GV_KNN.fit(X_train, y_train)\n", + "pred8_1 = GV_KNN.predict(X_test)\n", + "GV_KNN.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "results = GV_KNN.cv_results_\n", + "mse = -results['mean_test_score']\n", + "k_values = results['param_n_neighbors'].data\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(k_values, mse, marker='o', linestyle='-')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### KNN Hyperparameter Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "\n", + "# KNN hyperparameter tuning: returns (k_values, mse_values), the mean cross-validated MSE for each candidate k\n", + "def knn_hyperparameter_tuning(X_train, y_train, k_values=range(1, 31), cv=5):\n", + "    # k_values and cv default to the previously hard-coded k = 1..30 and 5 folds\n", + "    mse_values = []\n", + "\n", + "    for k in k_values:\n", + "        knn_model = KNeighborsRegressor(n_neighbors=k)\n", + "        mse = -cross_val_score(knn_model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error').mean()  # scorer is negated MSE; flip the sign back\n", + "        mse_values.append(mse)\n", + "\n", + "    return k_values, mse_values\n", + "\n", + "# Perform KNN Hyperparameter Tuning\n", + "k_values, mse_values = knn_hyperparameter_tuning(X_train_scaled, y_train)\n", + "\n", + "# Plotting the results\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(k_values, mse_values, marker='o', linestyle='-')\n", + "plt.title('KNN Hyperparameter Tuning: MSE vs. Number of Neighbors')\n", + "plt.xlabel('Number of Neighbors (k)')\n", + "plt.ylabel('Mean Squared Error (MSE)')\n", + "plt.xticks(k_values) # Show all k values on x-axis\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-yoFias0bkhT" + }, + "source": [ + "#### 9. 
Artificial Neural Networks (ANN)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oj1tv3-Sbm4M", + "outputId": "0d547cbc-d317-400a-a304-1bc283287926" + }, + "outputs": [], + "source": [ + "# Create an ANN model\n", + "model9 = Sequential()\n", + "model9.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))\n", + "model9.add(Dense(16, activation='relu'))\n", + "model9.add(Dense(1, activation='linear'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Llqm4lw3bnxu" + }, + "outputs": [], + "source": [ + "# Compile the model\n", + "model9.compile(loss='mean_squared_error', optimizer='adam')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8gzrlwUTbpG7", + "outputId": "c3b2372d-3881-4568-efd3-59ffe8bce6f3" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model9.fit(X_train_scaled, y_train, epochs=100, batch_size=32, verbose=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cI6QW1utbp-J", + "outputId": "1c8b27b2-3380-40a1-ac68-17a68092f650" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "pred9 = model9.predict(X_test_scaled).flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "metadata": { + "id": "-qDeEJUgbrUq" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse9 = np.sqrt(mean_squared_error(y_test, pred9))\n", + "mae9 = mean_absolute_error(y_test, pred9)\n", + "mape9 = mean_absolute_percentage_error(y_test, pred9)\n", + "accuracy9 = accuracy_score(y_test > pred9, y_test > pred9.round())\n", + "precision9 = precision_score(y_test > pred9, y_test > pred9.round())\n", + "confusion9 = confusion_matrix(y_test > pred9, y_test > 
pred9.round())\n", + "recall9 = recall_score(y_test > pred9, y_test > pred9.round())\n", + "f19 = f1_score(y_test > pred9, y_test > pred9.round())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dEhvx1BTbvXO", + "outputId": "af73599a-ae1e-4d56-9288-8193e4aaad2b" + }, + "outputs": [], + "source": [ + "# Print the evaluation metrics\n", + "print(\"RMSE:\", rmse9)\n", + "print(\"MAE:\", mae9)\n", + "print(\"MAPE:\", mape9)\n", + "print(\"Accuracy:\", accuracy9)\n", + "print(\"Precision:\", precision9)\n", + "print(\"Confusion Matrix:\\n\", confusion9)\n", + "print(\"Recall:\", recall9)\n", + "print(\"F1 Score:\", f19)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qosFFsxrbyX7" + }, + "source": [ + "#### 10. LSTM (Long Short-Term Memory)" + ] + }, + { + "cell_type": "code", + "execution_count": 255, + "metadata": { + "id": "3QajxGzXbxM4" + }, + "outputs": [], + "source": [ + "# Reshape the input data for LSTM\n", + "n_features = X_train_scaled.shape[1]\n", + "n_steps = 10\n", + "n_samples_train = X_train_scaled.shape[0] - n_steps + 1\n", + "n_samples_test = X_test_scaled.shape[0] - n_steps + 1\n", + "\n", + "# Stack overlapping windows of n_steps consecutive rows\n", + "X_train_reshaped = np.array([X_train_scaled[i:i+n_steps, :] for i in range(n_samples_train)])\n", + "X_test_reshaped = np.array([X_test_scaled[i:i+n_steps, :] for i in range(n_samples_test)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gNRiL_B0bxJ3", + "outputId": "045b5c85-f8ef-4fc4-bf38-1c253d886c82" + }, + "outputs": [], + "source": [ + "# Create an LSTM model\n", + "model = Sequential()\n", + "model.add(LSTM(64, activation='relu', input_shape=(n_steps, n_features)))\n", + "model.add(Dense(1))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "metadata": { + "id": "DsndIHIQbxHV" + }, + 
"outputs": [], + "source": [ + "# Compile the model\n", + "model.compile(loss='mean_squared_error', optimizer='adam')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SzhA1fh9bxEz" + }, + "outputs": [], + "source": [ + "# Train the model\n", + "model.fit(X_train_reshaped, y_train[n_steps-1:], epochs=100, batch_size=32, verbose=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AQXh8a46bxCx" + }, + "outputs": [], + "source": [ + "# Make predictions on the test set\n", + "y_pred = model.predict(X_test_reshaped).flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iIXfZ2eubxAD" + }, + "outputs": [], + "source": [ + "# Calculate evaluation metrics\n", + "rmse10 = np.sqrt(mean_squared_error(y_test[n_steps-1:], y_pred))\n", + "mae10 = mean_absolute_error(y_test[n_steps-1:], y_pred)\n", + "mape10 = mean_absolute_percentage_error(y_test[n_steps-1:], y_pred)\n", + "accuracy10 = accuracy_score(y_test[n_steps-1:] > y_pred, y_test[n_steps-1:] > y_pred.round())\n", + "precision10 = precision_score(y_test[n_steps-1:] > y_pred, y_test[n_steps-1:] > y_pred.round())\n", + "recall10 = recall_score(y_test[n_steps-1:] > y_pred, y_test[n_steps-1:] > y_pred.round())\n", + "f110 = f1_score(y_test[n_steps-1:] > y_pred, y_test[n_steps-1:] > y_pred.round())\n", + "confusion10 = confusion_matrix(y_test[n_steps-1:] > y_pred, y_test[n_steps-1:] > y_pred.round())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9Gjh33nNbw-O" + }, + "outputs": [], + "source": [ + "# Print evaluation metrics\n", + "print(\"RMSE:\", rmse10)\n", + "print(\"MAE:\", mae10)\n", + "print(\"MAPE:\", mape10)\n", + "print(\"Accuracy:\", accuracy10)\n", + "print(\"Precision:\", precision10)\n", + "print(\"Recall:\", recall10)\n", + "print(\"F1 Score:\", f110)\n", + "print(\"Confusion Matrix:\\n\", confusion10)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "id": "l6QVMqW-bw66" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Performance Graphs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6r0qumqybw4g" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-hFs2mnDbwGQ" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming you have a list of accuracies from accuracy1 to accuracy10\n", + "accuracies = [accuracy1*100, accuracy2*100, accuracy3*100, accuracy4*100, accuracy5*100, accuracy6*100, accuracy7*100, accuracy8*100, accuracy9*100, accuracy10*100]\n", + "\n", + "# List of corresponding labels for each accuracy\n", + "labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']\n", + "\n", + "# Plotting the bar graph\n", + "plt.bar(labels, accuracies, color='blue')\n", + "plt.xlabel('Accuracy Variables')\n", + "plt.ylabel('Accuracy Values')\n", + "plt.title('Bar Graph of Accuracies')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### RMSE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1tBYk4qmcEMk" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming you have a list of RMSE values from rmse1 to rmse10\n", + "rmse_values = [rmse1, rmse2, rmse3, rmse4, rmse5, rmse6, rmse7, rmse8, rmse9, rmse10]\n", + "\n", + "# List of corresponding labels for each RMSE value\n", + "labels = ['RMSE1', 'RMSE2', 'RMSE3', 'RMSE4', 'RMSE5', 'RMSE6', 'RMSE7', 'RMSE8', 'RMSE9', 'RMSE10']\n", + "\n", + "# Plotting the bar graph\n", + "plt.bar(labels, rmse_values, color='green')\n", + "plt.xlabel('RMSE Variables')\n", + "plt.ylabel('RMSE Values')\n", + 
"plt.title('Bar Graph of RMSE')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### MAE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BS0oljCZcFhM" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming you have a list of MAE values from mae1 to mae10\n", + "mae_values = [mae1, mae2, mae3, mae4, mae5, mae6, mae7, mae8, mae9, mae10]\n", + "\n", + "# List of corresponding labels for each MAE value\n", + "labels = ['MAE1', 'MAE2', 'MAE3', 'MAE4', 'MAE5', 'MAE6', 'MAE7', 'MAE8', 'MAE9', 'MAE10']\n", + "\n", + "# Plotting the bar graph\n", + "plt.bar(labels, mae_values, color='orange')\n", + "plt.xlabel('MAE Variables')\n", + "plt.ylabel('MAE Values')\n", + "plt.title('Bar Graph of MAE')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### MAPE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9woaoeNVcGx5" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming you have a list of MAPE values from mape1 to mape10\n", + "mape_values = [mape1, mape2, mape3, mape4, mape5, mape6, mape7, mape8, mape9, mape10]\n", + "\n", + "# List of corresponding labels for each MAPE value\n", + "labels = ['MAPE1', 'MAPE2', 'MAPE3', 'MAPE4', 'MAPE5', 'MAPE6', 'MAPE7', 'MAPE8', 'MAPE9', 'MAPE10']\n", + "\n", + "# Plotting the bar graph\n", + "plt.bar(labels, mape_values, color='purple')\n", + "plt.xlabel('MAPE Variables')\n", + "plt.ylabel('MAPE Values')\n", + "plt.title('Bar Graph of MAPE')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Precision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bbRSmd3tcIIX" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming you have a list of precision values 
from precision1 to precision10\n", + "precision_values = [precision1, precision2, precision3, precision4, precision5, precision6, precision7, precision8, precision9, precision10]\n", + "\n", + "# List of corresponding labels for each precision value\n", + "labels = ['Precision1', 'Precision2', 'Precision3', 'Precision4', 'Precision5', 'Precision6', 'Precision7', 'Precision8', 'Precision9', 'Precision10']\n", + "\n", + "# Plotting the bar graph\n", + "plt.bar(labels, precision_values, color='red')\n", + "plt.xlabel('Precision Variables')\n", + "plt.ylabel('Precision Values')\n", + "plt.title('Bar Graph of Precision')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Recall" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tDNk7iuWcJpS" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Assuming you have a list of recall values from recall1 to recall10\n", + "recall_values = [recall1, recall2, recall3, recall4, recall5, recall6, recall7, recall8, recall9, recall10]\n", + "\n", + "# List of corresponding labels for each recall value\n", + "labels = ['Recall1', 'Recall2', 'Recall3', 'Recall4', 'Recall5', 'Recall6', 'Recall7', 'Recall8', 'Recall9', 'Recall10']\n", + "\n", + "# Plotting the bar graph\n", + "plt.bar(labels, recall_values, color='cyan')\n", + "plt.xlabel('Recall Variables')\n", + "plt.ylabel('Recall Values')\n", + "plt.title('Bar Graph of Recall')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 
0 +}