Commit: redeploy with pylint actions
ccxzhang committed Feb 2, 2024
1 parent 38db720 commit aa041fa
Showing 10 changed files with 202 additions and 88 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/pylint.yml
@@ -0,0 +1,23 @@
name: Pylint

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pylint
      - name: Analysing the code with pylint
        run: |
          pylint --recursive=y src/**/**.py src/**.py
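To reproduce this check before pushing, a rough local equivalent in Python (a sketch; it passes the package directory to --recursive=y instead of the shell globs used in the workflow step above):

import subprocess
import sys

# Run the same lint step the workflow performs, letting pylint walk the
# package tree itself rather than relying on shell glob expansion.
result = subprocess.run(
    ["pylint", "--recursive=y", "src"],
    capture_output=True,
    text=True,
    check=False,
)
print(result.stdout)
sys.exit(result.returncode)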
33 changes: 15 additions & 18 deletions src/meta.py
@@ -7,7 +7,16 @@

class FileMetaData:
"""
Class to extract metadata for different file types
Class to extract metadata for different file types.
To use:
>>> meta = FileMetaData("random.pdf")
>>> print(meta.extract())
------------
{'filename': 'random.pdf',
'created': datetime.datetime(2023, 11, 15, 14, 8, 21, 417070),
'modified': datetime.datetime(2023, 11, 13, 13, 28, 11, 750389),
'num_pages': 6}
"""

def __init__(self, file_path: str):
@@ -17,22 +26,13 @@ def __init__(self, file_path: str):
self.file_path = file_path

def extract(self) -> dict:
"""
Extracts and returns dict of metadata about file. The function is compatible with extracting
metadata information from .csv, .xlsx, and .pdf files.
""" Extracts and returns dict of metadata about file.
The function is compatible with extracting metadata information
from .csv, .xlsx, and .pdf files.
Returns:
metadata (dict): Dictionary containing file metadata
Example:
meta = FileMetaData("random.pdf")
print(meta.extract())
----------
{'filename': 'random.pdf',
'created': datetime.datetime(2023, 11, 15, 14, 8, 21, 417070),
'modified': datetime.datetime(2023, 11, 13, 13, 28, 11, 750389),
'num_pages': 6}
metadata (dict): Dictionary containing file metadata
"""
metadata: dict = {
"filename": os.path.basename(self.file_path),
@@ -77,6 +77,3 @@ def _extract_pdf_metadata(self, metadata: dict) -> None:
with open(self.file_path, 'rb') as f:
reader = PdfReader(f)
metadata['num_pages'] = len(reader.pages)



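The created/modified fields in the docstring's example output are standard filesystem timestamps; a minimal sketch of how they can be produced with the standard library (this part of extract is not shown in the diff, so the exact calls are an assumption):

import os
import datetime

file_path = "random.pdf"  # hypothetical file, matching the docstring example

# Filesystem timestamps converted to datetime objects, as in the example output.
created = datetime.datetime.fromtimestamp(os.path.getctime(file_path))
modified = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
print({"filename": os.path.basename(file_path),
       "created": created,
       "modified": modified})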
2 changes: 1 addition & 1 deletion src/tourism/combine.py
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
from scipy.optimize import nnls, minimize


def calculate_mse(predictions_df: pd.DataFrame, method: str) -> pd.Series:
@@ -42,7 +43,6 @@ def get_rpw(pred_df: pd.DataFrame,
def get_constrained_ls(y: pd.DataFrame,
X: pd.DataFrame) -> np.array:

from scipy.optimize import nnls, minimize

A, b = np.array(X), np.array(y)
x0, norm = nnls(A, b)
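The import moved to module level serves get_constrained_ls, which seeds its constrained fit with scipy's non-negative least squares; a standalone sketch of that call with toy data (the values are illustrative, not from the repo):

import numpy as np
from scipy.optimize import nnls

# Columns of A are candidate forecasts; b is the observed series (toy values).
A = np.array([[1.0, 0.9, 1.1],
              [2.0, 2.2, 1.8],
              [3.0, 3.1, 2.9]])
b = np.array([1.0, 2.0, 3.0])

# nnls returns the weights w >= 0 minimizing ||A @ w - b|| and the residual norm.
weights, residual_norm = nnls(A, b)
print(weights, residual_norm)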
34 changes: 22 additions & 12 deletions src/tourism/cross_validate.py
@@ -1,10 +1,18 @@
"""
The module provides a cross-validation class compatible with SARIMAX, VECM, and
VARMAX (TBD), with a choice of Rolling and SlidingWindow methods.
Last Modified:
2024-02-01
"""
from .scaler import ScaledLogitScaler, Differencing
from .ts_eval import calculate_evaluation
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.vector_ar.vecm import VECM
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
from tqdm import tqdm


class TimeSeriesCrossValidator:
def __init__(self,
method: str,
@@ -45,7 +53,7 @@ def _transform(self):
def _initialize_cv(self, hyper_params=None):
if self.cv_method == "Rolling":
self.cv = RollingForecastCV(
h=12, step=1, initial=hyper_params["inital"])
h=12, step=1, initial=hyper_params["initial"])
elif self.cv_method == "SlidingWindow":
self.cv = SlidingWindowForecastCV(
h=12, step=1, window_size=hyper_params["window_size"])
@@ -62,11 +70,11 @@ def _fit_model(self, endog, exog):
elif self.method == 'VECM':
select_order = self.model_params["select_order"]
model = VECM(endog,
exog=exog,
k_ar_diff=select_order,
coint_rank=1)
exog=exog,
k_ar_diff=select_order,
coint_rank=1)
return model.fit()

def _predict_model(self, res, steps, exog, last_train_value):

if self.method == "SARIMAX":
@@ -76,10 +84,10 @@ def _predict_model(self, res, steps, exog, last_train_value):
elif self.method == "VECM":
predictions = res.predict(steps=steps, exog_fc=exog)
if self.transformation == "difference":
predictions = self.scaler.inverse_transform(predictions, temporary=last_train_value)

return predictions if predictions is not None else None
predictions = self.scaler.inverse_transform(
predictions, temporary=last_train_value)

return predictions if predictions is not None else None

def cross_validate(self, hyper_params):
"""
@@ -101,16 +109,18 @@ def cross_validate(self, hyper_params):
for train_idx, test_idx in cv_splits:
train, test = self.transformed_data.iloc[train_idx], self.data.iloc[test_idx, 0]
exog_train = self.exog_data.iloc[train_idx,
:] if self.exog_data is not None else None
:] if self.exog_data is not None else None
exog_test = self.exog_data.iloc[test_idx,
:] if self.exog_data is not None else None
res = self._fit_model(train, exog_train)
predictions = self._predict_model(res, steps=len(exog_test), exog=exog_test,
last_train_value=train.iloc[-1, :].values)
last_train_value=train.iloc[-1, :].values)

if len(test) == len(predictions):
predictions = predictions.iloc[:, 0] if self.method != "SARIMAX" else predictions
eval_metrics = calculate_evaluation(test.values, predictions)
predictions = predictions.iloc[:,
0] if self.method != "SARIMAX" else predictions
eval_metrics = calculate_evaluation(
test.values, predictions)
errors.append(eval_metrics)
else:
raise AttributeError("The predicted data do not match the test data in length.")
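For orientation, the pmdarima splitters configured in _initialize_cv generate expanding (Rolling) or fixed-width (SlidingWindow) train/test index pairs; a minimal sketch with a toy series (initial=24 is an illustrative value for the hyper-parameter corrected above):

import numpy as np
from pmdarima.model_selection import RollingForecastCV

y = np.arange(48)  # stand-in for four years of monthly observations

# Same fixed settings as _initialize_cv: 12-step horizon, advancing one step per fold.
cv = RollingForecastCV(h=12, step=1, initial=24)
for train_idx, test_idx in cv.split(y):
    print(f"train size {len(train_idx)}, test {test_idx[0]}..{test_idx[-1]}")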
14 changes: 12 additions & 2 deletions src/tourism/data.py
@@ -1,5 +1,9 @@
"""
The module provides loaders for country-level visitor data, COVID data,
Google Trends data, and model-required data.
Last Modified:
2024-02-01
"""

import os
@@ -162,6 +166,11 @@ def process_aviation_data(self):


class SARIMAXData:
"""
A class for preparing and managing time series data for SARIMAX. This class is
designed to handle the loading, processing, and merging of country-specific
economic, COVID Stringency Index, and Google Trends data.
"""
def __init__(self,
country: str,
y_var: str,
@@ -232,6 +241,7 @@ def __init__(self, country: str,
aviation_path: str = DEFAULT_AVIATION_DATA_PATH):
super().__init__(country, y_var, exog_var)
self.aviation_path = aviation_path
self.avi_data = None
self.aviation_data_loader = AviationDataLoader(
self.country, select_col, self.aviation_path)

@@ -241,6 +251,6 @@ def read_and_merge(self):
"""
# Inherit from the read_and_merge method from SARIMAXData
super().read_and_merge()
self.avi = self.aviation_data_loader.process_aviation_data()
self.data = (self.data.merge(self.avi, how="outer", on="date")
self.avi_data = self.aviation_data_loader.process_aviation_data()
self.data = (self.data.merge(self.avi_data, how="outer", on="date")
.reset_index(drop=True))
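The corrected read_and_merge keeps the parent class's outer-merge-on-date pattern; a toy illustration of why how="outer" matters here (column names other than seats_arrivals_intl are hypothetical):

import pandas as pd

visitors = pd.DataFrame({
    "date": pd.date_range("2023-01-01", periods=3, freq="MS"),
    "total": [100, 120, 90],
})
aviation = pd.DataFrame({
    "date": pd.date_range("2023-02-01", periods=3, freq="MS"),
    "seats_arrivals_intl": [2000, 1800, 2100],
})

# An outer merge keeps months present in either source; gaps become NaN
# instead of silently dropping rows.
merged = visitors.merge(aviation, how="outer", on="date").reset_index(drop=True)
print(merged)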
73 changes: 43 additions & 30 deletions src/tourism/mtsmodel.py
@@ -7,10 +7,12 @@
from statsmodels.tsa.api import VARMAX
import statsmodels.formula.api as smf
from statsmodels.tsa.vector_ar.vecm import select_order, VECM
from .scaler import ScaledLogitScaler
from .ts_eval import (naive_method, mean_method, seasonal_naive_method, calculate_evaluation)
from .scaler import ScaledLogitScaler, Differencing
from .ts_eval import (naive_method, mean_method,
seasonal_naive_method, calculate_evaluation)
from .ts_utils import cointegration_test, get_adf_df
from .data import *
from .data import (MultiTSData, TRENDS_DATA_FOLDER,
COVID_DATA_PATH, DEFAULT_AVIATION_DATA_PATH)

__all__ = [
"VARPipeline",
@@ -36,11 +38,14 @@ def __init__(self,
self.raw_data = None
self.transformed_data = None
self.scaler = None
self.differencer = None
self.method = None
self.transformation = []
self.is_stationary = None
self.is_cointegrated = None
self.test_results = {'stationarity': {}, 'cointegration': {}}
self.fitted_models = None
self.prediction_dfs = None

@staticmethod
def test_stationarity(df, y_var) -> bool:
@@ -59,12 +64,13 @@ def transform_data(self):
Transform the time series data based on the specified method.
"""
transformed_data = {}
transformed_data["original"] = self.data[self.y_var].dropna()
transformed_data["difference"] = self.data[self.y_var].diff().dropna()
self.scaler = ScaledLogitScaler()
self.scaler.fit(self.data[self.y_var].dropna())
transformed_data["scaledlogit"] = self.scaler.transform(
self.data[self.y_var].dropna())
self.differencer = Differencing()
original = self.data[self.y_var].dropna()
transformed_data["original"] = original
transformed_data["difference"] = self.differencer.transform(original)
self.scaler.fit(original)
transformed_data["scaledlogit"] = self.scaler.transform(original)
return transformed_data

def determine_analysis_method(self):
@@ -125,9 +131,9 @@ def fit_varma(endog_data, exog_data):
(p, q), tr = sorted_results[0]["model"]

model = VARMAX(endog=endog_data,
exog=exog_data,
order=(p, q),
trend=tr)
exog=exog_data,
order=(p, q),
trend=tr)
fitted_model = model.fit(disp=False)
return fitted_model, grid_search_results

@@ -167,32 +173,37 @@ def fit(self):
predict_df = pd.DataFrame(mod.model.endog, columns=self.y_var)
predict_df["pred_total"] = np.NaN
if self.method == "VARMAX":
predict_df.loc[mod.model.k_ar:, "pred_total"] = mod.fittedvalues.iloc[:, 0].tolist()
predict_df.loc[mod.model.k_ar:, "pred_total"] = mod.fittedvalues[:, 0]
predict_df['date'] = pd.date_range(start="2019-01-01", periods=len(predict_df), freq='MS')
predict_df.loc[mod.model.k_ar:,
"pred_total"] = mod.fittedvalues.iloc[:, 0].tolist()
predict_df.loc[mod.model.k_ar:,
"pred_total"] = mod.fittedvalues[:, 0]
predict_df['date'] = pd.date_range(
start="2019-01-01", periods=len(predict_df), freq='MS')
self.prediction_dfs[t_type] = predict_df


def plot_comparison(self):
for t_type, pred_df in self.prediction_dfs.items():
fig, ax = plt.subplots(figsize=(8, 6))
pred_df.plot(x="date", y=["total", "pred_total"], ax=ax)
return fig

def evaluate_models(self):
"""
Compare models to benchmark evaluation methods.
Returns:
benchmark (pd.DataFrame): contains `naive` and `mean` methods for forecasting.
"""
naive_pred = naive_method(self.data[self.y0])
mean_pred = mean_method(self.data[self.y0])

benchmark = pd.DataFrame()
if len(self.prediction_dfs) == 1:
fittedvalues = self.prediction_dfs[self.transformation[0]]["pred_total"]
for name, pred in zip(["naive", "mean", "VAR (scaled)"], [naive_pred, mean_pred, fittedvalues]):
fittedvalues = self.prediction_dfs[self.transformation[0]
]["pred_total"]
for name, pred in zip(["naive", "mean", "VAR (scaled)"],
[naive_pred, mean_pred, fittedvalues]):
eval_metrics = calculate_evaluation(self.data[self.y0], pred)
eval_df = pd.DataFrame(eval_metrics, index=[name])
benchmark = pd.concat([benchmark, eval_df], axis=0)

return benchmark


class RatioPipe(MultiTSData):
def __init__(self, country,
y_var,
@@ -204,7 +215,8 @@ def __init__(self, country,
Args:
country (str): The country.
y_var (str): The dependent variable.
x2 (str, optional): The variable with `y_var` to produce ratio. Defaults to "seats_arrivals_intl".
x2 (str, optional): The variable combined with `y_var` to produce the ratio. Defaults to
`seats_arrivals_intl`.
exog_var (str): The exogenous variable.
transform_method (str): The transformation method.
training_ratio (float): The training ratio.
@@ -213,6 +225,7 @@ def __init__(self, country,
self.x1 = y_var
self.x2 = x2
self.model = None
self.model_data = None
self.res = None
self.prediction = None
self.benchmark = None
@@ -267,18 +280,18 @@ def get_prediction(self):
self.prediction = pd.concat(
[self.model_data[select_cols], pred_df], axis=1).dropna()
self.prediction["pred_mean"] = self.prediction["mean"] * \
self.pred_df[self.x2]
self.prediction[self.x2]

return self.prediction

def get_benchmark_evaluation(self):
naive_pred = naive_method(self.pred_df[self.x1])
mean_pred = mean_method(self.pred_df[self.x1])
snaive_pred = seasonal_naive_method(self.pred_df[self.x1])
naive_pred = naive_method(self.prediction[self.x1])
mean_pred = mean_method(self.prediction[self.x1])
snaive_pred = seasonal_naive_method(self.prediction[self.x1])

benchmark = pd.DataFrame()
for idx, method in enumerate([naive_pred, mean_pred, snaive_pred, self.pred_df["pred_mean"]]):
metrics = calculate_evaluation(self.pred_df[self.x1], method)
for idx, pred in enumerate([naive_pred, mean_pred, snaive_pred, self.prediction["pred_mean"]]):
metrics = calculate_evaluation(self.prediction[self.x1], pred)
metrics_df = pd.DataFrame(metrics, index=[idx])
benchmark = pd.concat([benchmark, metrics_df], axis=0)
benchmark.index = ["naive", "mean", "seasonal naive", "ratio"]
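get_benchmark_evaluation scores the ratio model against naive, mean, and seasonal-naive baselines; the repo's ts_eval helpers are not shown in this diff, so the following is a sketch of the conventional definitions they presumably follow:

import numpy as np
import pandas as pd

def naive_forecast(y: pd.Series) -> pd.Series:
    # Each point is "predicted" by the previous observation.
    return y.shift(1)

def seasonal_naive_forecast(y: pd.Series, season: int = 12) -> pd.Series:
    # Each point is predicted by the value one season earlier.
    return y.shift(season)

def mean_forecast(y: pd.Series) -> pd.Series:
    # Every point is predicted by the overall historical mean.
    return pd.Series(np.full(len(y), y.mean()), index=y.index)

y = pd.Series([10.0, 12.0, 9.0, 14.0, 11.0])
print(naive_forecast(y).tolist())
print(mean_forecast(y).tolist())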