Commit: redeploy with pylint actions
ccxzhang committed Feb 2, 2024
1 parent 38db720 commit aa041fa
Showing 10 changed files with 202 additions and 88 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/pylint.yml
@@ -0,0 +1,23 @@
name: Pylint

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pylint
      - name: Analysing the code with pylint
        run: |
          pylint --recursive=y src/**/**.py src/**.py
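To reproduce this check before pushing, a rough local equivalent in Python (a sketch; it passes the package directory to --recursive=y instead of the shell globs used in the workflow step above):

import subprocess
import sys

# Run the same lint step the workflow performs, letting pylint walk the
# package tree itself rather than relying on shell glob expansion.
result = subprocess.run(
    ["pylint", "--recursive=y", "src"],
    capture_output=True,
    text=True,
    check=False,
)
print(result.stdout)
sys.exit(result.returncode)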
33 changes: 15 additions & 18 deletions src/meta.py
@@ -7,7 +7,16 @@

class FileMetaData:
"""
Class to extract metadata for different file types
Class to extract metadata for different file types.
To use:
>>> meta = FileMetaData("random.pdf")
>>> print(meta.extract())
------------
{'filename': 'random.pdf',
'created': datetime.datetime(2023, 11, 15, 14, 8, 21, 417070),
'modified': datetime.datetime(2023, 11, 13, 13, 28, 11, 750389),
'num_pages': 6}
"""

def __init__(self, file_path: str):
@@ -17,22 +26,13 @@ def __init__(self, file_path: str):
self.file_path = file_path

def extract(self) -> dict:
"""
Extracts and returns dict of metadata about file. The function is compatible with extracting
metadata information from .csv, .xlsx, and .pdf files.
""" Extracts and returns dict of metadata about file.
The function is compatible with extracting metadata information
from .csv, .xlsx, and .pdf files.
Returns:
metadata (dict): Dictionary containing file metadata
Example:
meta = FileMetaData("random.pdf")
print(meta.extract())
----------
{'filename': 'random.pdf',
'created': datetime.datetime(2023, 11, 15, 14, 8, 21, 417070),
'modified': datetime.datetime(2023, 11, 13, 13, 28, 11, 750389),
'num_pages': 6}
metadata (dict): Dictionary containing file metadata
"""
metadata: dict = {
"filename": os.path.basename(self.file_path),
@@ -77,6 +77,3 @@ def _extract_pdf_metadata(self, metadata: dict) -> None:
with open(self.file_path, 'rb') as f:
reader = PdfReader(f)
metadata['num_pages'] = len(reader.pages)



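The created/modified fields in the docstring's example output are standard filesystem timestamps; a minimal sketch of how they can be produced with the standard library (this part of extract is not shown in the diff, so the exact calls are an assumption):

import os
import datetime

file_path = "random.pdf"  # hypothetical file, matching the docstring example

# Filesystem timestamps converted to datetime objects, as in the example output.
created = datetime.datetime.fromtimestamp(os.path.getctime(file_path))
modified = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
print({"filename": os.path.basename(file_path),
       "created": created,
       "modified": modified})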
2 changes: 1 addition & 1 deletion src/tourism/combine.py
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
from scipy.optimize import nnls, minimize


def calculate_mse(predictions_df: pd.DataFrame, method: str) -> pd.Series:
@@ -42,7 +43,6 @@ def get_rpw(pred_df: pd.DataFrame,
def get_constrained_ls(y: pd.DataFrame,
X: pd.DataFrame) -> np.array:

from scipy.optimize import nnls, minimize

A, b = np.array(X), np.array(y)
x0, norm = nnls(A, b)
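The import moved to module level serves get_constrained_ls, which seeds its constrained fit with scipy's non-negative least squares; a standalone sketch of that call with toy data (the values are illustrative, not from the repo):

import numpy as np
from scipy.optimize import nnls

# Columns of A are candidate forecasts; b is the observed series (toy values).
A = np.array([[1.0, 0.9, 1.1],
              [2.0, 2.2, 1.8],
              [3.0, 3.1, 2.9]])
b = np.array([1.0, 2.0, 3.0])

# nnls returns the weights w >= 0 minimizing ||A @ w - b|| and the residual norm.
weights, residual_norm = nnls(A, b)
print(weights, residual_norm)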
34 changes: 22 additions & 12 deletions src/tourism/cross_validate.py
@@ -1,10 +1,18 @@
"""
The module provides a cross-validation class compatible with SARIMAX, VECM, and
VARMAX (TBD), with a choice of Rolling and SlidingWindow methods.
Last Modified:
2024-02-01
"""
from .scaler import ScaledLogitScaler, Differencing
from .ts_eval import calculate_evaluation
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.vector_ar.vecm import VECM
from pmdarima.model_selection import RollingForecastCV, SlidingWindowForecastCV
from tqdm import tqdm


class TimeSeriesCrossValidator:
def __init__(self,
method: str,
@@ -45,7 +53,7 @@ def _transform(self):
def _initialize_cv(self, hyper_params=None):
if self.cv_method == "Rolling":
self.cv = RollingForecastCV(
h=12, step=1, initial=hyper_params["inital"])
h=12, step=1, initial=hyper_params["initial"])
elif self.cv_method == "SlidingWindow":
self.cv = SlidingWindowForecastCV(
h=12, step=1, window_size=hyper_params["window_size"])
@@ -62,11 +70,11 @@ def _fit_model(self, endog, exog):
elif self.method == 'VECM':
select_order = self.model_params["select_order"]
model = VECM(endog,
exog=exog,
k_ar_diff=select_order,
coint_rank=1)
exog=exog,
k_ar_diff=select_order,
coint_rank=1)
return model.fit()

def _predict_model(self, res, steps, exog, last_train_value):

if self.method == "SARIMAX":
@@ -76,10 +84,10 @@ def _predict_model(self, res, steps, exog, last_train_value):
elif self.method == "VECM":
predictions = res.predict(steps=steps, exog_fc=exog)
if self.transformation == "difference":
predictions = self.scaler.inverse_transform(predictions, temporary=last_train_value)

return predictions if predictions is not None else None
predictions = self.scaler.inverse_transform(
predictions, temporary=last_train_value)

return predictions if predictions is not None else None

def cross_validate(self, hyper_params):
"""
@@ -101,16 +109,18 @@ def cross_validate(self, hyper_params):
for train_idx, test_idx in cv_splits:
train, test = self.transformed_data.iloc[train_idx], self.data.iloc[test_idx, 0]
exog_train = self.exog_data.iloc[train_idx,
:] if self.exog_data is not None else None
:] if self.exog_data is not None else None
exog_test = self.exog_data.iloc[test_idx,
:] if self.exog_data is not None else None
res = self._fit_model(train, exog_train)
predictions = self._predict_model(res, steps=len(exog_test), exog=exog_test,
last_train_value=train.iloc[-1, :].values)
last_train_value=train.iloc[-1, :].values)

if len(test) == len(predictions):
predictions = predictions.iloc[:, 0] if self.method != "SARIMAX" else predictions
eval_metrics = calculate_evaluation(test.values, predictions)
predictions = predictions.iloc[:,
0] if self.method != "SARIMAX" else predictions
eval_metrics = calculate_evaluation(
test.values, predictions)
errors.append(eval_metrics)
else:
raise AttributeError("The predicted data do not match the test data in length.")
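For orientation, the pmdarima splitters configured in _initialize_cv generate expanding (Rolling) or fixed-width (SlidingWindow) train/test index pairs; a minimal sketch with a toy series (initial=24 is an illustrative value for the hyper-parameter corrected above):

import numpy as np
from pmdarima.model_selection import RollingForecastCV

y = np.arange(48)  # stand-in for four years of monthly observations

# Same fixed settings as _initialize_cv: 12-step horizon, advancing one step per fold.
cv = RollingForecastCV(h=12, step=1, initial=24)
for train_idx, test_idx in cv.split(y):
    print(f"train size {len(train_idx)}, test {test_idx[0]}..{test_idx[-1]}")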
14 changes: 12 additions & 2 deletions src/tourism/data.py
@@ -1,5 +1,9 @@
"""
The module provides loaders for country-level visitor data, COVID data,
Google Trends data, and model-required data.
Last Modified:
2024-02-01
"""

import os
@@ -162,6 +166,11 @@ def process_aviation_data(self):


class SARIMAXData:
"""
A class for preparing and managing time series data for SARIMAX. This class is
designed to handle the loading, processing, and merging of country-specific
economic, COVID Stringency Index, and Google Trends data.
"""
def __init__(self,
country: str,
y_var: str,
@@ -232,6 +241,7 @@ def __init__(self, country: str,
aviation_path: str = DEFAULT_AVIATION_DATA_PATH):
super().__init__(country, y_var, exog_var)
self.aviation_path = aviation_path
self.avi_data = None
self.aviation_data_loader = AviationDataLoader(
self.country, select_col, self.aviation_path)

@@ -241,6 +251,6 @@ def read_and_merge(self):
"""
# Inherit from the read_and_merge method from SARIMAXData
super().read_and_merge()
self.avi = self.aviation_data_loader.process_aviation_data()
self.data = (self.data.merge(self.avi, how="outer", on="date")
self.avi_data = self.aviation_data_loader.process_aviation_data()
self.data = (self.data.merge(self.avi_data, how="outer", on="date")
.reset_index(drop=True))
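The corrected read_and_merge keeps the parent class's outer-merge-on-date pattern; a toy illustration of why how="outer" matters here (column names other than seats_arrivals_intl are hypothetical):

import pandas as pd

visitors = pd.DataFrame({
    "date": pd.date_range("2023-01-01", periods=3, freq="MS"),
    "total": [100, 120, 90],
})
aviation = pd.DataFrame({
    "date": pd.date_range("2023-02-01", periods=3, freq="MS"),
    "seats_arrivals_intl": [2000, 1800, 2100],
})

# An outer merge keeps months present in either source; gaps become NaN
# instead of silently dropping rows.
merged = visitors.merge(aviation, how="outer", on="date").reset_index(drop=True)
print(merged)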
73 changes: 43 additions & 30 deletions src/tourism/mtsmodel.py
@@ -7,10 +7,12 @@
from statsmodels.tsa.api import VARMAX
import statsmodels.formula.api as smf
from statsmodels.tsa.vector_ar.vecm import select_order, VECM
from .scaler import ScaledLogitScaler
from .ts_eval import (naive_method, mean_method, seasonal_naive_method, calculate_evaluation)
from .scaler import ScaledLogitScaler, Differencing
from .ts_eval import (naive_method, mean_method,
seasonal_naive_method, calculate_evaluation)
from .ts_utils import cointegration_test, get_adf_df
from .data import *
from .data import (MultiTSData, TRENDS_DATA_FOLDER,
COVID_DATA_PATH, DEFAULT_AVIATION_DATA_PATH)

__all__ = [
"VARPipeline",
@@ -36,11 +38,14 @@ def __init__(self,
self.raw_data = None
self.transformed_data = None
self.scaler = None
self.differencer = None
self.method = None
self.transformation = []
self.is_stationary = None
self.is_cointegrated = None
self.test_results = {'stationarity': {}, 'cointegration': {}}
self.fitted_models = None
self.prediction_dfs = None

@staticmethod
def test_stationarity(df, y_var) -> bool:
@@ -59,12 +64,13 @@ def transform_data(self):
Transform the time series data based on the specified method.
"""
transformed_data = {}
transformed_data["original"] = self.data[self.y_var].dropna()
transformed_data["difference"] = self.data[self.y_var].diff().dropna()
self.scaler = ScaledLogitScaler()
self.scaler.fit(self.data[self.y_var].dropna())
transformed_data["scaledlogit"] = self.scaler.transform(
self.data[self.y_var].dropna())
self.differencer = Differencing()
original = self.data[self.y_var].dropna()
transformed_data["original"] = original
transformed_data["difference"] = self.differencer.transform(original)
self.scaler.fit(original)
transformed_data["scaledlogit"] = self.scaler.transform(original)
return transformed_data

def determine_analysis_method(self):
@@ -125,9 +131,9 @@ def fit_varma(endog_data, exog_data):
(p, q), tr = sorted_results[0]["model"]

model = VARMAX(endog=endog_data,
exog=exog_data,
order=(p, q),
trend=tr)
exog=exog_data,
order=(p, q),
trend=tr)
fitted_model = model.fit(disp=False)
return fitted_model, grid_search_results

@@ -167,32 +173,37 @@ def fit(self):
predict_df = pd.DataFrame(mod.model.endog, columns=self.y_var)
predict_df["pred_total"] = np.NaN
if self.method == "VARMAX":
predict_df.loc[mod.model.k_ar:, "pred_total"] = mod.fittedvalues.iloc[:, 0].tolist()
predict_df.loc[mod.model.k_ar:, "pred_total"] = mod.fittedvalues[:, 0]
predict_df['date'] = pd.date_range(start="2019-01-01", periods=len(predict_df), freq='MS')
predict_df.loc[mod.model.k_ar:,
"pred_total"] = mod.fittedvalues.iloc[:, 0].tolist()
predict_df.loc[mod.model.k_ar:,
"pred_total"] = mod.fittedvalues[:, 0]
predict_df['date'] = pd.date_range(
start="2019-01-01", periods=len(predict_df), freq='MS')
self.prediction_dfs[t_type] = predict_df


def plot_comparison(self):
for t_type, pred_df in self.prediction_dfs.items():
fig, ax = plt.subplots(figsize=(8, 6))
pred_df.plot(x="date", y=["total", "pred_total"], ax=ax)
return fig

def evaluate_models(self):
"""
Compare models to benchmark evaluation methods.
Returns:
benchmark (pd.DataFrame): contains `naive` and `mean` methods for forecasting.
"""
naive_pred = naive_method(self.data[self.y0])
mean_pred = mean_method(self.data[self.y0])

benchmark = pd.DataFrame()
if len(self.prediction_dfs) == 1:
fittedvalues = self.prediction_dfs[self.transformation[0]]["pred_total"]
for name, pred in zip(["naive", "mean", "VAR (scaled)"], [naive_pred, mean_pred, fittedvalues]):
fittedvalues = self.prediction_dfs[self.transformation[0]
]["pred_total"]
for name, pred in zip(["naive", "mean", "VAR (scaled)"],
[naive_pred, mean_pred, fittedvalues]):
eval_metrics = calculate_evaluation(self.data[self.y0], pred)
eval_df = pd.DataFrame(eval_metrics, index=[name])
benchmark = pd.concat([benchmark, eval_df], axis=0)

return benchmark


class RatioPipe(MultiTSData):
def __init__(self, country,
y_var,
@@ -204,7 +215,8 @@ def __init__(self, country,
Args:
country (str): The country.
y_var (str): The dependent variable.
x2 (str, optional): The variable with `y_var` to produce ratio. Defaults to "seats_arrivals_intl".
x2 (str, optional): The variable combined with `y_var` to produce the ratio. Defaults to
`seats_arrivals_intl`.
exog_var (str): The exogenous variable.
transform_method (str): The transformation method.
training_ratio (float): The training ratio.
@@ -213,6 +225,7 @@ def __init__(self, country,
self.x1 = y_var
self.x2 = x2
self.model = None
self.model_data = None
self.res = None
self.prediction = None
self.benchmark = None
@@ -267,18 +280,18 @@ def get_prediction(self):
self.prediction = pd.concat(
[self.model_data[select_cols], pred_df], axis=1).dropna()
self.prediction["pred_mean"] = self.prediction["mean"] * \
self.pred_df[self.x2]
self.prediction[self.x2]

return self.prediction

def get_benchmark_evaluation(self):
naive_pred = naive_method(self.pred_df[self.x1])
mean_pred = mean_method(self.pred_df[self.x1])
snaive_pred = seasonal_naive_method(self.pred_df[self.x1])
naive_pred = naive_method(self.prediction[self.x1])
mean_pred = mean_method(self.prediction[self.x1])
snaive_pred = seasonal_naive_method(self.prediction[self.x1])

benchmark = pd.DataFrame()
for idx, method in enumerate([naive_pred, mean_pred, snaive_pred, self.pred_df["pred_mean"]]):
metrics = calculate_evaluation(self.pred_df[self.x1], method)
for idx, pred in enumerate([naive_pred, mean_pred, snaive_pred, self.prediction["pred_mean"]]):
metrics = calculate_evaluation(self.prediction[self.x1], pred)
metrics_df = pd.DataFrame(metrics, index=[idx])
benchmark = pd.concat([benchmark, metrics_df], axis=0)
benchmark.index = ["naive", "mean", "seasonal naive", "ratio"]
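get_benchmark_evaluation scores the ratio model against naive, mean, and seasonal-naive baselines; the repo's ts_eval helpers are not shown in this diff, so the following is a sketch of the conventional definitions they presumably follow:

import numpy as np
import pandas as pd

def naive_forecast(y: pd.Series) -> pd.Series:
    # Each point is "predicted" by the previous observation.
    return y.shift(1)

def seasonal_naive_forecast(y: pd.Series, season: int = 12) -> pd.Series:
    # Each point is predicted by the value one season earlier.
    return y.shift(season)

def mean_forecast(y: pd.Series) -> pd.Series:
    # Every point is predicted by the overall historical mean.
    return pd.Series(np.full(len(y), y.mean()), index=y.index)

y = pd.Series([10.0, 12.0, 9.0, 14.0, 11.0])
print(naive_forecast(y).tolist())
print(mean_forecast(y).tolist())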