From 28eb979ab6c436a141b8a97cba81bd3aa7564e0d Mon Sep 17 00:00:00 2001 From: Arnaud Gelas Date: Wed, 3 Nov 2021 21:06:28 +0100 Subject: [PATCH] Add poetry support for dependency management black, isort, flake8, mypy and bandit are added as dev dependencies. --- .flake8 | 11 ++ .github/workflows/build.yml | 43 +++++++ .github/workflows/pylint.yml | 23 ---- .github/workflows/pypi.yml | 34 +++-- .gitignore | 2 +- .pylintrc | 6 +- pyproject.toml | 65 +++++++++- setup.py | 77 +++++------ setup.sh | 37 ++++++ src/opensignals/__about__.py | 12 +- src/opensignals/__main__.py | 12 +- src/opensignals/data/yahoo.py | 233 +++++++++++++++++----------------- src/opensignals/features.py | 175 +++++++++++++------------ src/opensignals/utils.py | 10 +- tasks.py | 58 +++++++++ 15 files changed, 487 insertions(+), 311 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/build.yml delete mode 100644 .github/workflows/pylint.yml create mode 100644 setup.sh create mode 100755 tasks.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..0cbbc36 --- /dev/null +++ b/.flake8 @@ -0,0 +1,11 @@ +[flake8] +count = True +show-source = True +statistics = True +extend-exclude=.venv, .github +select=E9,F63,F7,F82 +max-line-length = 100 +max-complexity = 10 + +#see https://flake8.pycqa.org/en/latest/user/options.html +~ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..e005b1f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,43 @@ +name: build + +on: + push: + branches-ignore: + - "dependabot/**" + pull_request: + release: + types: [created] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install poetry + uses: snok/install-poetry@v1.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + - name: Setup + run: bash setup.sh + - name: Check format with black + run: poetry run invoke check-format-with-black + - name: Check sorted import module + run: poetry run invoke sort-imports-with-isort + - name: Lint with pylint + run: poetry run invoke lint-with-pylint + - name: Lint with flake8 + run: poetry run invoke lint-with-flake8 + - name: Type checking with mypy + run: poetry run invoke lint-with-mypy + - name: Build + run: poetry build + - name: Publish Dry Run + if: github.event_name == 'push' + run: poetry publish --username="${{ secrets.PYPI_USERNAME }}" --password="${{ secrets.PYPI_PASSWORD }}" --dry-run + \ No newline at end of file diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index 6452986..0000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Pylint - -on: [push] - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pylint - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Analysing the code with pylint - run: | - python -m pylint --fail-under=9 `find -regextype egrep -regex '(.*.py)$'` diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 0d438ee..e9d71c8 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -10,27 +10,23 @@ jobs: name: Build and publish Python 🐍 distributions 📦 to 
PyPI and TestPyPI runs-on: ubuntu-18.04 steps: - - uses: actions/checkout@master - - name: Set up Python 3.7 - uses: actions/setup-python@v2.2.2 + - name: Check out repository + uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 with: - python-version: 3.7 - - name: Install pypa/build - run: >- - python -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - . + python-version: 3.9 + - name: Install poetry + uses: snok/install-poetry@v1.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + - name: Setup + run: bash setup.sh + - name: Build + run: poetry build - name: Publish distribution 📦 to PyPI if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master + run: poetry publish with: password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 69c9c04..97e0a7f 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,4 @@ cython_debug/ db/ .idea/ *.csv - +.invoke-completion.sh diff --git a/.pylintrc b/.pylintrc index c9d9b5b..b9a7a47 100644 --- a/.pylintrc +++ b/.pylintrc @@ -17,14 +17,14 @@ extension-pkg-whitelist= fail-on= # Specify a score threshold to be exceeded before program exits with error. -fail-under=10.0 +fail-under=9.0 # Files or directories to be skipped. They should be base names, not paths. -ignore=CVS +ignore= # Add files or directories matching the regex patterns to the ignore-list. The # regex matches against paths. -ignore-paths= +ignore-paths=.venv/*, .git/*, .github/* # Files or directories matching the regex patterns are skipped. The regex # matches against base names, not paths. diff --git a/pyproject.toml b/pyproject.toml index 2480ca2..0b3e8ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,63 @@ +[tool.poetry] +name = "opensignals" +version = "0.0.2" +description = "" +authors = ["Jordi Villar "] + +[tool.poetry.urls] +Issues = "https://github.com/councilofelders/opensignals/issues" + +[tool.poetry.dependencies] +python = ">=3.8,<3.11" +docopt = "^0.6.2" +pandas = "^1.3.4" +numpy = "^1.21.3" +pyarrow = "^6.0.0" +requests = "^2.26.0" +tqdm = "^4.62.3" + +[tool.poetry.dev-dependencies] +pylint = "^2.11.1" +black = "^21.10b0" +isort = "^5.10.0" +invoke = "^1.6.0" +bandit = "^1.7.0" +flake8 = "^4.0.1" +mypy = "^0.910" +types-python-dateutil = "^2.8.2" + [build-system] -requires = [ - 'setuptools>=42', - 'wheel' +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 100 + +[tool.isort] +profile = "black" + +[tool.mypy] +files = "src/**/*.py" +exclude = ".venv/" +allow_untyped_decorators = false +warn_unused_configs = true +allow_subclassing_any = false +#allow_untyped_calls = false +#allow_untyped_defs = false +allow_incomplete_defs = false +check_untyped_defs = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_return_any = true +ignore_missing_imports = false +pretty = true + +[[tool.mypy.overrides]] +module = [ + "pandas", + "requests", + "tqdm", + "docopt" ] -build-backend = 'setuptools.build_meta' \ No newline at end of file +ignore_missing_imports = true diff --git a/setup.py b/setup.py index 1a1f1c9..9b517d0 100644 --- a/setup.py +++ b/setup.py @@ -1,65 +1,66 @@ from pathlib import Path -from setuptools import setup, find_packages -metadata_path = Path(__file__).parent / 'src' / 'opensignals' / '__about__.py' +from setuptools import 
find_packages, setup + +metadata_path = Path(__file__).parent / "src" / "opensignals" / "__about__.py" metadata = {} with metadata_path.open() as file: raw_code = file.read() exec(raw_code, metadata) -metadata = {key.strip('_'): value for key, value in metadata.items()} -metadata['name'] = metadata.pop('package_name') +metadata = {key.strip("_"): value for key, value in metadata.items()} +metadata["name"] = metadata.pop("package_name") setup( - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - url='https://github.com/councilofelders/opensignals', + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/councilofelders/opensignals", project_urls={ - 'Bug Tracker': 'https://github.com/councilofelders/opensignals/issues', + "Bug Tracker": "https://github.com/councilofelders/opensignals/issues", }, - packages=find_packages('src'), - package_dir={'': 'src'}, - py_modules=[path.stem for path in Path('src').glob('*.py')], - python_requires='>=3.7', + packages=find_packages("src"), + package_dir={"": "src"}, + py_modules=[path.stem for path in Path("src").glob("*.py")], + python_requires=">=3.7", zip_safe=True, include_package_data=True, install_requires=[ - 'docopt>=0.6.2', - 'pandas>=1.2.2', - 'numpy>=1.20.1', - 'pyarrow>=3.0.0', - 'requests', - 'tqdm', + "docopt>=0.6.2", + "pandas>=1.2.2", + "numpy>=1.20.1", + "pyarrow>=3.0.0", + "requests", + "tqdm", ], extras_require=dict( test=[ - 'pytest==5.1.2', - 'pytest-cov==2.7.1', - 'pytest-flake8==1.0.6', - 'pytest-mypy==0.4.0', - 'pydocstyle==4.0.1', - 'pep8-naming==0.8.1', - 'pytest-docstyle==2.0.0', - 'flake8 == 3.8.1', + "pytest==5.1.2", + "pytest-cov==2.7.1", + "pytest-flake8==1.0.6", + "pytest-mypy==0.4.0", + "pydocstyle==4.0.1", + "pep8-naming==0.8.1", + "pytest-docstyle==2.0.0", + "flake8 == 3.8.1", ], ), entry_points={ - 'console_scripts': [ - 'opensignals = opensignals.__main__:main', + "console_scripts": [ + "opensignals = opensignals.__main__:main", ] }, **metadata, classifiers=[ - 'License :: OSI Approved :: Apache Software License', - 'Development Status :: 3 - Alpha', + "License :: OSI Approved :: Apache Software License", + "Development Status :: 3 - Alpha", # 'Development Status :: 4 - Beta', # 'Development Status :: 5 - Production/Stable', - 'Operating System :: OS Independent', - 'Intended Audience :: Developers', - 'Topic :: Office/Business :: Financial', - 'Topic :: Office/Business :: Financial :: Investment', - 'Topic :: Scientific/Engineering :: Interface Engine/Protocol Translator', - 'Topic :: Software Development :: Libraries', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Programming Language :: Python :: 3', + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Topic :: Office/Business :: Financial", + "Topic :: Office/Business :: Financial :: Investment", + "Topic :: Scientific/Engineering :: Interface Engine/Protocol Translator", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3", ], ) diff --git a/setup.sh b/setup.sh new file mode 100644 index 0000000..2f0691e --- /dev/null +++ b/setup.sh @@ -0,0 +1,37 @@ +#! /usr/bin/bash +echo "install poetry if not in path" +if ! command -v poetry &> /dev/null +then + echo "poetry is missing. Installing ..." 
+ curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python - + source "$HOME/.poetry/env" +fi + +echo "setting up poetry config" +poetry config virtualenvs.path .venv +poetry config virtualenvs.create true +poetry config virtualenvs.in-project true + +echo "installing dependencies" +poetry install --no-interaction --no-root + +echo "installing libraries" +poetry install --no-interaction + +if [ -n "`$SHELL -c 'echo $ZSH_VERSION'`" ]; then + shell="zsh" +elif [ -n "`$SHELL -c 'echo $BASH_VERSION'`" ]; then + shell="bash" +else + # assume something else + exit +fi + +echo "setup tab completion for invoke for $shell" +# see http://docs.pyinvoke.org/en/stable/invoke.html#shell-tab-completion +poetry run invoke --print-completion-script $shell > .invoke-completion.sh + +echo "To activate the tab completion, run the following command:" +echo "" +echo " $ source .invoke-completion.sh" +echo "" diff --git a/src/opensignals/__about__.py b/src/opensignals/__about__.py index 7d2bf52..38aac36 100644 --- a/src/opensignals/__about__.py +++ b/src/opensignals/__about__.py @@ -1,10 +1,10 @@ """Contain information about the project.""" -__package_name__ = 'opensignals' -__version__ = '0.0.2' +__package_name__ = "opensignals" +__version__ = "0.0.2" -__description__ = 'Open Signals' +__description__ = "Open Signals" -__author__ = 'Jordi Villar' -__author_email__ = 'jrdi.villar@gmail.com' +__author__ = "Jordi Villar" +__author_email__ = "jrdi.villar@gmail.com" -__license__ = 'Apache License' +__license__ = "Apache License" diff --git a/src/opensignals/__main__.py b/src/opensignals/__main__.py index 326338d..0614da4 100644 --- a/src/opensignals/__main__.py +++ b/src/opensignals/__main__.py @@ -1,4 +1,3 @@ - """opensignals Usage: @@ -25,7 +24,6 @@ from opensignals import __version__ from opensignals.data import yahoo -from opensignals.features import RSI def main(argv: Optional[List[str]] = None) -> None: @@ -40,16 +38,16 @@ def main(argv: Optional[List[str]] = None) -> None: """ args = docopt(__doc__, version=__version__) - if args['--verbose'] and int(args['--verbose']) > 1: + if args["--verbose"] and int(args["--verbose"]) > 1: logging.basicConfig(level=logging.DEBUG) - elif args['--verbose'] and int(args['--verbose']) == 0: + elif args["--verbose"] and int(args["--verbose"]) == 0: logging.basicConfig(level=logging.WARNING) else: logging.basicConfig(level=logging.INFO) - if args['download']: - yahoo.download_data(Path(args['--dir']), args['--recreate']) + if args["download"]: + yahoo.download_data(Path(args["--dir"]), args["--recreate"]) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/opensignals/data/yahoo.py b/src/opensignals/data/yahoo.py index 74349fb..4e92a70 100644 --- a/src/opensignals/data/yahoo.py +++ b/src/opensignals/data/yahoo.py @@ -1,106 +1,100 @@ +import logging import random import shutil -from datetime import datetime, date, time -import logging import time as _time from concurrent import futures +from datetime import date, datetime, time import numpy as np import pandas as pd import requests +from dateutil.relativedelta import FR, relativedelta from tqdm import tqdm -from dateutil.relativedelta import relativedelta, FR from opensignals import utils logger = logging.getLogger(__name__) -AWS_BASE_URL = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com' -SIGNALS_UNIVERSE = f'{AWS_BASE_URL}/latest_universe.csv' -SIGNALS_TICKER_MAP = f'{AWS_BASE_URL}/signals_ticker_map_w_bbg.csv' -SIGNALS_TARGETS = 
f'{AWS_BASE_URL}/signals_train_val_bbg.csv' +AWS_BASE_URL = "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com" +SIGNALS_UNIVERSE = f"{AWS_BASE_URL}/latest_universe.csv" +SIGNALS_TICKER_MAP = f"{AWS_BASE_URL}/signals_ticker_map_w_bbg.csv" +SIGNALS_TARGETS = f"{AWS_BASE_URL}/signals_train_val_bbg.csv" def get_tickers(): ticker_map = pd.read_csv(SIGNALS_TICKER_MAP) - ticker_map = ticker_map.dropna(subset=['yahoo']) - logger.info(f'Number of eligible tickers: {ticker_map.shape[0]}') + ticker_map = ticker_map.dropna(subset=["yahoo"]) + logger.info(f"Number of eligible tickers: {ticker_map.shape[0]}") - if ticker_map['yahoo'].duplicated().any(): + if ticker_map["yahoo"].duplicated().any(): num = ticker_map["yahoo"].duplicated().values().sum() - raise Exception( - f'Found duplicated {num}' - ' yahoo tickers' - ) + raise Exception(f"Found duplicated {num}" " yahoo tickers") - if ticker_map['bloomberg_ticker'].duplicated().any(): + if ticker_map["bloomberg_ticker"].duplicated().any(): num = ticker_map["bloomberg_ticker"].duplicated().values().sum() - raise Exception( - f'Found duplicated {num}' - ' bloomberg_ticker tickers' - ) + raise Exception(f"Found duplicated {num}" " bloomberg_ticker tickers") return ticker_map def get_ticker_data(db_dir): - ticker_data = pd.DataFrame({ - 'bloomberg_ticker': pd.Series([], dtype='str'), - 'date': pd.Series([], dtype='datetime64[ns]') - }) - if len(list(db_dir.rglob('*.parquet'))) > 0: + ticker_data = pd.DataFrame( + { + "bloomberg_ticker": pd.Series([], dtype="str"), + "date": pd.Series([], dtype="datetime64[ns]"), + } + ) + if len(list(db_dir.rglob("*.parquet"))) > 0: ticker_data = pd.read_parquet(db_dir) num = ticker_data.bloomberg_ticker.unique().shape[0] - logger.info(f'Retrieving data for {num} tickers from the database') + logger.info(f"Retrieving data for {num} tickers from the database") return ticker_data def get_ticker_missing( - ticker_data, - ticker_map, - last_friday=datetime.today() - relativedelta(weekday=FR(-1))): - tickers_available_data = ticker_data.groupby('bloomberg_ticker').agg({'date': [max, min]}) - tickers_available_data.columns = ['date_max', 'date_min'] + ticker_data, + ticker_map, + last_friday=datetime.today() - relativedelta(weekday=FR(-1)), +): + tickers_available_data = ticker_data.groupby("bloomberg_ticker").agg({"date": [max, min]}) + tickers_available_data.columns = ["date_max", "date_min"] eligible_tickers_available_data = ticker_map.merge( - tickers_available_data.reset_index(), - on='bloomberg_ticker', - how='left' + tickers_available_data.reset_index(), on="bloomberg_ticker", how="left" ) ticker_not_found = eligible_tickers_available_data.loc[ - eligible_tickers_available_data.date_max.isna(), ['bloomberg_ticker', 'yahoo'] + eligible_tickers_available_data.date_max.isna(), ["bloomberg_ticker", "yahoo"] ] - ticker_not_found['start'] = '2002-12-01' + ticker_not_found["start"] = "2002-12-01" last_friday_52 = last_friday - relativedelta(weeks=52) tickers_outdated = eligible_tickers_available_data.loc[ ( - (eligible_tickers_available_data.date_max < last_friday.strftime('%Y-%m-%d')) & - (eligible_tickers_available_data.date_max > last_friday_52.strftime('%Y-%m-%d')) + (eligible_tickers_available_data.date_max < last_friday.strftime("%Y-%m-%d")) + & (eligible_tickers_available_data.date_max > last_friday_52.strftime("%Y-%m-%d")) ), - ['bloomberg_ticker', 'yahoo', 'date_max'] + ["bloomberg_ticker", "yahoo", "date_max"], ] - tickers_outdated['start'] = ( - tickers_outdated['date_max'] + pd.DateOffset(1) - 
).dt.strftime('%Y-%m-%d') - tickers_outdated.drop(columns=['date_max'], inplace=True) - - return pd.concat( - [ticker_not_found, tickers_outdated] + tickers_outdated["start"] = (tickers_outdated["date_max"] + pd.DateOffset(1)).dt.strftime( + "%Y-%m-%d" ) + tickers_outdated.drop(columns=["date_max"], inplace=True) + + return pd.concat([ticker_not_found, tickers_outdated]) def get_data( - db_dir, - features_generators=None, - last_friday=datetime.today() - relativedelta(weekday=FR(-1)), - target='target_20d', - feature_prefix=None): + db_dir, + features_generators=None, + last_friday=datetime.today() - relativedelta(weekday=FR(-1)), + target="target_20d", + feature_prefix=None, +): """generate data set""" if features_generators is None: @@ -109,34 +103,29 @@ def get_data( ticker_data = get_ticker_data(db_dir) ticker_universe = pd.read_csv(SIGNALS_UNIVERSE) - ticker_data = ticker_data[ticker_data.bloomberg_ticker.isin( - ticker_universe['bloomberg_ticker'])] + ticker_data = ticker_data[ + ticker_data.bloomberg_ticker.isin(ticker_universe["bloomberg_ticker"]) + ] targets = pd.read_csv(SIGNALS_TARGETS) - targets['date'] = pd.to_datetime( - targets['friday_date'], - format='%Y%m%d' - ) - targets['target'] = targets[target] + targets["date"] = pd.to_datetime(targets["friday_date"], format="%Y%m%d") + targets["target"] = targets[target] feature_names = [] for features_generator in features_generators: - ticker_data, feature_names_aux = features_generator.generate_features(ticker_data, feature_prefix) + ticker_data, feature_names_aux = features_generator.generate_features( + ticker_data, feature_prefix + ) feature_names.extend(feature_names_aux) # merge our feature data with Numerai targets - ml_data = pd.merge( - ticker_data, targets, - on=['date', 'bloomberg_ticker'], - how='left' - ) + ml_data = pd.merge(ticker_data, targets, on=["date", "bloomberg_ticker"], how="left") - logger.info(f'Found {ml_data.target.isna().sum()}' - 'rows without target, filling with 0.5') - ml_data['target'] = ml_data['target'].fillna(0.5) + logger.info(f"Found {ml_data.target.isna().sum()}" "rows without target, filling with 0.5") + ml_data["target"] = ml_data["target"].fillna(0.5) # convert date to datetime and index on it - ml_data = ml_data.set_index('date') + ml_data = ml_data.set_index("date") # for training and testing we want clean, complete data only ml_data = ml_data.dropna(subset=feature_names) @@ -146,16 +135,16 @@ def get_data( ml_data = ml_data[ml_data.index.value_counts() > 50] # train test split - train_data = ml_data[ml_data['data_type'] == 'train'] - test_data = ml_data[ml_data['data_type'] == 'validation'] + train_data = ml_data[ml_data["data_type"] == "train"] + test_data = ml_data[ml_data["data_type"] == "validation"] # generate live data - date_string = last_friday.strftime('%Y-%m-%d') + date_string = last_friday.strftime("%Y-%m-%d") live_data = ticker_data[ticker_data.date == date_string].copy() # get data from the day before, for markets that were closed last_thursday = last_friday - relativedelta(days=1) - thursday_date_string = last_thursday.strftime('%Y-%m-%d') + thursday_date_string = last_thursday.strftime("%Y-%m-%d") thursday_data = ticker_data[ticker_data.date == thursday_date_string] # Only select tickers than aren't already present in live_data @@ -164,26 +153,28 @@ def get_data( ].copy() live_data = pd.concat([live_data, thursday_data]) - live_data = live_data.set_index('date') + live_data = live_data.set_index("date") return train_data, test_data, live_data, feature_names def 
download_tickers(tickers, start): - start_epoch = int(datetime.strptime(start, '%Y-%m-%d').timestamp()) + start_epoch = int(datetime.strptime(start, "%Y-%m-%d").timestamp()) end_epoch = int(datetime.combine(date.today(), time()).timestamp()) - pbar = tqdm( - total=len(tickers), - unit='tickers' - ) + pbar = tqdm(total=len(tickers), unit="tickers") dfs = {} with futures.ThreadPoolExecutor() as executor: _futures = [] for ticker in tickers: _futures.append( - executor.submit(download_ticker, ticker=ticker, start_epoch=start_epoch, end_epoch=end_epoch) + executor.submit( + download_ticker, + ticker=ticker, + start_epoch=start_epoch, + end_epoch=end_epoch, + ) ) for future in futures.as_completed(_futures): @@ -198,31 +189,38 @@ def download_tickers(tickers, start): def download_ticker(ticker, start_epoch, end_epoch): """dowload data for a given ticker""" + def empty_df(): - return pd.DataFrame(columns=[ - "date", "bloomberg_ticker", - "open", "high", "low", "close", - "adj_close", "volume", "currency", "provider"]) + return pd.DataFrame( + columns=[ + "date", + "bloomberg_ticker", + "open", + "high", + "low", + "close", + "adj_close", + "volume", + "currency", + "provider", + ] + ) retries = 20 tries = retries + 1 backoff = 1 - url = f'https://query2.finance.yahoo.com/v8/finance/chart/{ticker}' + url = f"https://query2.finance.yahoo.com/v8/finance/chart/{ticker}" user_agent = random.choice(utils.USER_AGENTS) params = dict( period1=start_epoch, period2=end_epoch, - interval='1d', - events='div,splits', + interval="1d", + events="div,splits", ) while tries > 0: tries -= 1 try: - data = requests.get( - url=url, - params=params, - headers={'User-Agent': user_agent} - ) + data = requests.get(url=url, params=params, headers={"User-Agent": user_agent}) data_json = data.json() quotes = data_json["chart"]["result"][0] if "timestamp" not in quotes: @@ -240,18 +238,20 @@ def empty_df(): if "adjclose" in quotes["indicators"]: adjclose = quotes["indicators"]["adjclose"][0]["adjclose"] - df = pd.DataFrame({ - "date": pd.to_datetime(timestamps, unit="s").normalize(), - "bloomberg_ticker": ticker, - "open": np.array(opens, dtype='float32'), - "high": np.array(highs, dtype='float32'), - "low": np.array(lows, dtype='float32'), - "close": np.array(closes, dtype='float32'), - "adj_close": np.array(adjclose, dtype='float32'), - "volume": np.array(volumes, dtype='float32'), - "currency": quotes['meta']['currency'], - "provider": 'yahoo' - }) + df = pd.DataFrame( + { + "date": pd.to_datetime(timestamps, unit="s").normalize(), + "bloomberg_ticker": ticker, + "open": np.array(opens, dtype="float32"), + "high": np.array(highs, dtype="float32"), + "low": np.array(lows, dtype="float32"), + "close": np.array(closes, dtype="float32"), + "adj_close": np.array(adjclose, dtype="float32"), + "volume": np.array(volumes, dtype="float32"), + "currency": quotes["meta"]["currency"], + "provider": "yahoo", + } + ) return ticker, df.drop_duplicates().dropna() @@ -264,7 +264,7 @@ def empty_df(): def download_data(db_dir, recreate=False): if recreate: - logging.warning(f'Removing dataset {db_dir} to recreate it') + logging.warning(f"Removing dataset {db_dir} to recreate it") shutil.rmtree(db_dir, ignore_errors=True) db_dir.mkdir(exist_ok=True) @@ -275,17 +275,17 @@ def download_data(db_dir, recreate=False): n_ticker_missing = ticker_missing.shape[0] if n_ticker_missing <= 0: - logger.info('Dataset up to date') + logger.info("Dataset up to date") return - logger.info(f'Downloading missing data for {n_ticker_missing} tickers') + 
logger.info(f"Downloading missing data for {n_ticker_missing} tickers") - ticker_missing_grouped = ticker_missing.groupby('start').apply( - lambda x: ' '.join(x.yahoo.astype(str)) + ticker_missing_grouped = ticker_missing.groupby("start").apply( + lambda x: " ".join(x.yahoo.astype(str)) ) concat_dfs = [] for start_date, tickers in ticker_missing_grouped.iteritems(): - temp_df = download_tickers(tickers.split(' '), start=start_date) + temp_df = download_tickers(tickers.split(" "), start=start_date) # Yahoo Finance returning previous day in some situations # (e.g. Friday in TelAviv markets) @@ -293,22 +293,23 @@ def download_data(db_dir, recreate=False): if temp_df.empty: continue - temp_df['created_at'] = datetime.now() - temp_df['volume'] = temp_df['volume'].astype('float64') - temp_df['bloomberg_ticker'] = temp_df['bloomberg_ticker'].map( - dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker']))) + temp_df["created_at"] = datetime.now() + temp_df["volume"] = temp_df["volume"].astype("float64") + temp_df["bloomberg_ticker"] = temp_df["bloomberg_ticker"].map( + dict(zip(ticker_map["yahoo"], ticker_map["bloomberg_ticker"])) + ) concat_dfs.append(temp_df) if len(concat_dfs) == 0: - logger.info('Dataset up to date') + logger.info("Dataset up to date") return df = pd.concat(concat_dfs) n_ticker_data = df.bloomberg_ticker.unique().shape[0] if n_ticker_data <= 0: - logger.info('Dataset up to date') + logger.info("Dataset up to date") return - logger.info(f'Storing data for {n_ticker_data} tickers') - df.to_parquet(db_dir / f'{datetime.utcnow().timestamp()}.parquet', index=False) + logger.info(f"Storing data for {n_ticker_data} tickers") + df.to_parquet(db_dir / f"{datetime.utcnow().timestamp()}.parquet", index=False) diff --git a/src/opensignals/features.py b/src/opensignals/features.py index 100c663..fa0110f 100644 --- a/src/opensignals/features.py +++ b/src/opensignals/features.py @@ -7,21 +7,21 @@ class VarChange: - def __init__(self, num_days=1, variable='adj_close'): + def __init__(self, num_days=1, variable="adj_close"): self.num_days = num_days self.variable = variable def generate_features(self, ticker_data): - logger.info(f'generating var change {self.num_days} ' - f'for {self.variable}...') - feature_prefix_name = f'{self.variable}_x{self.num_days}' - ticker_groups = ticker_data.groupby('bloomberg_ticker') - ticker_data[feature_prefix_name] = \ - ticker_groups[self.variable].transform( - lambda x: x.shift(self.num_days)) - - ticker_data[f'{feature_prefix_name}_diff'] = \ + logger.info(f"generating var change {self.num_days} " f"for {self.variable}...") + feature_prefix_name = f"{self.variable}_x{self.num_days}" + ticker_groups = ticker_data.groupby("bloomberg_ticker") + ticker_data[feature_prefix_name] = ticker_groups[self.variable].transform( + lambda x: x.shift(self.num_days) + ) + + ticker_data[f"{feature_prefix_name}_diff"] = ( ticker_data[self.variable] / ticker_data[feature_prefix_name] - 1 + ) return ticker_data, [] @@ -41,18 +41,18 @@ def _parse_num_days(num_days): class RSI: """Relative Strength Index""" - def __init__(self, num_days=5, interval=10, variable='adj_close'): + def __init__(self, num_days=5, interval=10, variable="adj_close"): self.steps = _parse_num_days(num_days) self.interval = interval self.variable = variable @staticmethod def relative_strength_index(prices, interval): - ''' + """ Computes Relative Strength Index given a price series and lookback interval See more here https://www.investopedia.com/terms/r/rsi.asp - ''' + """ delta = prices.diff() # copy 
deltas, set losses to 0, get rolling avg @@ -72,82 +72,79 @@ def relative_strength_index(prices, interval): def get_feature_names(self, prefix_name): # define column names of features, target, and prediction - feat_quintile_lag = {step: f'{prefix_name}_quintile_lag_{step}' - for step in self.steps} - feat_rsi_diff = {step: f'{prefix_name}_diff_{step}' - for step in self.steps[:-1]} - feat_rsi_diff_abs = {step: f'{prefix_name}_abs_diff_{step}' - for step in self.steps[:-1]} + feat_quintile_lag = {step: f"{prefix_name}_quintile_lag_{step}" for step in self.steps} + feat_rsi_diff = {step: f"{prefix_name}_diff_{step}" for step in self.steps[:-1]} + feat_rsi_diff_abs = {step: f"{prefix_name}_abs_diff_{step}" for step in self.steps[:-1]} return feat_quintile_lag, feat_rsi_diff, feat_rsi_diff_abs def generate_features(self, ticker_data, feature_prefix=None): # add Relative Strength Index - logger.info(f'generating RSI {self.interval} for {self.variable}...') + logger.info(f"generating RSI {self.interval} for {self.variable}...") - feature_prefix_name = f'RSI_{self.interval}_{self.variable}' + feature_prefix_name = f"RSI_{self.interval}_{self.variable}" if feature_prefix: - feature_prefix_name = f'{feature_prefix}_RSI_{self.interval}_{self.variable}' + feature_prefix_name = f"{feature_prefix}_RSI_{self.interval}_{self.variable}" - ticker_groups = ticker_data.groupby('bloomberg_ticker') - ticker_data[feature_prefix_name] = \ - ticker_groups[self.variable].transform( - lambda x: self.relative_strength_index(x, self.interval) + ticker_groups = ticker_data.groupby("bloomberg_ticker") + ticker_data[feature_prefix_name] = ticker_groups[self.variable].transform( + lambda x: self.relative_strength_index(x, self.interval) ) # group by era (date) - logger.debug('grouping by dates...') - date_groups = ticker_data.groupby('date') + logger.debug("grouping by dates...") + date_groups = ticker_data.groupby("date") # create quintile labels within each era, useful for learning # srelative ranking - logger.debug('generating RSI quintiles...') - col = f'{feature_prefix_name}_quintile' + logger.debug("generating RSI quintiles...") + col = f"{feature_prefix_name}_quintile" ticker_data[col] = date_groups[feature_prefix_name].transform( - lambda group: pd.qcut(group, 5, labels=False, duplicates='drop') + lambda group: pd.qcut(group, 5, labels=False, duplicates="drop") ) ticker_data.dropna(inplace=True) - ticker_data[col] = ticker_data[col].astype('int8') + ticker_data[col] = ticker_data[col].astype("int8") - ( - feat_quintile_lag, feat_rsi_diff, feat_rsi_diff_abs - ) = self.get_feature_names(feature_prefix_name) + (feat_quintile_lag, feat_rsi_diff, feat_rsi_diff_abs) = self.get_feature_names( + feature_prefix_name + ) # create lagged features grouped by ticker - logger.debug('grouping by ticker...') - ticker_groups = ticker_data.groupby('bloomberg_ticker') + logger.debug("grouping by ticker...") + ticker_groups = ticker_data.groupby("bloomberg_ticker") # lag 0 is that day's value, lag 1 is yesterday's value, etc - logger.debug('generating lagged RSI quintiles...') + logger.debug("generating lagged RSI quintiles...") for day in self.steps: - col = f'{feature_prefix_name}_quintile' - ticker_data[feat_quintile_lag[day]] = ticker_groups[col].transform( - lambda group: group.shift(day) - ).astype('float16') + col = f"{feature_prefix_name}_quintile" + ticker_data[feat_quintile_lag[day]] = ( + ticker_groups[col].transform(lambda group: group.shift(day)).astype("float16") + ) # create difference of the lagged features and # 
absolute difference of the lagged features (change in RSI # quintile by day) - logger.debug('generating lagged RSI diffs...') + logger.debug("generating lagged RSI diffs...") for i in range(len(self.steps) - 1): step = self.steps[i] ticker_data[feat_rsi_diff[step]] = ( - ticker_data[feat_quintile_lag[step]] - - ticker_data[feat_quintile_lag[self.steps[i + 1]]] + ticker_data[feat_quintile_lag[step]] + - ticker_data[feat_quintile_lag[self.steps[i + 1]]] ) - ticker_data[feat_rsi_diff_abs[step]] = \ - np.abs(ticker_data[feat_rsi_diff[step]]) + ticker_data[feat_rsi_diff_abs[step]] = np.abs(ticker_data[feat_rsi_diff[step]]) - feature_names = (list(feat_quintile_lag.values()) + - list(feat_rsi_diff.values()) + - list(feat_rsi_diff_abs.values())) + feature_names = ( + list(feat_quintile_lag.values()) + + list(feat_rsi_diff.values()) + + list(feat_rsi_diff_abs.values()) + ) return ticker_data, feature_names class SMA: """Simple Moving Average""" - def __init__(self, num_days=5, interval=10, variable='adj_close'): + def __init__(self, num_days=5, interval=10, variable="adj_close"): self.steps = _parse_num_days(num_days) self.interval = interval self.variable = variable @@ -158,70 +155,70 @@ def simple_moving_average(prices, interval): def get_feature_names(self, prefix_name): # define column names of features, target, and prediction - feat_quintile_lag = {step: f'{prefix_name}_quintile_lag_{step}' - for step in self.steps} - feat_rsi_diff = {step: f'{prefix_name}_diff_{step}' - for step in self.steps[:-1]} - feat_rsi_diff_abs = {step: f'{prefix_name}_abs_diff_{step}' - for step in self.steps[:-1]} + feat_quintile_lag = {step: f"{prefix_name}_quintile_lag_{step}" for step in self.steps} + feat_rsi_diff = {step: f"{prefix_name}_diff_{step}" for step in self.steps[:-1]} + feat_rsi_diff_abs = {step: f"{prefix_name}_abs_diff_{step}" for step in self.steps[:-1]} return feat_quintile_lag, feat_rsi_diff, feat_rsi_diff_abs def generate_features(self, ticker_data, feature_prefix=None): # add Relative Strength Index - logger.info(f'generating SMA {self.interval} for {self.variable}...') + logger.info(f"generating SMA {self.interval} for {self.variable}...") - feature_prefix_name = f'SMA_{self.interval}_{self.variable}' + feature_prefix_name = f"SMA_{self.interval}_{self.variable}" if feature_prefix: - feature_prefix_name = f'{feature_prefix}_SMA_{self.interval}_{self.variable}' + feature_prefix_name = f"{feature_prefix}_SMA_{self.interval}_{self.variable}" - ticker_groups = ticker_data.groupby('bloomberg_ticker') - ticker_data[feature_prefix_name] = \ - ticker_groups[self.variable].transform( - lambda x: self.simple_moving_average(x, self.interval) + ticker_groups = ticker_data.groupby("bloomberg_ticker") + ticker_data[feature_prefix_name] = ticker_groups[self.variable].transform( + lambda x: self.simple_moving_average(x, self.interval) ) # group by era (date) - logger.debug('grouping by dates...') - date_groups = ticker_data.groupby('date') + logger.debug("grouping by dates...") + date_groups = ticker_data.groupby("date") # create quintile labels within each era, useful for # learning relative ranking - logger.debug('generating SMA quintiles...') - col = f'{feature_prefix_name}_quintile' + logger.debug("generating SMA quintiles...") + col = f"{feature_prefix_name}_quintile" ticker_data[col] = date_groups[feature_prefix_name].transform( - lambda group: pd.qcut(group, 5, labels=False, duplicates='drop') + lambda group: pd.qcut(group, 5, labels=False, duplicates="drop") ) ticker_data.dropna(inplace=True) - 
ticker_data[col] = ticker_data[col].astype('int8') + ticker_data[col] = ticker_data[col].astype("int8") - ( - feat_quintile_lag, feat_sma_diff, feat_sma_diff_abs - ) = self.get_feature_names(feature_prefix_name) + (feat_quintile_lag, feat_sma_diff, feat_sma_diff_abs) = self.get_feature_names( + feature_prefix_name + ) # create lagged features grouped by ticker - logger.debug('grouping by ticker...') - ticker_groups = ticker_data.groupby('bloomberg_ticker') + logger.debug("grouping by ticker...") + ticker_groups = ticker_data.groupby("bloomberg_ticker") # lag 0 is that day's value, lag 1 is yesterday's value, etc - logger.debug('generating lagged SMA quintiles...') + logger.debug("generating lagged SMA quintiles...") for day in self.steps: - col = f'{feature_prefix_name}_quintile' - ticker_data[feat_quintile_lag[day]] = ticker_groups[col].transform( - lambda group: group.shift(day) - ).astype('float16') + col = f"{feature_prefix_name}_quintile" + ticker_data[feat_quintile_lag[day]] = ( + ticker_groups[col].transform(lambda group: group.shift(day)).astype("float16") + ) # create difference of the lagged features and # absolute difference of the lagged features (change in # SMA quintile by day) - logger.debug('generating lagged SMA diffs...') + logger.debug("generating lagged SMA diffs...") for i in range(len(self.steps) - 1): ticker_data[feat_sma_diff[self.steps[i]]] = ( - ticker_data[feat_quintile_lag[self.steps[i]]] - - ticker_data[feat_quintile_lag[self.steps[i + 1]]]) - ticker_data[feat_sma_diff_abs[self.steps[i]]] = \ - np.abs(ticker_data[feat_sma_diff[self.steps[i]]]) - - feature_names = (list(feat_quintile_lag.values()) + - list(feat_sma_diff.values()) + - list(feat_sma_diff_abs.values())) + ticker_data[feat_quintile_lag[self.steps[i]]] + - ticker_data[feat_quintile_lag[self.steps[i + 1]]] + ) + ticker_data[feat_sma_diff_abs[self.steps[i]]] = np.abs( + ticker_data[feat_sma_diff[self.steps[i]]] + ) + + feature_names = ( + list(feat_quintile_lag.values()) + + list(feat_sma_diff.values()) + + list(feat_sma_diff_abs.values()) + ) return ticker_data, feature_names diff --git a/src/opensignals/utils.py b/src/opensignals/utils.py index 60d604a..eb43569 100644 --- a/src/opensignals/utils.py +++ b/src/opensignals/utils.py @@ -1,10 +1,10 @@ USER_AGENTS = [ ( - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/39.0.2171.95 Safari/537.36' + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)" + " Chrome/39.0.2171.95 Safari/537.36" ), ( - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' - ' Chrome/42.0.2311.135 Safari/537.36 Edge/12.246' - ) + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" + " Chrome/42.0.2311.135 Safari/537.36 Edge/12.246" + ), ] diff --git a/tasks.py b/tasks.py new file mode 100755 index 0000000..95b5bb0 --- /dev/null +++ b/tasks.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +""" +This script can be used to build locally or methods from the file can be used in the github actions. +The github actions shall just call methods from this file so we are not locked in the the build server solution. +""" +import contextlib +import os +import shutil + +from invoke import task + + +@task +def check_format_with_black(c, fix=False): + format_cmd = "black ." 
+ check_cmd = format_cmd + " --check" + if fix: + c.run(format_cmd) + c.run(check_cmd) + + +@task +def sort_imports_with_isort(c, fix=False): + if fix: + c.run("isort .") + c.run("isort --check .") + + +@task +def lint_with_pylint(c): + c.run("pylint ./src") + + +@task +def lint_with_flake8(c): + c.run("flake8 .") + + +@task +def lint_with_bandit(c): + c.run("bandit -r src/ --exclude .venv") + + +@task +def lint_with_mypy(c): + c.run("mypy") + + +@task +def clean(c): + folders = [".venv", ".pytest_cache", ".mypy_cache", "dist", "reports", "coverage"] + for folder in folders: + shutil.rmtree(folder, ignore_errors=True) + files = [".coverage"] + for file in files: + with contextlib.suppress(FileNotFoundError): # -> like ignore_errors=True + os.remove(file)
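
For reference, a sketch of the local workflow this patch enables (illustrative, not part of the diff). The commands mirror the steps in .github/workflows/build.yml and assume the invoke tasks defined in tasks.py above; invoke exposes underscore task names with dashes, which is why build.yml calls e.g. check-format-with-black.

    # one-time setup: installs poetry if it is missing, then project and dev dependencies
    bash setup.sh

    # the same checks the CI build job runs
    poetry run invoke check-format-with-black
    poetry run invoke sort-imports-with-isort
    poetry run invoke lint-with-pylint
    poetry run invoke lint-with-flake8
    poetry run invoke lint-with-mypy

    # build the sdist and wheel into dist/
    poetry build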