From fc41ab91ce7b6aebd3400d0ea25ded8ee3a6a40b Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:20:45 -0300 Subject: [PATCH 01/86] [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. --- .github/workflows/publish.yml | 2 +- .github/workflows/staging.yml | 45 +++++++++++++++++++++++++++++++++++ .github/workflows/test.yml | 1 + 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/staging.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7dff34a7..ba94bc7e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -7,7 +7,7 @@ on: jobs: Pipeline: - if: github.ref == 'refs/heads/master' || contains(github.ref, 'hotfix/') + if: github.ref == 'refs/heads/master' runs-on: ubuntu-16.04 container: quintoandar/python-3-7-java diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml new file mode 100644 index 00000000..70929170 --- /dev/null +++ b/.github/workflows/staging.yml @@ -0,0 +1,45 @@ +name: "Publish Dev Package" +on: + push: + branches: + - staging + + +jobs: + Pipeline: + if: github.ref == 'refs/heads/staging' + + runs-on: ubuntu-16.04 + container: quintoandar/python-3-7-java + + steps: + - uses: actions/checkout@v2 + + - name: Install dependencies + run: make ci-install + + - name: Build package + run: make package + + - name: Get version + run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV + + - name: Create release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.version }} + release_name: Release ${{ env.version }} + prerelease: true + + - name: Release already exist + if: ${{ failure() }} + run: echo Release already exist + + - name: Publish release to pypi.org + if: ${{ success() }} + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 29394a0e..b39246fd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,6 +3,7 @@ on: push: branches: - master + - staging - hotfix/** pull_request: From 4be4ffe59a7ff8d4b472e3010d564c06e5a81ace Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Feb 2021 18:36:49 -0300 Subject: [PATCH 02/86] [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. --- .github/workflows/staging.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 70929170..e02009a4 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -24,6 +24,12 @@ jobs: - name: Get version run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV + - name: Change package name and path + run: | + mkdir ./dist/p/ \; + find . -wholename "./dist/butterfree-*.whl" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl \; + find . 
-wholename "./dist/butterfree-*.tar.gz" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}.tar.gz \; + - name: Create release uses: actions/create-release@v1 env: @@ -42,4 +48,4 @@ jobs: env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/* From a3a601bd037a25dacd60bc5a643c4e2eec81a39e Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Feb 2021 19:06:29 -0300 Subject: [PATCH 03/86] Apply only wheel. (#285) --- .github/workflows/staging.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index e02009a4..504bb086 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -48,4 +48,4 @@ jobs: env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/* + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl From 433960802a33ed3c545a04c6ab8c7d2594a1d7ee Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 9 Feb 2021 16:49:47 -0300 Subject: [PATCH 04/86] [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. --- .github/workflows/staging.yml | 12 ++++-------- Makefile | 24 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 504bb086..62e97ea2 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -18,17 +18,13 @@ jobs: - name: Install dependencies run: make ci-install - - name: Build package - run: make package - - name: Get version run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV - - name: Change package name and path + - name: Build package run: | - mkdir ./dist/p/ \; - find . -wholename "./dist/butterfree-*.whl" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl \; - find . 
-wholename "./dist/butterfree-*.tar.gz" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}.tar.gz \; + make change-version NEW_VERSION="${{ env.version }}" + make package - name: Create release uses: actions/create-release@v1 @@ -48,4 +44,4 @@ jobs: env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* diff --git a/Makefile b/Makefile index 41ad00ab..e6de9baa 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,11 @@ +# globals + +PACKAGE_NAME := $(shell grep __package_name__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2) +VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2) + + +#custom targets + .PHONY: environment ## create virtual environment for butterfree environment: @@ -119,16 +127,20 @@ clean: @find ./ -name '*~' -exec rm -f {} \; .PHONY: version -## dump package name into VERSION env variable and show +## show version version: - @export VERSION=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2) - @$(info VERSION is [${VERSION}]) + @echo "VERSION: $(VERSION)" + +.PHONY: change-version +## change the version to string received in the NEW_VERSION variable and show +change-version: + @sed -i 's/$(VERSION)/$(NEW_VERSION)/g' setup.py + @echo "VERSION: $(NEW_VERSION)" .PHONY: package-name -## dump package name into PACKAGE_NAME env variable and show +## show package name package-name: - @PACKAGE_NAME=$(grep __package_name__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 | sed 's/.*/&${build}/') - @echo $PACKAGE_NAME + @echo "PACKAGE_NAME: $(PACKAGE_NAME)" .PHONY: package ## build butterfree package wheel From a82433c40b011b03869ebe05070d90955b1b566b Mon Sep 17 00:00:00 2001 From: hmeretti Date: Tue, 9 Feb 2021 18:54:23 -0300 Subject: [PATCH 05/86] Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo --- .../transform/features/timestamp_feature.py | 15 ++++++------- .../butterfree/transform/features/conftest.py | 17 ++++++++++---- .../features/test_timestamp_feature.py | 22 +++++++++++++++---- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/butterfree/transform/features/timestamp_feature.py b/butterfree/transform/features/timestamp_feature.py index 2aac8925..b131eaee 100644 --- a/butterfree/transform/features/timestamp_feature.py +++ b/butterfree/transform/features/timestamp_feature.py @@ -1,6 +1,6 @@ """TimestampFeature entity.""" from pyspark.sql import DataFrame -from pyspark.sql.functions import from_unixtime, to_timestamp +from pyspark.sql.functions import to_timestamp from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN @@ -65,13 +65,12 @@ def transform(self, dataframe: DataFrame) -> DataFrame: """ column_name = self.from_column if self.from_column else self.name + ts_column = dataframe[column_name] if self.from_ms: - dataframe = dataframe.withColumn( - column_name, from_unixtime(dataframe[column_name] / 1000.0) - ) - if self.mask: - dataframe = dataframe.withColumn( - column_name, to_timestamp(dataframe[column_name], self.mask) - ) + ts_column = ts_column / 1000 + + dataframe = dataframe.withColumn( + column_name, to_timestamp(ts_column, self.mask) + 
) return super().transform(dataframe) diff --git a/tests/unit/butterfree/transform/features/conftest.py b/tests/unit/butterfree/transform/features/conftest.py index e79c5075..ae644470 100644 --- a/tests/unit/butterfree/transform/features/conftest.py +++ b/tests/unit/butterfree/transform/features/conftest.py @@ -18,8 +18,8 @@ def feature_set_dataframe(spark_context, spark_session): @fixture def feature_set_dataframe_ms_from_column(spark_context, spark_session): data = [ - {"id": 1, "ts": 1581542311000, "feature": 100}, - {"id": 2, "ts": 1581542322000, "feature": 200}, + {"id": 1, "ts": 1581542311112, "feature": 100}, + {"id": 2, "ts": 1581542322223, "feature": 200}, ] return spark_session.read.json(spark_context.parallelize(data, 1)) @@ -27,8 +27,17 @@ def feature_set_dataframe_ms_from_column(spark_context, spark_session): @fixture def feature_set_dataframe_ms(spark_context, spark_session): data = [ - {"id": 1, TIMESTAMP_COLUMN: 1581542311000, "feature": 100}, - {"id": 2, TIMESTAMP_COLUMN: 1581542322000, "feature": 200}, + {"id": 1, TIMESTAMP_COLUMN: 1581542311112, "feature": 100}, + {"id": 2, TIMESTAMP_COLUMN: 1581542322223, "feature": 200}, + ] + return spark_session.read.json(spark_context.parallelize(data, 1)) + + +@fixture +def feature_set_dataframe_small_time_diff(spark_context, spark_session): + data = [ + {"id": 1, TIMESTAMP_COLUMN: 1581542311001, "feature": 100}, + {"id": 2, TIMESTAMP_COLUMN: 1581542311002, "feature": 200}, ] return spark_session.read.json(spark_context.parallelize(data, 1)) diff --git a/tests/unit/butterfree/transform/features/test_timestamp_feature.py b/tests/unit/butterfree/transform/features/test_timestamp_feature.py index c7745036..a5a688c2 100644 --- a/tests/unit/butterfree/transform/features/test_timestamp_feature.py +++ b/tests/unit/butterfree/transform/features/test_timestamp_feature.py @@ -32,8 +32,8 @@ def test_transform_ms_from_column(self, feature_set_dataframe_ms_from_column): df = df.withColumn("timestamp", df["timestamp"].cast(StringType())).collect() - assert df[0]["timestamp"] == "2020-02-12 21:18:31" - assert df[1]["timestamp"] == "2020-02-12 21:18:42" + assert df[0]["timestamp"] == "2020-02-12 21:18:31.112" + assert df[1]["timestamp"] == "2020-02-12 21:18:42.223" def test_transform_ms(self, feature_set_dataframe_ms): @@ -43,8 +43,22 @@ def test_transform_ms(self, feature_set_dataframe_ms): df = df.withColumn("timestamp", df["timestamp"].cast(StringType())).collect() - assert df[0]["timestamp"] == "2020-02-12 21:18:31" - assert df[1]["timestamp"] == "2020-02-12 21:18:42" + assert df[0]["timestamp"] == "2020-02-12 21:18:31.112" + assert df[1]["timestamp"] == "2020-02-12 21:18:42.223" + + def test_transform_ms_from_column_small_time_diff( + self, feature_set_dataframe_small_time_diff + ): + + test_key = TimestampFeature(from_ms=True) + + df = test_key.transform(feature_set_dataframe_small_time_diff).orderBy( + "timestamp" + ) + + df = df.withColumn("timestamp", df["timestamp"].cast(StringType())).collect() + + assert df[0]["timestamp"] != df[1]["timestamp"] def test_transform_mask(self, feature_set_dataframe_date): From dcbf5408d7ffaca80c7679cd3cc64a819c3bf37d Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 10 Feb 2021 14:46:31 -0300 Subject: [PATCH 06/86] Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. 
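A minimal sketch of the millisecond behaviour fixed in PATCH 05/86 above: dividing the epoch milliseconds by 1000 and converting with `to_timestamp` keeps the fractional seconds, while the previous `from_unixtime(ts / 1000.0)` path truncated them. The data, local session and UTC timezone below are illustrative assumptions, not part of the patch.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp

# illustrative local session; UTC so the printed values match the test fixtures
spark = (
    SparkSession.builder.master("local[1]")
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)

# epoch values in milliseconds, like the conftest fixtures above
df = spark.createDataFrame([(1, 1581542311112), (2, 1581542322223)], ["id", "ts"])

# dividing by 1000 yields fractional seconds, which to_timestamp preserves
df = df.withColumn("timestamp", to_timestamp(df["ts"] / 1000))
df.select("timestamp").show(truncate=False)
# |2020-02-12 21:18:31.112|
# |2020-02-12 21:18:42.223|
```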
--- .github/workflows/staging.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 62e97ea2..8b39e5ac 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -1,8 +1,8 @@ name: "Publish Dev Package" on: push: - branches: - - staging + paths: + - 'setup.py' jobs: @@ -19,12 +19,10 @@ jobs: run: make ci-install - name: Get version - run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV + run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 )" >> $GITHUB_ENV - name: Build package - run: | - make change-version NEW_VERSION="${{ env.version }}" - make package + run: make package - name: Create release uses: actions/create-release@v1 From a0a933596ce0a2233b30ccbd4e8e60e62ae8aa5f Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 10 Feb 2021 17:20:46 -0300 Subject: [PATCH 07/86] Create a dev package. (#288) --- CHANGELOG.md | 20 ++++++++++++++++++++ setup.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48b5cbf1..72994621 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,26 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. +## [Unreleased] +### Added +* [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273)) + +### Changed +* [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master ([#280](https://github.com/quintoandar/butterfree/pull/280)) +* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) + +### Fixed +* Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) + +## [1.1.2](https://github.com/quintoandar/butterfree/releases/tag/1.1.2) +### Fixed +* [HOTFIX] Add both cache and count back to Butterfree ([#274](https://github.com/quintoandar/butterfree/pull/274)) +* [MLOP-606] Change docker image in Github Actions Pipeline ([#275](https://github.com/quintoandar/butterfree/pull/275)) +* FIX Read the Docs build ([#272](https://github.com/quintoandar/butterfree/pull/272)) +* [BUG] Fix style ([#271](https://github.com/quintoandar/butterfree/pull/271)) +* [MLOP-594] Remove from_column in some transforms ([#270](https://github.com/quintoandar/butterfree/pull/270)) +* [MLOP-536] Rename S3 config to Metastore config ([#269](https://github.com/quintoandar/butterfree/pull/269)) + ## [1.1.1](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) ### Added * [MLOP-590] Adapt KafkaConfig to receive a custom topic name ([#266](https://github.com/quintoandar/butterfree/pull/266)) diff --git a/setup.py b/setup.py index 47ba0b98..bf471fec 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.1.1" +__version__ = "1.1.3.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 74278986a49f1825beee0fd8df65a585764e5524 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 10 Feb 2021 18:09:38 -0300 Subject: [PATCH 
08/86] [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. --- README.md | 2 ++ WORKFLOW.md | 63 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index d221d866..728f7b02 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ Or after listing `butterfree` in your `requirements.txt` file: pip install -r requirements.txt ``` +Dev Package are available for testing using the .devN versions of the Butterfree on PyPi. + ## License [Apache License 2.0](https://github.com/quintoandar/butterfree/blob/staging/LICENSE) diff --git a/WORKFLOW.md b/WORKFLOW.md index 601e3793..5eaa18cd 100644 --- a/WORKFLOW.md +++ b/WORKFLOW.md @@ -2,20 +2,18 @@ ## Features -A feature is based on the `master` branch and merged back into the `master` branch. - -![](https://docs.microsoft.com/en-us/azure/devops/repos/git/media/branching-guidance/featurebranching.png?view=azure-devops) +A feature is based on the `staging` branch and merged back into the `staging` branch. ### Working Locally ``` -# checkout master, fetch the latest changes and pull them from remote into local -git checkout master +# checkout staging, fetch the latest changes and pull them from remote into local +git checkout staging git fetch -git pull origin master +git pull origin staging -# create a feature branch that is based off master +# create a feature branch that is based off staging git checkout -b /some-description # do your work @@ -24,10 +22,10 @@ git commit -m "first commit" git add another git commit -m "second commit" -# rebase against master to pull in any changes that have been made +# rebase against staging to pull in any changes that have been made # since you started your feature branch. git fetch -git rebase origin/master +git rebase origin/staging # push your local changes up to the remote git push @@ -35,41 +33,71 @@ git push # if you've already pushed changes and have rebased, your history has changed # so you will need to force the push git fetch -git rebase origin/master +git rebase origin/staging git push --force-with-lease ```` ### GitHub workflow -- Open a Pull Request against `master`. Check our PR guidelines [here](https://github.com/quintoandar/butterfree/blob/master/CONTRIBUTING.md#pull-request-guideline). +- Open a Pull Request against `staging`. Check our PR guidelines [here](https://github.com/quintoandar/butterfree/blob/master/CONTRIBUTING.md#pull-request-guideline). - When the Pull Request has been approved, merge using `squash and merge`, adding a brief description: ie, ` Enable stream pipelines in Butterfree`. - This squashes all your commits into a single clean commit. Remember to clean detailed descriptions, otherwise our git logs will be a mess. -If you are unable to squash merge because of conflicts, you need to rebase against `master` again: +If you are unable to squash merge because of conflicts, you need to rebase against `staging` again: ``` # in your feature branch git fetch -git rebase origin/master +git rebase origin/staging # fix conflicts if they exist git push --force-with-lease ``` +## Pre-Releases + +The pre-release will always occur when we change the version in the setup.py file to staging branch. 
+ + +### Working Locally + +``` +# create a feature branch +git checkout staging +git fetch +git pull origin staging +git checkout -b pre-release/ + +# finalize the changelog in Unreleased and bump the version into setup.py then: +git add CHANGELOG.md +git add setup.py +git commit -m "pre-release " + +# push the new version +git fetch +git push --force-with-lease +``` + +### Github workflow + +- Open a Pull Request against `staging`. +- When the PR's approved and the code is tested, `squash and merge` to squash your commits into a single commit. +- The creation of the pre-release tag and the update of the PyPi version will be done +automatically from the Publish Dev Package workflow, you can follow [here](https://github.com/quintoandar/butterfree/actions?query=workflow%3A%22Publish+Dev+Package%22). ## Releases -The release will always occur when we change the version in the setup.py file. +The release will always occur when we change the version in the setup.py file to master branch. ### Working Locally ``` # create a feature branch -git checkout master +git checkout staging git fetch -git pull origin master +git pull origin staging git checkout -b release/ # finalize the changelog, bump the version into setup.py and update the documentation then: @@ -121,7 +149,6 @@ git checkout master@ git fetch git pull origin master git checkout -b hotfix/ -git checkout -b describe-the-problem git add patch.fix git add setup.py @@ -133,7 +160,7 @@ Don't forget to update the Changelog and the version in `setup.py`. ### Github workflow -- Open a Pull Request against `hotfix/` +- Open a Pull Request against `master`. - When the PR's approved and the code is tested, `squash and merge` to squash your commits into a single commit. - A tag will automatically be triggered in our CI/CD. This tag/release will use the version for its title and push a new version of Butterfree's python package to our private server. 
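Both the pre-release and release flows above hinge on the `__version__` string declared in `setup.py`; the workflows read it with `grep`/`cut`, which a small Python helper could mirror. This is an illustrative sketch only — the regex, function name and default path are assumptions, not part of the patches.

```python
import re
from pathlib import Path


def read_version(setup_path: str = "setup.py") -> str:
    """Return the __version__ string declared in setup.py."""
    content = Path(setup_path).read_text()
    match = re.search(r"__version__\s*=\s*['\"]([^'\"]+)['\"]", content)
    if match is None:
        raise ValueError(f"no __version__ found in {setup_path}")
    return match.group(1)


# For the pre-release created in PATCH 07/86 this returns "1.1.3.dev0", the value
# the "Publish Dev Package" workflow tags and uploads to PyPI.
print(read_version())
```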
From 245eaa594846166972241b03fddc61ee5117b1f7 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Thu, 11 Feb 2021 11:51:04 -0300 Subject: [PATCH 09/86] [MLOP-632] Butterfree dev workflow, automate release description (#279) --- .github/workflows/publish.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ba94bc7e..3620cdbb 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -24,6 +24,10 @@ jobs: - name: Get version run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2)" >> $GITHUB_ENV + - name: Get release notes + id: get_release_notes + uses: ffurrer2/extract-release-notes@v1 + - name: Create release uses: actions/create-release@v1 env: @@ -31,6 +35,7 @@ jobs: with: tag_name: ${{ env.version }} release_name: Release ${{ env.version }} + body: ${{ steps.get_release_notes.outputs.release_notes }} - name: Release already exist if: ${{ failure() }} From d6ecfa425136fab07826c01d0a7ac271f7a37a30 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Thu, 18 Feb 2021 15:14:44 -0300 Subject: [PATCH 10/86] [MLOP-636] Create migration classes (#282) --- butterfree/migrations/__init__.py | 7 +++ butterfree/migrations/cassandra_migration.py | 23 ++++++++ butterfree/migrations/metastore_migration.py | 23 ++++++++ butterfree/migrations/migration.py | 62 ++++++++++++++++++++ 4 files changed, 115 insertions(+) create mode 100644 butterfree/migrations/__init__.py create mode 100644 butterfree/migrations/cassandra_migration.py create mode 100644 butterfree/migrations/metastore_migration.py create mode 100644 butterfree/migrations/migration.py diff --git a/butterfree/migrations/__init__.py b/butterfree/migrations/__init__.py new file mode 100644 index 00000000..5f709bfe --- /dev/null +++ b/butterfree/migrations/__init__.py @@ -0,0 +1,7 @@ +"""Holds available migrations.""" + +from butterfree.migrations.cassandra_migration import CassandraMigration +from butterfree.migrations.metastore_migration import MetastoreMigration +from butterfree.migrations.migration import DatabaseMigration + +__all__ = ["DatabaseMigration", "CassandraMigration", "MetastoreMigration"] diff --git a/butterfree/migrations/cassandra_migration.py b/butterfree/migrations/cassandra_migration.py new file mode 100644 index 00000000..e9cecdc7 --- /dev/null +++ b/butterfree/migrations/cassandra_migration.py @@ -0,0 +1,23 @@ +"""Cassandra Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.migrations import DatabaseMigration + + +class CassandraMigration(DatabaseMigration): + """Cassandra class for Migrations.""" + + def create_query( + self, + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], + table_name: str, + ) -> Any: + """Create a query regarding Cassandra. + + Returns: + Schema object. 
+ + """ + pass diff --git a/butterfree/migrations/metastore_migration.py b/butterfree/migrations/metastore_migration.py new file mode 100644 index 00000000..bb208f2a --- /dev/null +++ b/butterfree/migrations/metastore_migration.py @@ -0,0 +1,23 @@ +"""Metastore Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.migrations import DatabaseMigration + + +class MetastoreMigration(DatabaseMigration): + """Metastore class for Migrations.""" + + def create_query( + self, + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], + table_name: str, + ) -> Any: + """Create a query regarding Metastore. + + Returns: + Schema object. + + """ + pass diff --git a/butterfree/migrations/migration.py b/butterfree/migrations/migration.py new file mode 100644 index 00000000..c53945bf --- /dev/null +++ b/butterfree/migrations/migration.py @@ -0,0 +1,62 @@ +"""Migration entity.""" + +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, List + +from butterfree.pipelines import FeatureSetPipeline + + +class DatabaseMigration(ABC): + """Abstract base class for Migrations.""" + + @abstractmethod + def create_query( + self, + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], + table_name: str, + ) -> Any: + """Create a query regarding a data source. + + Returns: + The desired query for the given database. + + """ + + def _validate_schema( + self, fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]] + ) -> Any: + """Provides schema validation for feature sets. + + Compares the schema of your local feature set to the + corresponding table in a given database. + + Args: + fs_schema: object that contains feature set's schemas. + db_schema: object that contains the table og a given db schema. + + """ + + def _get_schema(self, db_client: Callable, table_name: str) -> List[Dict[str, Any]]: + """Get a table schema in the respective database. + + Returns: + Schema object. + """ + pass + + def _apply_migration(self, query: str, db_client: Callable) -> None: + """Apply the migration in the respective database.""" + + def _send_logs_to_s3(self) -> None: + """Send all migration logs to S3.""" + pass + + def run(self, pipelines: List[FeatureSetPipeline]) -> None: + """Runs the migrations. + + Args: + pipelines: the feature set pipelines. + + """ + pass From 32e24d6382a973452dd20611c22358cd8d5976bd Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Fri, 19 Feb 2021 10:18:09 -0300 Subject: [PATCH 11/86] [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. 
--- .gitignore | 1 + CHANGELOG.md | 6 + butterfree/clients/cassandra_client.py | 41 +- butterfree/clients/spark_client.py | 58 +- butterfree/configs/db/metastore_config.py | 28 + butterfree/configs/environment.py | 4 +- butterfree/constants/window_definitions.py | 16 + butterfree/dataframe_service/__init__.py | 9 +- .../dataframe_service/incremental_strategy.py | 116 + butterfree/dataframe_service/partitioning.py | 25 + butterfree/extract/readers/file_reader.py | 12 +- butterfree/extract/readers/reader.py | 88 +- butterfree/extract/source.py | 24 +- butterfree/hooks/__init__.py | 5 + butterfree/hooks/hook.py | 20 + butterfree/hooks/hookable_component.py | 148 ++ .../hooks/schema_compatibility/__init__.py | 9 + ...ssandra_table_schema_compatibility_hook.py | 58 + .../spark_table_schema_compatibility_hook.py | 46 + butterfree/load/sink.py | 13 +- .../historical_feature_store_writer.py | 113 +- .../writers/online_feature_store_writer.py | 50 +- butterfree/load/writers/writer.py | 21 +- butterfree/pipelines/feature_set_pipeline.py | 56 +- .../transform/aggregated_feature_set.py | 49 +- butterfree/transform/feature_set.py | 38 +- butterfree/transform/utils/window_spec.py | 20 +- examples/interval_runs/interval_runs.ipynb | 2152 +++++++++++++++++ setup.py | 2 +- .../integration/butterfree/load/test_sink.py | 35 +- .../butterfree/pipelines/conftest.py | 202 ++ .../pipelines/test_feature_set_pipeline.py | 311 ++- .../butterfree/transform/conftest.py | 55 + .../transform/test_aggregated_feature_set.py | 50 + .../butterfree/transform/test_feature_set.py | 44 + tests/unit/butterfree/clients/conftest.py | 11 +- .../clients/test_cassandra_client.py | 4 +- .../butterfree/clients/test_spark_client.py | 69 +- .../butterfree/dataframe_service/conftest.py | 14 + .../test_incremental_srategy.py | 70 + .../dataframe_service/test_partitioning.py | 20 + tests/unit/butterfree/extract/conftest.py | 55 + .../extract/readers/test_file_reader.py | 10 +- .../butterfree/extract/readers/test_reader.py | 58 + tests/unit/butterfree/hooks/__init__.py | 0 .../hooks/schema_compatibility/__init__.py | 0 ...ssandra_table_schema_compatibility_hook.py | 49 + ...t_spark_table_schema_compatibility_hook.py | 53 + .../hooks/test_hookable_component.py | 107 + tests/unit/butterfree/load/conftest.py | 25 + tests/unit/butterfree/load/test_sink.py | 34 +- .../test_historical_feature_store_writer.py | 144 +- .../test_online_feature_store_writer.py | 41 +- tests/unit/butterfree/pipelines/conftest.py | 63 + .../pipelines/test_feature_set_pipeline.py | 182 +- tests/unit/butterfree/transform/conftest.py | 82 + .../transform/test_aggregated_feature_set.py | 68 +- .../butterfree/transform/test_feature_set.py | 43 +- 58 files changed, 4738 insertions(+), 389 deletions(-) create mode 100644 butterfree/constants/window_definitions.py create mode 100644 butterfree/dataframe_service/incremental_strategy.py create mode 100644 butterfree/dataframe_service/partitioning.py create mode 100644 butterfree/hooks/__init__.py create mode 100644 butterfree/hooks/hook.py create mode 100644 butterfree/hooks/hookable_component.py create mode 100644 butterfree/hooks/schema_compatibility/__init__.py create mode 100644 butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py create mode 100644 butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py create mode 100644 examples/interval_runs/interval_runs.ipynb create mode 100644 tests/unit/butterfree/dataframe_service/test_incremental_srategy.py create mode 100644 
tests/unit/butterfree/dataframe_service/test_partitioning.py create mode 100644 tests/unit/butterfree/hooks/__init__.py create mode 100644 tests/unit/butterfree/hooks/schema_compatibility/__init__.py create mode 100644 tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py create mode 100644 tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py create mode 100644 tests/unit/butterfree/hooks/test_hookable_component.py create mode 100644 tests/unit/butterfree/pipelines/conftest.py diff --git a/.gitignore b/.gitignore index 72b591f3..62434612 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ coverage.xml *.cover .hypothesis/ *cov.xml +test_folder/ # Translations *.mo diff --git a/CHANGELOG.md b/CHANGELOG.md index 72994621..679e9834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,17 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] ### Added +* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) + +## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) +### Added * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273)) ### Changed * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master ([#280](https://github.com/quintoandar/butterfree/pull/280)) * Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* [MLOP-633] Butterfree dev workflow, update documentation ([#281](https://github.com/quintoandar/butterfree/commit/74278986a49f1825beee0fd8df65a585764e5524)) +* [MLOP-632] Butterfree dev workflow, automate release description ([#279](https://github.com/quintoandar/butterfree/commit/245eaa594846166972241b03fddc61ee5117b1f7)) ### Fixed * Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 1e541688..938d4e4d 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -33,33 +33,31 @@ class CassandraClient(AbstractClient): """Cassandra Client. Attributes: - cassandra_user: username to use in connection. - cassandra_password: password to use in connection. - cassandra_key_space: key space used in connection. - cassandra_host: cassandra endpoint used in connection. + user: username to use in connection. + password: password to use in connection. + keyspace: key space used in connection. + host: cassandra endpoint used in connection. 
""" def __init__( self, - cassandra_host: List[str], - cassandra_key_space: str, - cassandra_user: Optional[str] = None, - cassandra_password: Optional[str] = None, + host: List[str], + keyspace: str, + user: Optional[str] = None, + password: Optional[str] = None, ) -> None: - self.cassandra_host = cassandra_host - self.cassandra_key_space = cassandra_key_space - self.cassandra_user = cassandra_user - self.cassandra_password = cassandra_password + self.host = host + self.keyspace = keyspace + self.user = user + self.password = password self._session: Optional[Session] = None @property def conn(self, *, ssl_path: str = None) -> Session: # type: ignore """Establishes a Cassandra connection.""" auth_provider = ( - PlainTextAuthProvider( - username=self.cassandra_user, password=self.cassandra_password - ) - if self.cassandra_user is not None + PlainTextAuthProvider(username=self.user, password=self.password) + if self.user is not None else None ) ssl_opts = ( @@ -73,12 +71,12 @@ def conn(self, *, ssl_path: str = None) -> Session: # type: ignore ) cluster = Cluster( - contact_points=self.cassandra_host, + contact_points=self.host, auth_provider=auth_provider, ssl_options=ssl_opts, load_balancing_policy=RoundRobinPolicy(), ) - self._session = cluster.connect(self.cassandra_key_space) + self._session = cluster.connect(self.keyspace) self._session.row_factory = dict_factory return self._session @@ -106,7 +104,7 @@ def get_schema(self, table: str) -> List[Dict[str, str]]: """ query = ( f"SELECT column_name, type FROM system_schema.columns " # noqa - f"WHERE keyspace_name = '{self.cassandra_key_space}' " # noqa + f"WHERE keyspace_name = '{self.keyspace}' " # noqa f" AND table_name = '{table}';" # noqa ) @@ -114,8 +112,7 @@ def get_schema(self, table: str) -> List[Dict[str, str]]: if not response: raise RuntimeError( - f"No columns found for table: {table}" - f"in key space: {self.cassandra_key_space}" + f"No columns found for table: {table}" f"in key space: {self.keyspace}" ) return response @@ -143,7 +140,7 @@ def _get_create_table_query( else: columns_str = joined_parsed_columns - query = f"CREATE TABLE {self.cassandra_key_space}.{table} " f"({columns_str}); " + query = f"CREATE TABLE {self.keyspace}.{table} " f"({columns_str}); " return query diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 0a8c717c..0f0113e2 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -34,9 +34,10 @@ def conn(self) -> SparkSession: def read( self, format: str, - options: Dict[str, Any], + path: Optional[Union[str, List[str]]] = None, schema: Optional[StructType] = None, stream: bool = False, + **options: Any, ) -> DataFrame: """Use the SparkSession.read interface to load data into a dataframe. @@ -45,9 +46,10 @@ def read( Args: format: string with the format to be used by the DataframeReader. - options: options to setup the DataframeReader. + path: optional string or a list of string for file-system. stream: flag to indicate if data must be read in stream mode. schema: an optional pyspark.sql.types.StructType for the input schema. + options: options to setup the DataframeReader. 
Returns: Dataframe @@ -55,14 +57,16 @@ def read( """ if not isinstance(format, str): raise ValueError("format needs to be a string with the desired read format") - if not isinstance(options, dict): - raise ValueError("options needs to be a dict with the setup configurations") + if not isinstance(path, (str, list)): + raise ValueError("path needs to be a string or a list of string") df_reader: Union[ DataStreamReader, DataFrameReader ] = self.conn.readStream if stream else self.conn.read + df_reader = df_reader.schema(schema) if schema else df_reader - return df_reader.format(format).options(**options).load() + + return df_reader.format(format).load(path, **options) # type: ignore def read_table(self, table: str, database: str = None) -> DataFrame: """Use the SparkSession.read interface to read a metastore table. @@ -223,3 +227,47 @@ def create_temporary_view(self, dataframe: DataFrame, name: str) -> Any: if not dataframe.isStreaming: return dataframe.createOrReplaceTempView(name) return dataframe.writeStream.format("memory").queryName(name).start() + + def add_table_partitions( + self, partitions: List[Dict[str, Any]], table: str, database: str = None + ) -> None: + """Add partitions to an existing table. + + Args: + partitions: partitions to add to the table. + It's expected a list of partition dicts to add to the table. + Example: `[{"year": 2020, "month": 8, "day": 14}, ...]` + table: table to add the partitions. + database: name of the database where the table is saved. + """ + for partition_dict in partitions: + if not all( + ( + isinstance(key, str) + and (isinstance(value, str) or isinstance(value, int)) + ) + for key, value in partition_dict.items() + ): + raise ValueError( + "Partition keys must be column names " + "and values must be string or int." + ) + + database_expr = f"`{database}`." if database else "" + key_values_expr = [ + ", ".join( + [ + "{} = {}".format(k, v) + if not isinstance(v, str) + else "{} = '{}'".format(k, v) + for k, v in partition.items() + ] + ) + for partition in partitions + ] + partitions_expr = " ".join(f"PARTITION ( {expr} )" for expr in key_values_expr) + command = ( + f"ALTER TABLE {database_expr}`{table}` ADD IF NOT EXISTS {partitions_expr}" + ) + + self.conn.sql(command) diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index d94b792c..a3b315d5 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -3,8 +3,11 @@ import os from typing import Any, Dict, List, Optional +from pyspark.sql import DataFrame + from butterfree.configs import environment from butterfree.configs.db import AbstractWriteConfig +from butterfree.dataframe_service import extract_partition_values class MetastoreConfig(AbstractWriteConfig): @@ -87,6 +90,31 @@ def get_options(self, key: str) -> Dict[Optional[str], Optional[str]]: "path": os.path.join(f"{self.file_system}://{self.path}/", key), } + def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List: + """Get options for AWS S3 from partitioned parquet file. + + Options will be a dictionary with the write and read configuration for + Spark to AWS S3. + + Args: + key: path to save data into AWS S3 bucket. + dataframe: spark dataframe containing data from a feature set. + + Returns: + A list of string for file-system backed data sources. 
+ """ + path_list = [] + dataframe_values = extract_partition_values( + dataframe, partition_columns=["year", "month", "day"] + ) + for row in dataframe_values: + path_list.append( + f"{self.file_system}://{self.path}/{key}/year={row['year']}/" + f"month={row['month']}/day={row['day']}" + ) + + return path_list + def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Translate feature set spark schema to the corresponding database.""" pass diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index 6f5accbc..f98a7a01 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -35,8 +35,8 @@ def get_variable(variable_name: str, default_value: str = None) -> Optional[str] """Gets an environment variable. The variable comes from it's explicitly declared value in the running - environment or from the default value declared in the environment.yaml - specification or from the default_value. + environment or from the default value declared in specification or from the + default_value. Args: variable_name: environment variable name. diff --git a/butterfree/constants/window_definitions.py b/butterfree/constants/window_definitions.py new file mode 100644 index 00000000..560904f7 --- /dev/null +++ b/butterfree/constants/window_definitions.py @@ -0,0 +1,16 @@ +"""Allowed windows units and lengths in seconds.""" + +ALLOWED_WINDOWS = { + "second": 1, + "seconds": 1, + "minute": 60, + "minutes": 60, + "hour": 3600, + "hours": 3600, + "day": 86400, + "days": 86400, + "week": 604800, + "weeks": 604800, + "year": 29030400, + "years": 29030400, +} diff --git a/butterfree/dataframe_service/__init__.py b/butterfree/dataframe_service/__init__.py index 5116261d..c227dae2 100644 --- a/butterfree/dataframe_service/__init__.py +++ b/butterfree/dataframe_service/__init__.py @@ -1,4 +1,11 @@ """Dataframe optimization components regarding Butterfree.""" +from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy +from butterfree.dataframe_service.partitioning import extract_partition_values from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df -__all__ = ["repartition_df", "repartition_sort_df"] +__all__ = [ + "extract_partition_values", + "IncrementalStrategy", + "repartition_df", + "repartition_sort_df", +] diff --git a/butterfree/dataframe_service/incremental_strategy.py b/butterfree/dataframe_service/incremental_strategy.py new file mode 100644 index 00000000..6554d3b7 --- /dev/null +++ b/butterfree/dataframe_service/incremental_strategy.py @@ -0,0 +1,116 @@ +"""IncrementalStrategy entity.""" + +from __future__ import annotations + +from pyspark.sql import DataFrame + + +class IncrementalStrategy: + """Define an incremental strategy to be used on data sources. + + Entity responsible for defining a column expression that will be used to + filter the original data source. The purpose is to get only the data related + to a specific pipeline execution time interval. + + Attributes: + column: column expression on which incremental filter will be applied. + The expression need to result on a date or timestamp format, so the + filter can properly work with the defined upper and lower bounds. + """ + + def __init__(self, column: str = None): + self.column = column + + def from_milliseconds(self, column_name: str) -> IncrementalStrategy: + """Create a column expression from ts column defined as milliseconds. + + Args: + column_name: column name where the filter will be applied. 
+ + Returns: + `IncrementalStrategy` with the defined column expression. + """ + return IncrementalStrategy(column=f"from_unixtime({column_name}/ 1000.0)") + + def from_string(self, column_name: str, mask: str = None) -> IncrementalStrategy: + """Create a column expression from ts column defined as a simple string. + + Args: + column_name: column name where the filter will be applied. + mask: mask defining the date/timestamp format on the string. + + Returns: + `IncrementalStrategy` with the defined column expression. + """ + return IncrementalStrategy(column=f"to_date({column_name}, '{mask}')") + + def from_year_month_day_partitions( + self, + year_column: str = "year", + month_column: str = "month", + day_column: str = "day", + ) -> IncrementalStrategy: + """Create a column expression from year, month and day partitions. + + Args: + year_column: column name from the year partition. + month_column: column name from the month partition. + day_column: column name from the day partition. + + Returns: + `IncrementalStrategy` with the defined column expression. + """ + return IncrementalStrategy( + column=f"concat(string({year_column}), " + f"'-', string({month_column}), " + f"'-', string({day_column}))" + ) + + def get_expression(self, start_date: str = None, end_date: str = None) -> str: + """Get the incremental filter expression using the defined dates. + + Both arguments can be set to defined a specific date interval, but it's + only necessary to set one of the arguments for this method to work. + + Args: + start_date: date lower bound to use in the filter. + end_date: date upper bound to use in the filter. + + Returns: + Filter expression based on defined column and bounds. + + Raises: + ValuerError: If both arguments, start_date and end_date, are None. + ValueError: If the column expression was not defined. + """ + if not self.column: + raise ValueError("column parameter can't be None") + if not (start_date or end_date): + raise ValueError("Both arguments start_date and end_date can't be None.") + if start_date: + expression = f"date({self.column}) >= date('{start_date}')" + if end_date: + expression += f" and date({self.column}) <= date('{end_date}')" + return expression + return f"date({self.column}) <= date('{end_date}')" + + def filter_with_incremental_strategy( + self, dataframe: DataFrame, start_date: str = None, end_date: str = None + ) -> DataFrame: + """Filters the dataframe according to the date boundaries. + + Args: + dataframe: dataframe that will be filtered. + start_date: date lower bound to use in the filter. + end_date: date upper bound to use in the filter. + + Returns: + Filtered dataframe based on defined time boundaries. + """ + return ( + dataframe.where( + self.get_expression(start_date=start_date, end_date=end_date) + ) + if start_date or end_date + else dataframe + ) diff --git a/butterfree/dataframe_service/partitioning.py b/butterfree/dataframe_service/partitioning.py new file mode 100644 index 00000000..21e9b0ab --- /dev/null +++ b/butterfree/dataframe_service/partitioning.py @@ -0,0 +1,25 @@ +"""Module defining partitioning methods.""" + +from typing import Any, Dict, List + +from pyspark.sql import DataFrame + + +def extract_partition_values( + dataframe: DataFrame, partition_columns: List[str] +) -> List[Dict[str, Any]]: + """Extract distinct partition values from a given dataframe. + + Args: + dataframe: dataframe from where to extract partition values. + partition_columns: name of partition columns presented on the dataframe. 
+ + Returns: + distinct partition values. + """ + return ( + dataframe.select(*partition_columns) + .distinct() + .rdd.map(lambda row: row.asDict(True)) + .collect() + ) diff --git a/butterfree/extract/readers/file_reader.py b/butterfree/extract/readers/file_reader.py index 17f68f1c..8cf15599 100644 --- a/butterfree/extract/readers/file_reader.py +++ b/butterfree/extract/readers/file_reader.py @@ -87,9 +87,7 @@ def __init__( self.path = path self.format = format self.schema = schema - self.options = dict( - {"path": self.path}, **format_options if format_options else {} - ) + self.options = dict(format_options if format_options else {}) self.stream = stream def consume(self, client: SparkClient) -> DataFrame: @@ -106,11 +104,15 @@ def consume(self, client: SparkClient) -> DataFrame: """ schema = ( - client.read(format=self.format, options=self.options,).schema + client.read(format=self.format, path=self.path, **self.options).schema if (self.stream and not self.schema) else self.schema ) return client.read( - format=self.format, options=self.options, schema=schema, stream=self.stream, + format=self.format, + schema=schema, + stream=self.stream, + path=self.path, + **self.options, ) diff --git a/butterfree/extract/readers/reader.py b/butterfree/extract/readers/reader.py index 78be2823..597c870f 100644 --- a/butterfree/extract/readers/reader.py +++ b/butterfree/extract/readers/reader.py @@ -2,14 +2,16 @@ from abc import ABC, abstractmethod from functools import reduce -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional from pyspark.sql import DataFrame from butterfree.clients import SparkClient +from butterfree.dataframe_service import IncrementalStrategy +from butterfree.hooks import HookableComponent -class Reader(ABC): +class Reader(ABC, HookableComponent): """Abstract base class for Readers. Attributes: @@ -19,9 +21,11 @@ class Reader(ABC): """ - def __init__(self, id: str): + def __init__(self, id: str, incremental_strategy: IncrementalStrategy = None): + super().__init__() self.id = id self.transformations: List[Dict[str, Any]] = [] + self.incremental_strategy = incremental_strategy def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any @@ -48,14 +52,19 @@ def with_( self.transformations.append(new_transformation) return self - def _apply_transformations(self, df: DataFrame) -> Any: - return reduce( - lambda result_df, transformation: transformation["transformer"]( - result_df, *transformation["args"], **transformation["kwargs"] - ), - self.transformations, - df, - ) + def with_incremental_strategy( + self, incremental_strategy: IncrementalStrategy + ) -> "Reader": + """Define the incremental strategy for the Reader. + + Args: + incremental_strategy: definition of the incremental strategy. + + Returns: + Reader with defined incremental strategy. + """ + self.incremental_strategy = incremental_strategy + return self @abstractmethod def consume(self, client: SparkClient) -> DataFrame: @@ -70,24 +79,61 @@ def consume(self, client: SparkClient) -> DataFrame: :return: Spark dataframe """ - def build(self, client: SparkClient, columns: List[Any] = None) -> None: + def build( + self, + client: SparkClient, + columns: List[Any] = None, + start_date: str = None, + end_date: str = None, + ) -> None: """Register the data got from the reader in the Spark metastore. 
Create a temporary view in Spark metastore referencing the data extracted from the target origin after the application of all the defined pre-processing transformations. + The arguments start_date and end_date are going to be use only when there + is a defined `IncrementalStrategy` on the `Reader`. + Args: client: client responsible for connecting to Spark session. - columns: list of tuples for renaming/filtering the dataset. + columns: list of tuples for selecting/renaming columns on the df. + start_date: lower bound to use in the filter expression. + end_date: upper bound to use in the filter expression. """ - transformed_df = self._apply_transformations(self.consume(client)) - - if columns: - select_expression = [] - for old_expression, new_column_name in columns: - select_expression.append(f"{old_expression} as {new_column_name}") - transformed_df = transformed_df.selectExpr(*select_expression) + column_selection_df = self._select_columns(columns, client) + transformed_df = self._apply_transformations(column_selection_df) + + if self.incremental_strategy: + transformed_df = self.incremental_strategy.filter_with_incremental_strategy( + transformed_df, start_date, end_date + ) + + post_hook_df = self.run_post_hooks(transformed_df) + + post_hook_df.createOrReplaceTempView(self.id) + + def _select_columns( + self, columns: Optional[List[Any]], client: SparkClient + ) -> DataFrame: + df = self.consume(client) + return df.selectExpr( + *( + [ + f"{old_expression} as {new_column_name}" + for old_expression, new_column_name in columns + ] + if columns + else df.columns + ) + ) - transformed_df.createOrReplaceTempView(self.id) + def _apply_transformations(self, df: DataFrame) -> DataFrame: + return reduce( + lambda result_df, transformation: transformation["transformer"]( + result_df, *transformation["args"], **transformation["kwargs"] + ), + self.transformations, + df, + ) diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 00ac9e43..6d905c6b 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -6,9 +6,10 @@ from butterfree.clients import SparkClient from butterfree.extract.readers.reader import Reader +from butterfree.hooks import HookableComponent -class Source: +class Source(HookableComponent): """The definition of the the entry point data for the ETL pipeline. A FeatureSet (the next step in the pipeline) expects a single dataframe as @@ -51,31 +52,44 @@ class Source: """ def __init__(self, readers: List[Reader], query: str) -> None: + super().__init__() + self.enable_pre_hooks = False self.readers = readers self.query = query - def construct(self, client: SparkClient) -> DataFrame: + def construct( + self, client: SparkClient, start_date: str = None, end_date: str = None + ) -> DataFrame: """Construct an entry point dataframe for a feature set. This method will assemble multiple readers, by building each one and - querying them using a Spark SQL. + querying them using a Spark SQL. It's important to highlight that in + order to filter a dataframe regarding date boundaries, it's important + to define a IncrementalStrategy, otherwise your data will not be filtered. + Besides, both start and end dates parameters are optional. After that, there's the caching of the dataframe, however since cache() in Spark is lazy, an action is triggered in order to force persistence. Args: client: client responsible for connecting to Spark session. + start_date: user defined start date for filtering. + end_date: user defined end date for filtering. 
Returns: DataFrame with the query result against all readers. """ for reader in self.readers: - reader.build(client) # create temporary views for each reader + reader.build( + client=client, start_date=start_date, end_date=end_date + ) # create temporary views for each reader dataframe = client.sql(self.query) if not dataframe.isStreaming: dataframe.cache().count() - return dataframe + post_hook_df = self.run_post_hooks(dataframe) + + return post_hook_df diff --git a/butterfree/hooks/__init__.py b/butterfree/hooks/__init__.py new file mode 100644 index 00000000..90bedeb2 --- /dev/null +++ b/butterfree/hooks/__init__.py @@ -0,0 +1,5 @@ +"""Holds Hooks definitions.""" +from butterfree.hooks.hook import Hook +from butterfree.hooks.hookable_component import HookableComponent + +__all__ = ["Hook", "HookableComponent"] diff --git a/butterfree/hooks/hook.py b/butterfree/hooks/hook.py new file mode 100644 index 00000000..f7d8c562 --- /dev/null +++ b/butterfree/hooks/hook.py @@ -0,0 +1,20 @@ +"""Hook abstract class entity.""" + +from abc import ABC, abstractmethod + +from pyspark.sql import DataFrame + + +class Hook(ABC): + """Definition of a hook function to call on a Dataframe.""" + + @abstractmethod + def run(self, dataframe: DataFrame) -> DataFrame: + """Run interface for Hook. + + Args: + dataframe: dataframe to use in the Hook. + + Returns: + dataframe result from the Hook. + """ diff --git a/butterfree/hooks/hookable_component.py b/butterfree/hooks/hookable_component.py new file mode 100644 index 00000000..d89babce --- /dev/null +++ b/butterfree/hooks/hookable_component.py @@ -0,0 +1,148 @@ +"""Definition of hookable component.""" + +from __future__ import annotations + +from typing import List + +from pyspark.sql import DataFrame + +from butterfree.hooks.hook import Hook + + +class HookableComponent: + """Defines a component with the ability to hold pre and post hook functions. + + All main module of Butterfree have a common object that enables their integration: + dataframes. Spark's dataframe is the glue that enables the transmission of data + between the main modules. Hooks have a simple interface, they are functions that + accepts a dataframe and outputs a dataframe. These Hooks can be triggered before or + after the main execution of a component. + + Components from Butterfree that inherit HookableComponent entity, are components + that can define a series of steps to occur before or after the execution of their + main functionality. + + Attributes: + pre_hooks: function steps to trigger before component main functionality. + post_hooks: function steps to trigger after component main functionality. + enable_pre_hooks: property to indicate if the component can define pre_hooks. + enable_post_hooks: property to indicate if the component can define post_hooks. + """ + + def __init__(self) -> None: + self.pre_hooks = [] + self.post_hooks = [] + self.enable_pre_hooks = True + self.enable_post_hooks = True + + @property + def pre_hooks(self) -> List[Hook]: + """Function steps to trigger before component main functionality.""" + return self.__pre_hook + + @pre_hooks.setter + def pre_hooks(self, value: List[Hook]) -> None: + if not isinstance(value, list): + raise ValueError("pre_hooks should be a list of Hooks.") + if not all(isinstance(item, Hook) for item in value): + raise ValueError( + "All items on pre_hooks list should be an instance of Hook." 
+ ) + self.__pre_hook = value + + @property + def post_hooks(self) -> List[Hook]: + """Function steps to trigger after component main functionality.""" + return self.__post_hook + + @post_hooks.setter + def post_hooks(self, value: List[Hook]) -> None: + if not isinstance(value, list): + raise ValueError("post_hooks should be a list of Hooks.") + if not all(isinstance(item, Hook) for item in value): + raise ValueError( + "All items on post_hooks list should be an instance of Hook." + ) + self.__post_hook = value + + @property + def enable_pre_hooks(self) -> bool: + """Property to indicate if the component can define pre_hooks.""" + return self.__enable_pre_hooks + + @enable_pre_hooks.setter + def enable_pre_hooks(self, value: bool) -> None: + if not isinstance(value, bool): + raise ValueError("enable_pre_hooks accepts only boolean values.") + self.__enable_pre_hooks = value + + @property + def enable_post_hooks(self) -> bool: + """Property to indicate if the component can define post_hooks.""" + return self.__enable_post_hooks + + @enable_post_hooks.setter + def enable_post_hooks(self, value: bool) -> None: + if not isinstance(value, bool): + raise ValueError("enable_post_hooks accepts only boolean values.") + self.__enable_post_hooks = value + + def add_pre_hook(self, *hooks: Hook) -> HookableComponent: + """Add a pre-hook steps to the component. + + Args: + hooks: Hook steps to add to pre_hook list. + + Returns: + Component with the Hook inserted in pre_hook list. + + Raises: + ValueError: if the component does not accept pre-hooks. + """ + if not self.enable_pre_hooks: + raise ValueError("This component does not enable adding pre-hooks") + self.pre_hooks += list(hooks) + return self + + def add_post_hook(self, *hooks: Hook) -> HookableComponent: + """Add a post-hook steps to the component. + + Args: + hooks: Hook steps to add to post_hook list. + + Returns: + Component with the Hook inserted in post_hook list. + + Raises: + ValueError: if the component does not accept post-hooks. + """ + if not self.enable_post_hooks: + raise ValueError("This component does not enable adding post-hooks") + self.post_hooks += list(hooks) + return self + + def run_pre_hooks(self, dataframe: DataFrame) -> DataFrame: + """Run all defined pre-hook steps from a given dataframe. + + Args: + dataframe: data to input in the defined pre-hook steps. + + Returns: + dataframe after passing for all defined pre-hooks. + """ + for hook in self.pre_hooks: + dataframe = hook.run(dataframe) + return dataframe + + def run_post_hooks(self, dataframe: DataFrame) -> DataFrame: + """Run all defined post-hook steps from a given dataframe. + + Args: + dataframe: data to input in the defined post-hook steps. + + Returns: + dataframe after passing for all defined post-hooks. 
+ """ + for hook in self.post_hooks: + dataframe = hook.run(dataframe) + return dataframe diff --git a/butterfree/hooks/schema_compatibility/__init__.py b/butterfree/hooks/schema_compatibility/__init__.py new file mode 100644 index 00000000..edf748bf --- /dev/null +++ b/butterfree/hooks/schema_compatibility/__init__.py @@ -0,0 +1,9 @@ +"""Holds Schema Compatibility Hooks definitions.""" +from butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook import ( # noqa + CassandraTableSchemaCompatibilityHook, +) +from butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook import ( # noqa + SparkTableSchemaCompatibilityHook, +) + +__all__ = ["SparkTableSchemaCompatibilityHook", "CassandraTableSchemaCompatibilityHook"] diff --git a/butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py b/butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py new file mode 100644 index 00000000..cdb40472 --- /dev/null +++ b/butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py @@ -0,0 +1,58 @@ +"""Cassandra table schema compatibility Hook definition.""" + +from pyspark.sql import DataFrame + +from butterfree.clients import CassandraClient +from butterfree.constants import DataType +from butterfree.hooks.hook import Hook + + +class CassandraTableSchemaCompatibilityHook(Hook): + """Hook to verify the schema compatibility with a Cassandra's table. + + Verifies if all columns presented on the dataframe exists and are the same + type on the target Cassandra's table. + + Attributes: + cassandra_client: client to connect to Cassandra DB. + table: table name. + """ + + def __init__(self, cassandra_client: CassandraClient, table: str): + self.cassandra_client = cassandra_client + self.table = table + + def run(self, dataframe: DataFrame) -> DataFrame: + """Check the schema compatibility from a given Dataframe. + + This method does not change anything on the Dataframe. + + Args: + dataframe: dataframe to verify schema compatibility. + + Returns: + unchanged dataframe. + + Raises: + ValueError if the schemas are incompatible. + """ + table_schema = self.cassandra_client.get_schema(self.table) + type_cassandra = [ + type.cassandra + for field_id in range(len(dataframe.schema.fieldNames())) + for type in DataType + if dataframe.schema.fields.__getitem__(field_id).dataType == type.spark + ] + schema = [ + {"column_name": f"{column}", "type": f"{type}"} + for column, type in zip(dataframe.columns, type_cassandra) + ] + + if not all([column in table_schema for column in schema]): + raise ValueError( + "There's a schema incompatibility " + "between the defined dataframe and the Cassandra table.\n" + f"Dataframe schema = {schema}" + f"Target table schema = {table_schema}" + ) + return dataframe diff --git a/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py new file mode 100644 index 00000000..b08dd56a --- /dev/null +++ b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py @@ -0,0 +1,46 @@ +"""Spark table schema compatibility Hook definition.""" + +from pyspark.sql import DataFrame + +from butterfree.clients import SparkClient +from butterfree.hooks.hook import Hook + + +class SparkTableSchemaCompatibilityHook(Hook): + """Hook to verify the schema compatibility with a Spark's table. 
+ + Verifies if all columns presented on the dataframe exists and are the same + type on the target Spark's table. + + Attributes: + spark_client: client to connect to Spark's metastore. + table: table name. + database: database name. + """ + + def __init__(self, spark_client: SparkClient, table: str, database: str = None): + self.spark_client = spark_client + self.table_expression = (f"`{database}`." if database else "") + f"`{table}`" + + def run(self, dataframe: DataFrame) -> DataFrame: + """Check the schema compatibility from a given Dataframe. + + This method does not change anything on the Dataframe. + + Args: + dataframe: dataframe to verify schema compatibility. + + Returns: + unchanged dataframe. + + Raises: + ValueError if the schemas are incompatible. + """ + table_schema = self.spark_client.conn.table(self.table_expression).schema + if not all([column in table_schema for column in dataframe.schema]): + raise ValueError( + "The dataframe has a schema incompatible with the defined table.\n" + f"Dataframe schema = {dataframe.schema}" + f"Target table schema = {table_schema}" + ) + return dataframe diff --git a/butterfree/load/sink.py b/butterfree/load/sink.py index b4bf93e8..0b0c10c9 100644 --- a/butterfree/load/sink.py +++ b/butterfree/load/sink.py @@ -5,13 +5,14 @@ from pyspark.sql.streaming import StreamingQuery from butterfree.clients import SparkClient +from butterfree.hooks import HookableComponent from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet from butterfree.validations import BasicValidation from butterfree.validations.validation import Validation -class Sink: +class Sink(HookableComponent): """Define the destinations for the feature set pipeline. A Sink is created from a set of writers. The main goal of the Sink is to @@ -26,6 +27,8 @@ class Sink: """ def __init__(self, writers: List[Writer], validation: Optional[Validation] = None): + super().__init__() + self.enable_post_hooks = False self.writers = writers self.validation = validation @@ -94,12 +97,16 @@ def flush( Streaming handlers for each defined writer, if writing streaming dfs. 
""" + pre_hook_df = self.run_pre_hooks(dataframe) + if self.validation is not None: - self.validation.input(dataframe).check() + self.validation.input(pre_hook_df).check() handlers = [ writer.write( - feature_set=feature_set, dataframe=dataframe, spark_client=spark_client + feature_set=feature_set, + dataframe=pre_hook_df, + spark_client=spark_client, ) for writer in self.writers ] diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index d70f68f0..456d9e6b 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Historical Feature Store writer class.""" import os -from typing import Union +from typing import Any, Union from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import dayofmonth, month, year @@ -12,6 +12,8 @@ from butterfree.constants import columns from butterfree.constants.spark_constants import DEFAULT_NUM_PARTITIONS from butterfree.dataframe_service import repartition_df +from butterfree.hooks import Hook +from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -60,6 +62,20 @@ class HistoricalFeatureStoreWriter(Writer): For what settings you can use on S3Config and default settings, to read S3Config class. + We can write with interval mode, where HistoricalFeatureStoreWrite + will need to use Dynamic Partition Inserts, + the behaviour of OVERWRITE keyword is controlled by + spark.sql.sources.partitionOverwriteMode configuration property. + The dynamic overwrite mode is enabled Spark will only delete the + partitions for which it has data to be written to. + All the other partitions remain intact. + + >>> spark_client = SparkClient() + >>> writer = HistoricalFeatureStoreWriter(interval_mode=True) + >>> writer.write(feature_set=feature_set, + ... dataframe=dataframe, + ... spark_client=spark_client) + We can instantiate HistoricalFeatureStoreWriter class to validate the df to be written. 
@@ -95,15 +111,17 @@ def __init__( num_partitions: int = None, validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD, debug_mode: bool = False, + interval_mode: bool = False, + check_schema_hook: Hook = None, ): - super(HistoricalFeatureStoreWriter, self).__init__() + super(HistoricalFeatureStoreWriter, self).__init__(debug_mode, interval_mode) self.db_config = db_config or MetastoreConfig() self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" ) self.num_partitions = num_partitions or DEFAULT_NUM_PARTITIONS self.validation_threshold = validation_threshold - self.debug_mode = debug_mode + self.check_schema_hook = check_schema_hook def write( self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, @@ -122,7 +140,25 @@ def write( """ dataframe = self._create_partitions(dataframe) - dataframe = self._apply_transformations(dataframe) + partition_df = self._apply_transformations(dataframe) + + if self.debug_mode: + dataframe = partition_df + else: + dataframe = self.check_schema( + spark_client, partition_df, feature_set.name, self.database + ) + + if self.interval_mode: + if self.debug_mode: + spark_client.create_temporary_view( + dataframe=dataframe, + name=f"historical_feature_store__{feature_set.name}", + ) + return + + self._incremental_mode(feature_set, dataframe, spark_client) + return if self.debug_mode: spark_client.create_temporary_view( @@ -132,6 +168,7 @@ def write( return s3_key = os.path.join("historical", feature_set.entity, feature_set.name) + spark_client.write_table( dataframe=dataframe, database=self.database, @@ -140,6 +177,34 @@ def write( **self.db_config.get_options(s3_key), ) + def _incremental_mode( + self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient + ) -> None: + + partition_overwrite_mode = spark_client.conn.conf.get( + "spark.sql.sources.partitionOverwriteMode" + ).lower() + + if partition_overwrite_mode != "dynamic": + raise RuntimeError( + "m=load_incremental_table, " + "spark.sql.sources.partitionOverwriteMode={}, " + "msg=partitionOverwriteMode have to be configured to 'dynamic'".format( + partition_overwrite_mode + ) + ) + + s3_key = os.path.join("historical", feature_set.entity, feature_set.name) + options = {"path": self.db_config.get_options(s3_key).get("path")} + + spark_client.write_dataframe( + dataframe=dataframe, + format_=self.db_config.format_, + mode=self.db_config.mode, + **options, + partitionBy=self.PARTITION_BY, + ) + def _assert_validation_count( self, table_name: str, written_count: int, dataframe_count: int ) -> None: @@ -169,12 +234,26 @@ def validate( """ table_name = ( - f"{self.database}.{feature_set.name}" - if not self.debug_mode - else f"historical_feature_store__{feature_set.name}" + f"{feature_set.name}" + if self.interval_mode and not self.debug_mode + else ( + f"{self.database}.{feature_set.name}" + if not self.debug_mode + else f"historical_feature_store__{feature_set.name}" + ) + ) + + written_count = ( + spark_client.read( + self.db_config.format_, + path=self.db_config.get_path_with_partitions(table_name, dataframe), + ).count() + if self.interval_mode and not self.debug_mode + else spark_client.read_table(table_name).count() ) - written_count = spark_client.read_table(table_name).count() + dataframe_count = dataframe.count() + self._assert_validation_count(table_name, written_count, dataframe_count) def _create_partitions(self, dataframe: DataFrame) -> DataFrame: @@ -191,3 +270,21 @@ def _create_partitions(self, dataframe: DataFrame) 
-> DataFrame: columns.PARTITION_DAY, dayofmonth(dataframe[columns.TIMESTAMP_COLUMN]) ) return repartition_df(dataframe, self.PARTITION_BY, self.num_partitions) + + def check_schema( + self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + ) -> DataFrame: + """Instantiate the schema check hook to check schema between dataframe and database. + + Args: + client: client for Spark or Cassandra connections with external services. + dataframe: Spark dataframe containing data from a feature set. + table_name: table name where the dataframe will be saved. + database: database name where the dataframe will be saved. + """ + if not self.check_schema_hook: + self.check_schema_hook = SparkTableSchemaCompatibilityHook( + client, table_name, database + ) + + return self.check_schema_hook.run(dataframe) diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index a81a1040..fade3789 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -7,9 +7,11 @@ from pyspark.sql.functions import col, row_number from pyspark.sql.streaming import StreamingQuery -from butterfree.clients import SparkClient +from butterfree.clients import CassandraClient, SparkClient from butterfree.configs.db import AbstractWriteConfig, CassandraConfig from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.hooks import Hook +from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -66,6 +68,12 @@ class OnlineFeatureStoreWriter(Writer): Both methods (writer and validate) will need the Spark Client, Feature Set and DataFrame, to write or to validate, according to OnlineFeatureStoreWriter class arguments. + + There's an important aspect to be highlighted here: if you're using + the incremental mode, we do not check if your data is the newest before + writing to the online feature store. + + This behavior is known and will be fixed soon. 
""" __name__ = "Online Feature Store Writer" @@ -75,11 +83,13 @@ def __init__( db_config: Union[AbstractWriteConfig, CassandraConfig] = None, debug_mode: bool = False, write_to_entity: bool = False, + interval_mode: bool = False, + check_schema_hook: Hook = None, ): - super(OnlineFeatureStoreWriter, self).__init__() + super(OnlineFeatureStoreWriter, self).__init__(debug_mode, interval_mode) self.db_config = db_config or CassandraConfig() - self.debug_mode = debug_mode self.write_to_entity = write_to_entity + self.check_schema_hook = check_schema_hook @staticmethod def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame: @@ -170,6 +180,22 @@ def write( """ table_name = feature_set.entity if self.write_to_entity else feature_set.name + if not self.debug_mode: + config = ( + self.db_config + if self.db_config == CassandraConfig + else CassandraConfig() + ) + + cassandra_client = CassandraClient( + host=[config.host], + keyspace=config.keyspace, + user=config.username, + password=config.password, + ) + + dataframe = self.check_schema(cassandra_client, dataframe, table_name) + if dataframe.isStreaming: dataframe = self._apply_transformations(dataframe) if self.debug_mode: @@ -236,3 +262,21 @@ def get_db_schema(self, feature_set: FeatureSet) -> List[Dict[Any, Any]]: """ db_schema = self.db_config.translate(feature_set.get_schema()) return db_schema + + def check_schema( + self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + ) -> DataFrame: + """Instantiate the schema check hook to check schema between dataframe and database. + + Args: + client: client for Spark or Cassandra connections with external services. + dataframe: Spark dataframe containing data from a feature set. + table_name: table name where the dataframe will be saved. + database: database name where the dataframe will be saved. + """ + if not self.check_schema_hook: + self.check_schema_hook = CassandraTableSchemaCompatibilityHook( + client, table_name + ) + + return self.check_schema_hook.run(dataframe) diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index f76b4c25..7e0f9018 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -7,10 +7,11 @@ from pyspark.sql.dataframe import DataFrame from butterfree.clients import SparkClient +from butterfree.hooks import HookableComponent from butterfree.transform import FeatureSet -class Writer(ABC): +class Writer(ABC, HookableComponent): """Abstract base class for Writers. Args: @@ -18,8 +19,11 @@ class Writer(ABC): """ - def __init__(self) -> None: + def __init__(self, debug_mode: bool = False, interval_mode: bool = False) -> None: + super().__init__() self.transformations: List[Dict[str, Any]] = [] + self.debug_mode = debug_mode + self.interval_mode = interval_mode def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any @@ -70,6 +74,19 @@ def write( """ + @abstractmethod + def check_schema( + self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + ) -> DataFrame: + """Instantiate the schema check hook to check schema between dataframe and database. + + Args: + client: client for Spark or Cassandra connections with external services. + dataframe: Spark dataframe containing data from a feature set. + table_name: table name where the dataframe will be saved. + database: database name where the dataframe will be saved. 
+ """ + @abstractmethod def validate( self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index ce1b7ba4..8aec54ec 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -40,11 +40,12 @@ class FeatureSetPipeline: ... ) >>> from butterfree.load import Sink >>> from butterfree.load.writers import HistoricalFeatureStoreWriter - >>> import pyspark.sql.functions as F + >>> from pyspark.sql import functions >>> def divide(df, fs, column1, column2): ... name = fs.get_output_columns()[0] - ... df = df.withColumn(name, F.col(column1) / F.col(column2)) + ... df = df.withColumn(name, + ... functions.col(column1) / functions.col(column2)) ... return df >>> pipeline = FeatureSetPipeline( @@ -67,7 +68,8 @@ class FeatureSetPipeline: ... name="feature1", ... description="test", ... transformation=SparkFunctionTransform( - ... functions=[F.avg, F.stddev_pop] + ... functions=[Function(functions.avg, DataType.DOUBLE), + ... Function(functions.stddev_pop, DataType.DOUBLE)], ... ).with_window( ... partition_by="id", ... order_by=TIMESTAMP_COLUMN, @@ -113,6 +115,19 @@ class FeatureSetPipeline: the defined sources, compute all the transformations and save the data to the specified locations. + We can run the pipeline over a range of dates by passing an end-date + and a start-date, where it will only bring data within this date range. + + >>> pipeline.run(end_date="2020-08-04", start_date="2020-07-04") + + Or run up to a date, where it will only bring data up to the specific date. + + >>> pipeline.run(end_date="2020-08-04") + + Or just a specific date, where you will only bring data for that day. + + >>> pipeline.run_for_date(execution_date="2020-08-04") + """ def __init__( @@ -179,6 +194,7 @@ def run( partition_by: List[str] = None, order_by: List[str] = None, num_processors: int = None, + start_date: str = None, ) -> None: """Runs the defined feature set pipeline. @@ -192,7 +208,11 @@ def run( soon. Use only if strictly necessary. """ - dataframe = self.source.construct(client=self.spark_client) + dataframe = self.source.construct( + client=self.spark_client, + start_date=self.feature_set.define_start_date(start_date), + end_date=end_date, + ) if partition_by: order_by = order_by or partition_by @@ -203,6 +223,7 @@ def run( dataframe = self.feature_set.construct( dataframe=dataframe, client=self.spark_client, + start_date=start_date, end_date=end_date, num_processors=num_processors, ) @@ -219,3 +240,30 @@ def run( feature_set=self.feature_set, spark_client=self.spark_client, ) + + def run_for_date( + self, + execution_date: str = None, + partition_by: List[str] = None, + order_by: List[str] = None, + num_processors: int = None, + ) -> None: + """Runs the defined feature set pipeline for a specific date. + + The pipeline consists in the following steps: + + - Constructs the input dataframe from the data source. + - Construct the feature set dataframe using the defined Features. + - Load the data to the configured sink locations. + + It's important to notice, however, that both parameters partition_by + and num_processors are WIP, we intend to enhance their functionality + soon. Use only if strictly necessary. 
+ """ + self.run( + start_date=execution_date, + end_date=execution_date, + partition_by=partition_by, + order_by=order_by, + num_processors=num_processors, + ) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index f43c12d5..a19efb35 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -1,6 +1,6 @@ """AggregatedFeatureSet entity.""" import itertools -from datetime import timedelta +from datetime import datetime, timedelta from functools import reduce from typing import Any, Dict, List, Optional, Union @@ -8,6 +8,7 @@ from pyspark.sql import DataFrame, functions from butterfree.clients import SparkClient +from butterfree.constants.window_definitions import ALLOWED_WINDOWS from butterfree.dataframe_service import repartition_df from butterfree.transform import FeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature @@ -488,12 +489,45 @@ def get_schema(self) -> List[Dict[str, Any]]: return schema + @staticmethod + def _get_biggest_window_in_days(definitions: List[str]) -> float: + windows_list = [] + for window in definitions: + windows_list.append( + int(window.split()[0]) * ALLOWED_WINDOWS[window.split()[1]] + ) + return max(windows_list) / (60 * 60 * 24) + + def define_start_date(self, start_date: str = None) -> Optional[str]: + """Get aggregated feature set start date. + + Args: + start_date: start date regarding source dataframe. + + Returns: + start date. + """ + if self._windows and start_date: + window_definition = [ + definition.frame_boundaries.window_definition + for definition in self._windows + ] + biggest_window = self._get_biggest_window_in_days(window_definition) + + return ( + datetime.strptime(start_date, "%Y-%m-%d") + - timedelta(days=int(biggest_window) + 1) + ).strftime("%Y-%m-%d") + + return start_date + def construct( self, dataframe: DataFrame, client: SparkClient, end_date: str = None, num_processors: int = None, + start_date: str = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. @@ -506,6 +540,7 @@ def construct( client: client responsible for connecting to Spark session. end_date: user defined max date for having aggregated data (exclusive). num_processors: cluster total number of processors for repartitioning. + start_date: user defined min date for having aggregated data. Returns: Spark dataframe with all the feature columns. 
@@ -519,10 +554,12 @@ def construct( if not isinstance(dataframe, DataFrame): raise ValueError("source_df must be a dataframe") + pre_hook_df = self.run_pre_hooks(dataframe) + output_df = reduce( lambda df, feature: feature.transform(df), self.keys + [self.timestamp], - dataframe, + pre_hook_df, ) if self._windows and end_date is not None: @@ -558,6 +595,10 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) @@ -565,4 +606,6 @@ def construct( output_df = self._filter_duplicated_rows(output_df) output_df.cache().count() - return output_df + post_hook_df = self.run_post_hooks(output_df) + + return post_hook_df diff --git a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index c35e90fa..c2e40a49 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -1,7 +1,7 @@ """FeatureSet entity.""" import itertools from functools import reduce -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import pyspark.sql.functions as F from pyspark.sql import Window @@ -9,6 +9,8 @@ from butterfree.clients import SparkClient from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.dataframe_service import IncrementalStrategy +from butterfree.hooks import HookableComponent from butterfree.transform.features import Feature, KeyFeature, TimestampFeature from butterfree.transform.transformations import ( AggregatedTransform, @@ -16,7 +18,7 @@ ) -class FeatureSet: +class FeatureSet(HookableComponent): """Holds metadata about the feature set and constructs the final dataframe. Attributes: @@ -106,12 +108,14 @@ def __init__( timestamp: TimestampFeature, features: List[Feature], ) -> None: + super().__init__() self.name = name self.entity = entity self.description = description self.keys = keys self.timestamp = timestamp self.features = features + self.incremental_strategy = IncrementalStrategy(column=TIMESTAMP_COLUMN) @property def name(self) -> str: @@ -243,9 +247,6 @@ def columns(self) -> List[str]: def get_schema(self) -> List[Dict[str, Any]]: """Get feature set schema. - Args: - feature_set: object processed with feature set metadata. - Returns: List of dicts regarding cassandra feature set schema. @@ -378,12 +379,24 @@ def _filter_duplicated_rows(self, df: DataFrame) -> DataFrame: return df.select([column for column in self.columns]) + def define_start_date(self, start_date: str = None) -> Optional[str]: + """Get feature set start date. + + Args: + start_date: start date regarding source dataframe. + + Returns: + start date. + """ + return start_date + def construct( self, dataframe: DataFrame, client: SparkClient, end_date: str = None, num_processors: int = None, + start_date: str = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. @@ -393,7 +406,8 @@ def construct( Args: dataframe: input dataframe to be transformed by the features. client: client responsible for connecting to Spark session. - end_date: user defined base date. + start_date: user defined start date. + end_date: user defined end date. num_processors: cluster total number of processors for repartitioning. 
Returns: @@ -403,14 +417,22 @@ def construct( if not isinstance(dataframe, DataFrame): raise ValueError("source_df must be a dataframe") + pre_hook_df = self.run_pre_hooks(dataframe) + output_df = reduce( lambda df, feature: feature.transform(df), self.keys + [self.timestamp] + self.features, - dataframe, + pre_hook_df, ).select(*self.columns) if not output_df.isStreaming: output_df = self._filter_duplicated_rows(output_df) output_df.cache().count() - return output_df + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + + post_hook_df = self.run_post_hooks(output_df) + + return post_hook_df diff --git a/butterfree/transform/utils/window_spec.py b/butterfree/transform/utils/window_spec.py index f3a392f6..a270fec0 100644 --- a/butterfree/transform/utils/window_spec.py +++ b/butterfree/transform/utils/window_spec.py @@ -5,6 +5,7 @@ from pyspark.sql import Column, WindowSpec, functions from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.constants.window_definitions import ALLOWED_WINDOWS class FrameBoundaries: @@ -16,21 +17,6 @@ class FrameBoundaries: it can be second(s), minute(s), hour(s), day(s), week(s) and year(s), """ - __ALLOWED_WINDOWS = { - "second": 1, - "seconds": 1, - "minute": 60, - "minutes": 60, - "hour": 3600, - "hours": 3600, - "day": 86400, - "days": 86400, - "week": 604800, - "weeks": 604800, - "year": 29030400, - "years": 29030400, - } - def __init__(self, mode: Optional[str], window_definition: str): self.mode = mode self.window_definition = window_definition @@ -46,7 +32,7 @@ def window_size(self) -> int: def window_unit(self) -> str: """Returns window unit.""" unit = self.window_definition.split()[1] - if unit not in self.__ALLOWED_WINDOWS and self.mode != "row_windows": + if unit not in ALLOWED_WINDOWS and self.mode != "row_windows": raise ValueError("Not allowed") return unit @@ -59,7 +45,7 @@ def get(self, window: WindowSpec) -> Any: span = self.window_size - 1 return window.rowsBetween(-span, 0) if self.mode == "fixed_windows": - span = self.__ALLOWED_WINDOWS[self.window_unit] * self.window_size + span = ALLOWED_WINDOWS[self.window_unit] * self.window_size return window.rangeBetween(-span, 0) diff --git a/examples/interval_runs/interval_runs.ipynb b/examples/interval_runs/interval_runs.ipynb new file mode 100644 index 00000000..e234da8a --- /dev/null +++ b/examples/interval_runs/interval_runs.ipynb @@ -0,0 +1,2152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# #5 Discovering Butterfree - Interval Runs\n", + "\n", + "Welcome to Discovering Butterfree tutorial series!\n", + "\n", + "This is the fifth tutorial of this series: its goal is to cover interval runs.\n", + "\n", + "Before diving into the tutorial make sure you have a basic understanding of these main data concepts: features, feature sets and the \"Feature Store Architecture\", you can read more about this [here]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example:\n", + "\n", + "Simulating the following scenario (the same from previous tutorials):\n", + "\n", + "- We want to create a feature set with features about houses for rent (listings).\n", + "\n", + "\n", + "We have an input dataset:\n", + "\n", + "- Table: `listing_events`. 
Table with data about events of house listings.\n", + "\n", + "\n", + "Our desire is to have three resulting datasets with the following schema:\n", + "\n", + "* id: **int**;\n", + "* timestamp: **timestamp**;\n", + "* rent__avg_over_1_day_rolling_windows: **double**;\n", + "* rent__stddev_pop_over_1_day_rolling_windows: **double**.\n", + " \n", + "The first dataset will be computed with just an end date time limit. The second one, on the other hand, uses both start and end date in order to filter data. Finally, the third one will be the result of a daily run. You can understand more about these definitions in our documentation.\n", + "\n", + "The following code blocks will show how to generate this feature set using Butterfree library:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# setup spark\n", + "from pyspark import SparkContext, SparkConf\n", + "from pyspark.sql import session\n", + "\n", + "conf = SparkConf().setAll([('spark.driver.host','127.0.0.1'), ('spark.sql.session.timeZone', 'UTC')])\n", + "sc = SparkContext(conf=conf)\n", + "spark = session.SparkSession(sc)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# fix working dir\n", + "import pathlib\n", + "import os\n", + "path = os.path.join(pathlib.Path().absolute(), '../..')\n", + "os.chdir(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Showing test data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "listing_events_df = spark.read.json(f\"{path}/examples/data/listing_events.json\")\n", + "listing_events_df.createOrReplaceTempView(\"listing_events\") # creating listing_events view\n", + "\n", + "region = spark.read.json(f\"{path}/examples/data/region.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Listing events table:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areabathroomsbedroomsidregion_idrenttimestamp
050111113001588302000000
150111120001588647600000
2100122215001588734000000
3100122225001589252400000
4150223330001589943600000
5175224432001589943600000
6250335532001590030000000
7225326632001590116400000
\n", + "
" + ], + "text/plain": [ + " area bathrooms bedrooms id region_id rent timestamp\n", + "0 50 1 1 1 1 1300 1588302000000\n", + "1 50 1 1 1 1 2000 1588647600000\n", + "2 100 1 2 2 2 1500 1588734000000\n", + "3 100 1 2 2 2 2500 1589252400000\n", + "4 150 2 2 3 3 3000 1589943600000\n", + "5 175 2 2 4 4 3200 1589943600000\n", + "6 250 3 3 5 5 3200 1590030000000\n", + "7 225 3 2 6 6 3200 1590116400000" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "listing_events_df.toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Region table:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityidlatlngregion
0Cerulean173.4448931.75030Kanto
1Veridian2-9.43510-167.11772Kanto
2Cinnabar329.73043117.66164Kanto
3Pallet4-52.95717-81.15251Kanto
4Violet5-47.35798-178.77255Johto
5Olivine651.7282046.21958Johto
\n", + "
" + ], + "text/plain": [ + " city id lat lng region\n", + "0 Cerulean 1 73.44489 31.75030 Kanto\n", + "1 Veridian 2 -9.43510 -167.11772 Kanto\n", + "2 Cinnabar 3 29.73043 117.66164 Kanto\n", + "3 Pallet 4 -52.95717 -81.15251 Kanto\n", + "4 Violet 5 -47.35798 -178.77255 Johto\n", + "5 Olivine 6 51.72820 46.21958 Johto" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "region.toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract\n", + "\n", + "- For the extract part, we need the `Source` entity and the `FileReader` for the data we have;\n", + "- We need to declare a query in order to bring the results from our lonely reader (it's as simples as a select all statement)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from butterfree.clients import SparkClient\n", + "from butterfree.extract import Source\n", + "from butterfree.extract.readers import FileReader, TableReader\n", + "from butterfree.extract.pre_processing import filter\n", + "\n", + "readers = [\n", + " TableReader(id=\"listing_events\", table=\"listing_events\",),\n", + " FileReader(id=\"region\", path=f\"{path}/examples/data/region.json\", format=\"json\",)\n", + "]\n", + "\n", + "query = \"\"\"\n", + "select\n", + " listing_events.*,\n", + " region.city,\n", + " region.region,\n", + " region.lat,\n", + " region.lng,\n", + " region.region as region_name\n", + "from\n", + " listing_events\n", + " join region\n", + " on listing_events.region_id = region.id\n", + "\"\"\"\n", + "\n", + "source = Source(readers=readers, query=query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "spark_client = SparkClient()\n", + "source_df = source.construct(spark_client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And, finally, it's possible to see the results from building our souce dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areabathroomsbedroomsidregion_idrenttimestampcityregionlatlngregion_name
050111113001588302000000CeruleanKanto73.4448931.75030Kanto
150111120001588647600000CeruleanKanto73.4448931.75030Kanto
2100122215001588734000000VeridianKanto-9.43510-167.11772Kanto
3100122225001589252400000VeridianKanto-9.43510-167.11772Kanto
4150223330001589943600000CinnabarKanto29.73043117.66164Kanto
5175224432001589943600000PalletKanto-52.95717-81.15251Kanto
6250335532001590030000000VioletJohto-47.35798-178.77255Johto
7225326632001590116400000OlivineJohto51.7282046.21958Johto
\n", + "
" + ], + "text/plain": [ + " area bathrooms bedrooms id region_id rent timestamp city \\\n", + "0 50 1 1 1 1 1300 1588302000000 Cerulean \n", + "1 50 1 1 1 1 2000 1588647600000 Cerulean \n", + "2 100 1 2 2 2 1500 1588734000000 Veridian \n", + "3 100 1 2 2 2 2500 1589252400000 Veridian \n", + "4 150 2 2 3 3 3000 1589943600000 Cinnabar \n", + "5 175 2 2 4 4 3200 1589943600000 Pallet \n", + "6 250 3 3 5 5 3200 1590030000000 Violet \n", + "7 225 3 2 6 6 3200 1590116400000 Olivine \n", + "\n", + " region lat lng region_name \n", + "0 Kanto 73.44489 31.75030 Kanto \n", + "1 Kanto 73.44489 31.75030 Kanto \n", + "2 Kanto -9.43510 -167.11772 Kanto \n", + "3 Kanto -9.43510 -167.11772 Kanto \n", + "4 Kanto 29.73043 117.66164 Kanto \n", + "5 Kanto -52.95717 -81.15251 Kanto \n", + "6 Johto -47.35798 -178.77255 Johto \n", + "7 Johto 51.72820 46.21958 Johto " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_df.toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform\n", + "- At the transform part, a set of `Feature` objects is declared;\n", + "- An Instance of `AggregatedFeatureSet` is used to hold the features;\n", + "- An `AggregatedFeatureSet` can only be created when it is possible to define a unique tuple formed by key columns and a time reference. This is an **architectural requirement** for the data. So least one `KeyFeature` and one `TimestampFeature` is needed;\n", + "- Every `Feature` needs a unique name, a description, and a data-type definition. Besides, in the case of the `AggregatedFeatureSet`, it's also mandatory to have an `AggregatedTransform` operator;\n", + "- An `AggregatedTransform` operator is used, as the name suggests, to define aggregation functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "from butterfree.transform.aggregated_feature_set import AggregatedFeatureSet\n", + "from butterfree.transform.features import Feature, KeyFeature, TimestampFeature\n", + "from butterfree.transform.transformations import AggregatedTransform\n", + "from butterfree.constants import DataType\n", + "from butterfree.transform.utils import Function\n", + "\n", + "keys = [\n", + " KeyFeature(\n", + " name=\"id\",\n", + " description=\"Unique identificator code for houses.\",\n", + " dtype=DataType.BIGINT,\n", + " )\n", + "]\n", + "\n", + "# from_ms = True because the data originally is not in a Timestamp format.\n", + "ts_feature = TimestampFeature(from_ms=True)\n", + "\n", + "features = [\n", + " Feature(\n", + " name=\"rent\",\n", + " description=\"Rent value by month described in the listing.\",\n", + " transformation=AggregatedTransform(\n", + " functions=[\n", + " Function(F.avg, DataType.DOUBLE),\n", + " Function(F.stddev_pop, DataType.DOUBLE),\n", + " ],\n", + " filter_expression=\"region_name = 'Kanto'\",\n", + " ),\n", + " )\n", + "]\n", + "\n", + "aggregated_feature_set = AggregatedFeatureSet(\n", + " name=\"house_listings\",\n", + " entity=\"house\", # entity: to which \"business context\" this feature set belongs\n", + " description=\"Features describring a house listing.\",\n", + " keys=keys,\n", + " timestamp=ts_feature,\n", + " features=features,\n", + ").with_windows(definitions=[\"1 day\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we'll define out first aggregated feature set, with just an `end date` parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "aggregated_feature_set_windows_df = aggregated_feature_set.construct(\n", + " source_df, \n", + " spark_client, \n", + " end_date=\"2020-05-30\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting dataset is:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-01NaNNaN
112020-05-021300.00.0
212020-05-03NaNNaN
312020-05-062000.00.0
412020-05-07NaNNaN
522020-05-01NaNNaN
622020-05-071500.00.0
722020-05-08NaNNaN
822020-05-132500.00.0
922020-05-14NaNNaN
1032020-05-01NaNNaN
1132020-05-213000.00.0
1232020-05-22NaNNaN
1342020-05-01NaNNaN
1442020-05-213200.00.0
1542020-05-22NaNNaN
1652020-05-01NaNNaN
1762020-05-01NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-01 NaN \n", + "1 1 2020-05-02 1300.0 \n", + "2 1 2020-05-03 NaN \n", + "3 1 2020-05-06 2000.0 \n", + "4 1 2020-05-07 NaN \n", + "5 2 2020-05-01 NaN \n", + "6 2 2020-05-07 1500.0 \n", + "7 2 2020-05-08 NaN \n", + "8 2 2020-05-13 2500.0 \n", + "9 2 2020-05-14 NaN \n", + "10 3 2020-05-01 NaN \n", + "11 3 2020-05-21 3000.0 \n", + "12 3 2020-05-22 NaN \n", + "13 4 2020-05-01 NaN \n", + "14 4 2020-05-21 3200.0 \n", + "15 4 2020-05-22 NaN \n", + "16 5 2020-05-01 NaN \n", + "17 6 2020-05-01 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 0.0 \n", + "2 NaN \n", + "3 0.0 \n", + "4 NaN \n", + "5 NaN \n", + "6 0.0 \n", + "7 NaN \n", + "8 0.0 \n", + "9 NaN \n", + "10 NaN \n", + "11 0.0 \n", + "12 NaN \n", + "13 NaN \n", + "14 0.0 \n", + "15 NaN \n", + "16 NaN \n", + "17 NaN " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggregated_feature_set_windows_df.orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's possible to see that if we use both a `start date` and `end_date` values. Then we'll achieve a time slice of the last dataframe, as it's possible to see:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-062000.00.0
112020-05-07NaNNaN
222020-05-06NaNNaN
322020-05-071500.00.0
422020-05-08NaNNaN
522020-05-132500.00.0
622020-05-14NaNNaN
732020-05-06NaNNaN
832020-05-213000.00.0
942020-05-06NaNNaN
1042020-05-213200.00.0
1152020-05-06NaNNaN
1262020-05-06NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-06 2000.0 \n", + "1 1 2020-05-07 NaN \n", + "2 2 2020-05-06 NaN \n", + "3 2 2020-05-07 1500.0 \n", + "4 2 2020-05-08 NaN \n", + "5 2 2020-05-13 2500.0 \n", + "6 2 2020-05-14 NaN \n", + "7 3 2020-05-06 NaN \n", + "8 3 2020-05-21 3000.0 \n", + "9 4 2020-05-06 NaN \n", + "10 4 2020-05-21 3200.0 \n", + "11 5 2020-05-06 NaN \n", + "12 6 2020-05-06 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 0.0 \n", + "1 NaN \n", + "2 NaN \n", + "3 0.0 \n", + "4 NaN \n", + "5 0.0 \n", + "6 NaN \n", + "7 NaN \n", + "8 0.0 \n", + "9 NaN \n", + "10 0.0 \n", + "11 NaN \n", + "12 NaN " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggregated_feature_set.construct(\n", + " source_df, \n", + " spark_client, \n", + " end_date=\"2020-05-21\",\n", + " start_date=\"2020-05-06\",\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load\n", + "\n", + "- For the load part we need `Writer` instances and a `Sink`;\n", + "- `writers` define where to load the data;\n", + "- The `Sink` gets the transformed data (feature set) and trigger the load to all the defined `writers`;\n", + "- `debug_mode` will create a temporary view instead of trying to write in a real data store." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from butterfree.load.writers import (\n", + " HistoricalFeatureStoreWriter,\n", + " OnlineFeatureStoreWriter,\n", + ")\n", + "from butterfree.load import Sink\n", + "\n", + "writers = [HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True), \n", + " OnlineFeatureStoreWriter(debug_mode=True, interval_mode=True)]\n", + "sink = Sink(writers=writers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pipeline\n", + "\n", + "- The `Pipeline` entity wraps all the other defined elements.\n", + "- `run` command will trigger the execution of the pipeline, end-to-end." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from butterfree.pipelines import FeatureSetPipeline\n", + "\n", + "pipeline = FeatureSetPipeline(source=source, feature_set=aggregated_feature_set, sink=sink)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first run will use just an `end_date` as parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "result_df = pipeline.run(end_date=\"2020-05-30\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windowsyearmonthday
012020-05-01NaNNaN202051
112020-05-021300.00.0202052
212020-05-03NaNNaN202053
312020-05-062000.00.0202056
412020-05-07NaNNaN202057
522020-05-01NaNNaN202051
622020-05-071500.00.0202057
722020-05-08NaNNaN202058
822020-05-132500.00.02020513
922020-05-14NaNNaN2020514
1032020-05-01NaNNaN202051
1132020-05-213000.00.02020521
1232020-05-22NaNNaN2020522
1342020-05-01NaNNaN202051
1442020-05-213200.00.02020521
1542020-05-22NaNNaN2020522
1652020-05-01NaNNaN202051
1762020-05-01NaNNaN202051
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-01 NaN \n", + "1 1 2020-05-02 1300.0 \n", + "2 1 2020-05-03 NaN \n", + "3 1 2020-05-06 2000.0 \n", + "4 1 2020-05-07 NaN \n", + "5 2 2020-05-01 NaN \n", + "6 2 2020-05-07 1500.0 \n", + "7 2 2020-05-08 NaN \n", + "8 2 2020-05-13 2500.0 \n", + "9 2 2020-05-14 NaN \n", + "10 3 2020-05-01 NaN \n", + "11 3 2020-05-21 3000.0 \n", + "12 3 2020-05-22 NaN \n", + "13 4 2020-05-01 NaN \n", + "14 4 2020-05-21 3200.0 \n", + "15 4 2020-05-22 NaN \n", + "16 5 2020-05-01 NaN \n", + "17 6 2020-05-01 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows year month day \n", + "0 NaN 2020 5 1 \n", + "1 0.0 2020 5 2 \n", + "2 NaN 2020 5 3 \n", + "3 0.0 2020 5 6 \n", + "4 NaN 2020 5 7 \n", + "5 NaN 2020 5 1 \n", + "6 0.0 2020 5 7 \n", + "7 NaN 2020 5 8 \n", + "8 0.0 2020 5 13 \n", + "9 NaN 2020 5 14 \n", + "10 NaN 2020 5 1 \n", + "11 0.0 2020 5 21 \n", + "12 NaN 2020 5 22 \n", + "13 NaN 2020 5 1 \n", + "14 0.0 2020 5 21 \n", + "15 NaN 2020 5 22 \n", + "16 NaN 2020 5 1 \n", + "17 NaN 2020 5 1 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"historical_feature_store__house_listings\").orderBy(\n", + " \"id\", \"timestamp\"\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-07NaNNaN
122020-05-14NaNNaN
232020-05-22NaNNaN
342020-05-22NaNNaN
452020-05-01NaNNaN
562020-05-01NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-07 NaN \n", + "1 2 2020-05-14 NaN \n", + "2 3 2020-05-22 NaN \n", + "3 4 2020-05-22 NaN \n", + "4 5 2020-05-01 NaN \n", + "5 6 2020-05-01 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"online_feature_store__house_listings\").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We can see that we were able to create all the desired features in an easy way\n", + "- The **historical feature set** holds all the data, and we can see that it is partitioned by year, month and day (columns added in the `HistoricalFeatureStoreWriter`)\n", + "- In the **online feature set** there is only the latest data for each id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The second run, on the other hand, will use both a `start_date` and `end_date` as parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "result_df = pipeline.run(end_date=\"2020-05-21\", start_date=\"2020-05-06\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windowsyearmonthday
012020-05-062000.00.0202056
112020-05-07NaNNaN202057
222020-05-06NaNNaN202056
322020-05-071500.00.0202057
422020-05-08NaNNaN202058
522020-05-132500.00.02020513
622020-05-14NaNNaN2020514
732020-05-06NaNNaN202056
832020-05-213000.00.02020521
942020-05-06NaNNaN202056
1042020-05-213200.00.02020521
1152020-05-06NaNNaN202056
1262020-05-06NaNNaN202056
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-06 2000.0 \n", + "1 1 2020-05-07 NaN \n", + "2 2 2020-05-06 NaN \n", + "3 2 2020-05-07 1500.0 \n", + "4 2 2020-05-08 NaN \n", + "5 2 2020-05-13 2500.0 \n", + "6 2 2020-05-14 NaN \n", + "7 3 2020-05-06 NaN \n", + "8 3 2020-05-21 3000.0 \n", + "9 4 2020-05-06 NaN \n", + "10 4 2020-05-21 3200.0 \n", + "11 5 2020-05-06 NaN \n", + "12 6 2020-05-06 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows year month day \n", + "0 0.0 2020 5 6 \n", + "1 NaN 2020 5 7 \n", + "2 NaN 2020 5 6 \n", + "3 0.0 2020 5 7 \n", + "4 NaN 2020 5 8 \n", + "5 0.0 2020 5 13 \n", + "6 NaN 2020 5 14 \n", + "7 NaN 2020 5 6 \n", + "8 0.0 2020 5 21 \n", + "9 NaN 2020 5 6 \n", + "10 0.0 2020 5 21 \n", + "11 NaN 2020 5 6 \n", + "12 NaN 2020 5 6 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"historical_feature_store__house_listings\").orderBy(\n", + " \"id\", \"timestamp\"\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-07NaNNaN
122020-05-14NaNNaN
232020-05-213000.00.0
342020-05-213200.00.0
452020-05-06NaNNaN
562020-05-06NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-07 NaN \n", + "1 2 2020-05-14 NaN \n", + "2 3 2020-05-21 3000.0 \n", + "3 4 2020-05-21 3200.0 \n", + "4 5 2020-05-06 NaN \n", + "5 6 2020-05-06 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 NaN \n", + "2 0.0 \n", + "3 0.0 \n", + "4 NaN \n", + "5 NaN " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"online_feature_store__house_listings\").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, the third run, will use only an `execution_date` as a parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "result_df = pipeline.run_for_date(execution_date=\"2020-05-21\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windowsyearmonthday
012020-05-21NaNNaN2020521
122020-05-21NaNNaN2020521
232020-05-213000.00.02020521
342020-05-213200.00.02020521
452020-05-21NaNNaN2020521
562020-05-21NaNNaN2020521
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-21 NaN \n", + "1 2 2020-05-21 NaN \n", + "2 3 2020-05-21 3000.0 \n", + "3 4 2020-05-21 3200.0 \n", + "4 5 2020-05-21 NaN \n", + "5 6 2020-05-21 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows year month day \n", + "0 NaN 2020 5 21 \n", + "1 NaN 2020 5 21 \n", + "2 0.0 2020 5 21 \n", + "3 0.0 2020 5 21 \n", + "4 NaN 2020 5 21 \n", + "5 NaN 2020 5 21 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"historical_feature_store__house_listings\").orderBy(\n", + " \"id\", \"timestamp\"\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-21NaNNaN
122020-05-21NaNNaN
232020-05-213000.00.0
342020-05-213200.00.0
452020-05-21NaNNaN
562020-05-21NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-21 NaN \n", + "1 2 2020-05-21 NaN \n", + "2 3 2020-05-21 3000.0 \n", + "3 4 2020-05-21 3200.0 \n", + "4 5 2020-05-21 NaN \n", + "5 6 2020-05-21 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 NaN \n", + "2 0.0 \n", + "3 0.0 \n", + "4 NaN \n", + "5 NaN " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"online_feature_store__house_listings\").orderBy(\"id\", \"timestamp\").toPandas()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.py b/setup.py index bf471fec..4adcbce9 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.1.3.dev0" +__version__ = "1.2.0.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/integration/butterfree/load/test_sink.py b/tests/integration/butterfree/load/test_sink.py index d00f4806..f507a335 100644 --- a/tests/integration/butterfree/load/test_sink.py +++ b/tests/integration/butterfree/load/test_sink.py @@ -9,9 +9,10 @@ ) -def test_sink(input_dataframe, feature_set): +def test_sink(input_dataframe, feature_set, mocker): # arrange client = SparkClient() + client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") feature_set_df = feature_set.construct(input_dataframe, client) target_latest_df = OnlineFeatureStoreWriter.filter_latest( feature_set_df, id_columns=[key.name for key in feature_set.keys] @@ -20,14 +21,23 @@ def test_sink(input_dataframe, feature_set): # setup historical writer s3config = Mock() + s3config.mode = "overwrite" + s3config.format_ = "parquet" s3config.get_options = Mock( - return_value={ - "mode": "overwrite", - "format_": "parquet", - "path": "test_folder/historical/entity/feature_set", - } + return_value={"path": "test_folder/historical/entity/feature_set"} + ) + s3config.get_path_with_partitions = Mock( + return_value="test_folder/historical/entity/feature_set" + ) + + historical_writer = HistoricalFeatureStoreWriter( + db_config=s3config, interval_mode=True ) - historical_writer = HistoricalFeatureStoreWriter(db_config=s3config) + + schema_dataframe = historical_writer._create_partitions(feature_set_df) + historical_writer.check_schema_hook = mocker.stub("check_schema_hook") + historical_writer.check_schema_hook.run = mocker.stub("run") + historical_writer.check_schema_hook.run.return_value = schema_dataframe # setup online writer # TODO: Change for CassandraConfig when Cassandra for test is ready @@ -39,6 +49,10 @@ def test_sink(input_dataframe, feature_set): ) online_writer = OnlineFeatureStoreWriter(db_config=online_config) + online_writer.check_schema_hook = mocker.stub("check_schema_hook") + online_writer.check_schema_hook.run = mocker.stub("run") + online_writer.check_schema_hook.run.return_value = feature_set_df + writers = [historical_writer, online_writer] sink = Sink(writers) @@ -47,13 +61,14 @@ def test_sink(input_dataframe, feature_set): 
sink.flush(feature_set, feature_set_df, client) # get historical results - historical_result_df = client.read_table( - feature_set.name, historical_writer.database + historical_result_df = client.read( + s3config.format_, + path=s3config.get_path_with_partitions(feature_set.name, feature_set_df), ) # get online results online_result_df = client.read( - online_config.format_, options=online_config.get_options(feature_set.name) + online_config.format_, **online_config.get_options(feature_set.name) ) # assert diff --git a/tests/integration/butterfree/pipelines/conftest.py b/tests/integration/butterfree/pipelines/conftest.py index 79894176..73da163e 100644 --- a/tests/integration/butterfree/pipelines/conftest.py +++ b/tests/integration/butterfree/pipelines/conftest.py @@ -1,7 +1,19 @@ import pytest +from pyspark.sql import DataFrame +from pyspark.sql import functions as F from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load import Sink +from butterfree.load.writers import HistoricalFeatureStoreWriter +from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from butterfree.transform.transformations import SparkFunctionTransform +from butterfree.transform.utils import Function @pytest.fixture() @@ -74,3 +86,193 @@ def fixed_windows_output_feature_set_dataframe(spark_context, spark_session): df = df.withColumn(TIMESTAMP_COLUMN, df.timestamp.cast(DataType.TIMESTAMP.spark)) return df + + +@pytest.fixture() +def mocked_date_df(spark_context, spark_session): + data = [ + {"id": 1, "ts": "2016-04-11 11:31:11", "feature": 200}, + {"id": 1, "ts": "2016-04-12 11:44:12", "feature": 300}, + {"id": 1, "ts": "2016-04-13 11:46:24", "feature": 400}, + {"id": 1, "ts": "2016-04-14 12:03:21", "feature": 500}, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) + + return df + + +@pytest.fixture() +def fixed_windows_output_feature_set_date_dataframe(spark_context, spark_session): + data = [ + { + "id": 1, + "timestamp": "2016-04-12 11:44:12", + "feature__avg_over_1_day_fixed_windows": 300, + "feature__stddev_pop_over_1_day_fixed_windows": 0, + "year": 2016, + "month": 4, + "day": 12, + }, + { + "id": 1, + "timestamp": "2016-04-13 11:46:24", + "feature__avg_over_1_day_fixed_windows": 400, + "feature__stddev_pop_over_1_day_fixed_windows": 0, + "year": 2016, + "month": 4, + "day": 13, + }, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn(TIMESTAMP_COLUMN, df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + +@pytest.fixture() +def feature_set_pipeline( + spark_context, spark_session, +): + + feature_set_pipeline = FeatureSetPipeline( + source=Source( + readers=[ + TableReader(id="b_source", table="b_table",).with_incremental_strategy( + incremental_strategy=IncrementalStrategy(column="timestamp") + ), + ], + query=f"select * from b_source ", # noqa + ), + feature_set=FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=SparkFunctionTransform( + functions=[ + Function(F.avg, 
DataType.FLOAT), + Function(F.stddev_pop, DataType.FLOAT), + ], + ).with_window( + partition_by="id", + order_by=TIMESTAMP_COLUMN, + mode="fixed_windows", + window_definition=["1 day"], + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ), + sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]), + ) + + return feature_set_pipeline + + +@pytest.fixture() +def pipeline_interval_run_target_dfs( + spark_session, spark_context +) -> (DataFrame, DataFrame, DataFrame): + first_data = [ + { + "id": 1, + "timestamp": "2016-04-11 11:31:11", + "feature": 200, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 11, + }, + { + "id": 1, + "timestamp": "2016-04-12 11:44:12", + "feature": 300, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 12, + }, + { + "id": 1, + "timestamp": "2016-04-13 11:46:24", + "feature": 400, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 13, + }, + ] + + second_data = first_data + [ + { + "id": 1, + "timestamp": "2016-04-14 12:03:21", + "feature": 500, + "run_id": 2, + "year": 2016, + "month": 4, + "day": 14, + }, + ] + + third_data = [ + { + "id": 1, + "timestamp": "2016-04-11 11:31:11", + "feature": 200, + "run_id": 3, + "year": 2016, + "month": 4, + "day": 11, + }, + { + "id": 1, + "timestamp": "2016-04-12 11:44:12", + "feature": 300, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 12, + }, + { + "id": 1, + "timestamp": "2016-04-13 11:46:24", + "feature": 400, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 13, + }, + { + "id": 1, + "timestamp": "2016-04-14 12:03:21", + "feature": 500, + "run_id": 2, + "year": 2016, + "month": 4, + "day": 14, + }, + ] + + first_run_df = spark_session.read.json( + spark_context.parallelize(first_data, 1) + ).withColumn("timestamp", F.col("timestamp").cast(DataType.TIMESTAMP.spark)) + second_run_df = spark_session.read.json( + spark_context.parallelize(second_data, 1) + ).withColumn("timestamp", F.col("timestamp").cast(DataType.TIMESTAMP.spark)) + third_run_df = spark_session.read.json( + spark_context.parallelize(third_data, 1) + ).withColumn("timestamp", F.col("timestamp").cast(DataType.TIMESTAMP.spark)) + + return first_run_df, second_run_df, third_run_df diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index 23d200c1..a302dc9e 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -4,21 +4,48 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as F +from butterfree.clients import SparkClient from butterfree.configs import environment +from butterfree.configs.db import MetastoreConfig from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy from butterfree.extract import Source from butterfree.extract.readers import TableReader +from butterfree.hooks import Hook from butterfree.load import Sink from butterfree.load.writers import HistoricalFeatureStoreWriter from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline from butterfree.testing.dataframe import assert_dataframe_equality from butterfree.transform import FeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature -from 
butterfree.transform.transformations import CustomTransform, SparkFunctionTransform +from butterfree.transform.transformations import ( + CustomTransform, + SparkFunctionTransform, + SQLExpressionTransform, +) from butterfree.transform.utils import Function +class AddHook(Hook): + def __init__(self, value): + self.value = value + + def run(self, dataframe): + return dataframe.withColumn("feature", F.expr(f"feature + {self.value}")) + + +class RunHook(Hook): + def __init__(self, id): + self.id = id + + def run(self, dataframe): + return dataframe.withColumn( + "run_id", + F.when(F.lit(self.id).isNotNull(), F.lit(self.id)).otherwise(F.lit(None)), + ) + + def create_temp_view(dataframe: DataFrame, name): dataframe.createOrReplaceTempView(name) @@ -38,9 +65,21 @@ def divide(df, fs, column1, column2): return df +def create_ymd(dataframe): + return ( + dataframe.withColumn("year", F.year(F.col("timestamp"))) + .withColumn("month", F.month(F.col("timestamp"))) + .withColumn("day", F.dayofmonth(F.col("timestamp"))) + ) + + class TestFeatureSetPipeline: def test_feature_set_pipeline( - self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe + self, + mocked_df, + spark_session, + fixed_windows_output_feature_set_dataframe, + mocker, ): # arrange table_reader_id = "a_source" @@ -53,13 +92,25 @@ def test_feature_set_pipeline( table_reader_db=table_reader_db, table_reader_table=table_reader_table, ) + + spark_client = SparkClient() + spark_client.conn.conf.set( + "spark.sql.sources.partitionOverwriteMode", "dynamic" + ) + dbconfig = Mock() + dbconfig.mode = "overwrite" + dbconfig.format_ = "parquet" dbconfig.get_options = Mock( - return_value={ - "mode": "overwrite", - "format_": "parquet", - "path": "test_folder/historical/entity/feature_set", - } + return_value={"path": "test_folder/historical/entity/feature_set"} + ) + + historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig) + + historical_writer.check_schema_hook = mocker.stub("check_schema_hook") + historical_writer.check_schema_hook.run = mocker.stub("run") + historical_writer.check_schema_hook.run.return_value = ( + fixed_windows_output_feature_set_dataframe ) # act @@ -112,7 +163,7 @@ def test_feature_set_pipeline( ], timestamp=TimestampFeature(), ), - sink=Sink(writers=[HistoricalFeatureStoreWriter(db_config=dbconfig)],), + sink=Sink(writers=[historical_writer]), ) test_pipeline.run() @@ -129,3 +180,247 @@ def test_feature_set_pipeline( # tear down shutil.rmtree("test_folder") + + def test_feature_set_pipeline_with_dates( + self, + mocked_date_df, + spark_session, + fixed_windows_output_feature_set_date_dataframe, + feature_set_pipeline, + mocker, + ): + # arrange + table_reader_table = "b_table" + create_temp_view(dataframe=mocked_date_df, name=table_reader_table) + + historical_writer = HistoricalFeatureStoreWriter(debug_mode=True) + + feature_set_pipeline.sink.writers = [historical_writer] + + # act + feature_set_pipeline.run(start_date="2016-04-12", end_date="2016-04-13") + + df = spark_session.sql("select * from historical_feature_store__feature_set") + + # assert + assert_dataframe_equality(df, fixed_windows_output_feature_set_date_dataframe) + + def test_feature_set_pipeline_with_execution_date( + self, + mocked_date_df, + spark_session, + fixed_windows_output_feature_set_date_dataframe, + feature_set_pipeline, + mocker, + ): + # arrange + table_reader_table = "b_table" + create_temp_view(dataframe=mocked_date_df, name=table_reader_table) + + target_df = 
fixed_windows_output_feature_set_date_dataframe.filter( + "timestamp < '2016-04-13'" + ) + + historical_writer = HistoricalFeatureStoreWriter(debug_mode=True) + + feature_set_pipeline.sink.writers = [historical_writer] + + # act + feature_set_pipeline.run_for_date(execution_date="2016-04-12") + + df = spark_session.sql("select * from historical_feature_store__feature_set") + + # assert + assert_dataframe_equality(df, target_df) + + def test_pipeline_with_hooks(self, spark_session, mocker): + # arrange + hook1 = AddHook(value=1) + + spark_session.sql( + "select 1 as id, timestamp('2020-01-01') as timestamp, 0 as feature" + ).createOrReplaceTempView("test") + + target_df = spark_session.sql( + "select 1 as id, timestamp('2020-01-01') as timestamp, 6 as feature, 2020 " + "as year, 1 as month, 1 as day" + ) + + historical_writer = HistoricalFeatureStoreWriter(debug_mode=True) + + test_pipeline = FeatureSetPipeline( + source=Source( + readers=[TableReader(id="reader", table="test",).add_post_hook(hook1)], + query="select * from reader", + ).add_post_hook(hook1), + feature_set=FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=SQLExpressionTransform(expression="feature + 1"), + dtype=DataType.INTEGER, + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ) + .add_pre_hook(hook1) + .add_post_hook(hook1), + sink=Sink(writers=[historical_writer],).add_pre_hook(hook1), + ) + + # act + test_pipeline.run() + output_df = spark_session.table("historical_feature_store__feature_set") + + # assert + output_df.show() + assert_dataframe_equality(output_df, target_df) + + def test_pipeline_interval_run( + self, mocked_date_df, pipeline_interval_run_target_dfs, spark_session + ): + """Testing pipeline's idempotent interval run feature. + Source data: + +-------+---+-------------------+-------------------+ + |feature| id| ts| timestamp| + +-------+---+-------------------+-------------------+ + | 200| 1|2016-04-11 11:31:11|2016-04-11 11:31:11| + | 300| 1|2016-04-12 11:44:12|2016-04-12 11:44:12| + | 400| 1|2016-04-13 11:46:24|2016-04-13 11:46:24| + | 500| 1|2016-04-14 12:03:21|2016-04-14 12:03:21| + +-------+---+-------------------+-------------------+ + The test executes 3 runs for different time intervals. The input data has 4 data + points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The following run + specifications are: + 1) Interval: from 2016-04-11 to 2016-04-13 + Target table result: + +---+-------+---+-----+------+-------------------+----+ + |day|feature| id|month|run_id| timestamp|year| + +---+-------+---+-----+------+-------------------+----+ + | 11| 200| 1| 4| 1|2016-04-11 11:31:11|2016| + | 12| 300| 1| 4| 1|2016-04-12 11:44:12|2016| + | 13| 400| 1| 4| 1|2016-04-13 11:46:24|2016| + +---+-------+---+-----+------+-------------------+----+ + 2) Interval: only 2016-04-14. + Target table result: + +---+-------+---+-----+------+-------------------+----+ + |day|feature| id|month|run_id| timestamp|year| + +---+-------+---+-----+------+-------------------+----+ + | 11| 200| 1| 4| 1|2016-04-11 11:31:11|2016| + | 12| 300| 1| 4| 1|2016-04-12 11:44:12|2016| + | 13| 400| 1| 4| 1|2016-04-13 11:46:24|2016| + | 14| 500| 1| 4| 2|2016-04-14 12:03:21|2016| + +---+-------+---+-----+------+-------------------+----+ + 3) Interval: only 2016-04-11. 
+ Target table result: + +---+-------+---+-----+------+-------------------+----+ + |day|feature| id|month|run_id| timestamp|year| + +---+-------+---+-----+------+-------------------+----+ + | 11| 200| 1| 4| 3|2016-04-11 11:31:11|2016| + | 12| 300| 1| 4| 1|2016-04-12 11:44:12|2016| + | 13| 400| 1| 4| 1|2016-04-13 11:46:24|2016| + | 14| 500| 1| 4| 2|2016-04-14 12:03:21|2016| + +---+-------+---+-----+------+-------------------+----+ + """ + # arrange + create_temp_view(dataframe=mocked_date_df, name="input_data") + + db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE") + path = "test_folder/historical/entity/feature_set" + + spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") + spark_session.sql(f"create database if not exists {db}") + spark_session.sql( + f"create table if not exists {db}.feature_set_interval " + f"(id int, timestamp timestamp, feature int, " + f"run_id int, year int, month int, day int);" + ) + + dbconfig = MetastoreConfig() + dbconfig.get_options = Mock( + return_value={"mode": "overwrite", "format_": "parquet", "path": path} + ) + + historical_writer = HistoricalFeatureStoreWriter( + db_config=dbconfig, interval_mode=True + ) + + first_run_hook = RunHook(id=1) + second_run_hook = RunHook(id=2) + third_run_hook = RunHook(id=3) + + ( + first_run_target_df, + second_run_target_df, + third_run_target_df, + ) = pipeline_interval_run_target_dfs + + test_pipeline = FeatureSetPipeline( + source=Source( + readers=[ + TableReader(id="id", table="input_data",).with_incremental_strategy( + IncrementalStrategy("ts") + ), + ], + query="select * from id ", + ), + feature_set=FeatureSet( + name="feature_set_interval", + entity="entity", + description="", + keys=[KeyFeature(name="id", description="", dtype=DataType.INTEGER,)], + timestamp=TimestampFeature(from_column="ts"), + features=[ + Feature(name="feature", description="", dtype=DataType.INTEGER), + Feature(name="run_id", description="", dtype=DataType.INTEGER), + ], + ), + sink=Sink([historical_writer],), + ) + + # act and assert + dbconfig.get_path_with_partitions = Mock( + return_value=[ + "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", + "test_folder/historical/entity/feature_set/year=2016/month=4/day=12", + "test_folder/historical/entity/feature_set/year=2016/month=4/day=13", + ] + ) + test_pipeline.feature_set.add_pre_hook(first_run_hook) + test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11") + first_run_output_df = spark_session.read.parquet(path) + assert_dataframe_equality(first_run_output_df, first_run_target_df) + + dbconfig.get_path_with_partitions = Mock( + return_value=[ + "test_folder/historical/entity/feature_set/year=2016/month=4/day=14", + ] + ) + test_pipeline.feature_set.add_pre_hook(second_run_hook) + test_pipeline.run_for_date("2016-04-14") + second_run_output_df = spark_session.read.parquet(path) + assert_dataframe_equality(second_run_output_df, second_run_target_df) + + dbconfig.get_path_with_partitions = Mock( + return_value=[ + "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", + ] + ) + test_pipeline.feature_set.add_pre_hook(third_run_hook) + test_pipeline.run_for_date("2016-04-11") + third_run_output_df = spark_session.read.parquet(path) + assert_dataframe_equality(third_run_output_df, third_run_target_df) + + # tear down + shutil.rmtree("test_folder") diff --git a/tests/integration/butterfree/transform/conftest.py b/tests/integration/butterfree/transform/conftest.py index 6621c9a3..fe0cc572 100644 --- 
a/tests/integration/butterfree/transform/conftest.py +++ b/tests/integration/butterfree/transform/conftest.py @@ -395,3 +395,58 @@ def rolling_windows_output_feature_set_dataframe_base_date( df = df.withColumn(TIMESTAMP_COLUMN, df.origin_ts.cast(DataType.TIMESTAMP.spark)) return df + + +@fixture +def feature_set_dates_dataframe(spark_context, spark_session): + data = [ + {"id": 1, "ts": "2016-04-11 11:31:11", "feature": 200}, + {"id": 1, "ts": "2016-04-12 11:44:12", "feature": 300}, + {"id": 1, "ts": "2016-04-13 11:46:24", "feature": 400}, + {"id": 1, "ts": "2016-04-14 12:03:21", "feature": 500}, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) + df = df.withColumn("ts", df.ts.cast(DataType.TIMESTAMP.spark)) + + return df + + +@fixture +def feature_set_dates_output_dataframe(spark_context, spark_session): + data = [ + {"id": 1, "timestamp": "2016-04-11 11:31:11", "feature": 200}, + {"id": 1, "timestamp": "2016-04-12 11:44:12", "feature": 300}, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + +@fixture +def rolling_windows_output_date_boundaries(spark_context, spark_session): + data = [ + { + "id": 1, + "ts": "2016-04-11 00:00:00", + "feature__avg_over_1_day_rolling_windows": None, + "feature__avg_over_1_week_rolling_windows": None, + "feature__stddev_pop_over_1_day_rolling_windows": None, + "feature__stddev_pop_over_1_week_rolling_windows": None, + }, + { + "id": 1, + "ts": "2016-04-12 00:00:00", + "feature__avg_over_1_day_rolling_windows": 200.0, + "feature__avg_over_1_week_rolling_windows": 200.0, + "feature__stddev_pop_over_1_day_rolling_windows": 0.0, + "feature__stddev_pop_over_1_week_rolling_windows": 0.0, + }, + ] + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) + + return df diff --git a/tests/integration/butterfree/transform/test_aggregated_feature_set.py b/tests/integration/butterfree/transform/test_aggregated_feature_set.py index 559dbcb8..bc3ebb6c 100644 --- a/tests/integration/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/integration/butterfree/transform/test_aggregated_feature_set.py @@ -241,3 +241,53 @@ def test_construct_with_pivot( # assert assert_dataframe_equality(output_df, target_df_pivot_agg) + + def test_construct_rolling_windows_with_date_boundaries( + self, feature_set_dates_dataframe, rolling_windows_output_date_boundaries, + ): + # given + + spark_client = SparkClient() + + # arrange + + feature_set = AggregatedFeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=AggregatedTransform( + functions=[ + Function(F.avg, DataType.DOUBLE), + Function(F.stddev_pop, DataType.DOUBLE), + ], + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ).with_windows(definitions=["1 day", "1 week"]) + + # act + output_df = feature_set.construct( + feature_set_dates_dataframe, + client=spark_client, + start_date="2016-04-11", + end_date="2016-04-12", + ).orderBy("timestamp") + + target_df = rolling_windows_output_date_boundaries.orderBy( + feature_set.timestamp_column + ).select(feature_set.columns) + + # 
assert + assert_dataframe_equality(output_df, target_df) diff --git a/tests/integration/butterfree/transform/test_feature_set.py b/tests/integration/butterfree/transform/test_feature_set.py index 4872ded2..25f70b6e 100644 --- a/tests/integration/butterfree/transform/test_feature_set.py +++ b/tests/integration/butterfree/transform/test_feature_set.py @@ -77,3 +77,47 @@ def test_construct( # assert assert_dataframe_equality(output_df, target_df) + + def test_construct_with_date_boundaries( + self, feature_set_dates_dataframe, feature_set_dates_output_dataframe + ): + # given + + spark_client = SparkClient() + + # arrange + + feature_set = FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature(name="feature", description="test", dtype=DataType.FLOAT,), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ) + + output_df = ( + feature_set.construct( + feature_set_dates_dataframe, + client=spark_client, + start_date="2016-04-11", + end_date="2016-04-12", + ) + .orderBy(feature_set.timestamp_column) + .select(feature_set.columns) + ) + + target_df = feature_set_dates_output_dataframe.orderBy( + feature_set.timestamp_column + ).select(feature_set.columns) + + # assert + assert_dataframe_equality(output_df, target_df) diff --git a/tests/unit/butterfree/clients/conftest.py b/tests/unit/butterfree/clients/conftest.py index fda11f8e..ffb2db88 100644 --- a/tests/unit/butterfree/clients/conftest.py +++ b/tests/unit/butterfree/clients/conftest.py @@ -46,11 +46,16 @@ def mocked_stream_df() -> Mock: return mock +@pytest.fixture() +def mock_spark_sql() -> Mock: + mock = Mock() + mock.sql = mock + return mock + + @pytest.fixture def cassandra_client() -> CassandraClient: - return CassandraClient( - cassandra_host=["mock"], cassandra_key_space="dummy_keyspace" - ) + return CassandraClient(host=["mock"], keyspace="dummy_keyspace") @pytest.fixture diff --git a/tests/unit/butterfree/clients/test_cassandra_client.py b/tests/unit/butterfree/clients/test_cassandra_client.py index 8785485b..aa52e6f8 100644 --- a/tests/unit/butterfree/clients/test_cassandra_client.py +++ b/tests/unit/butterfree/clients/test_cassandra_client.py @@ -15,9 +15,7 @@ def sanitize_string(query: str) -> str: class TestCassandraClient: def test_conn(self, cassandra_client: CassandraClient) -> None: # arrange - cassandra_client = CassandraClient( - cassandra_host=["mock"], cassandra_key_space="dummy_keyspace" - ) + cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") # act start_conn = cassandra_client._session diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index 58d53a40..9f641506 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Optional, Union +from datetime import datetime +from typing import Any, Optional, Union from unittest.mock import Mock import pytest @@ -26,19 +27,20 @@ def test_conn(self) -> None: assert start_conn is None @pytest.mark.parametrize( - "format, options, stream, schema", + "format, path, stream, schema, options", [ - ("parquet", {"path": "path/to/file"}, False, None), - ("csv", {"path": "path/to/file", "header": True}, False, None), - ("json", {"path": "path/to/file"}, True, None), + ("parquet", ["path/to/file"], False, None, {}), + ("csv", "path/to/file", 
False, None, {"header": True}), + ("json", "path/to/file", True, None, {}), ], ) def test_read( self, format: str, - options: Dict[str, Any], stream: bool, schema: Optional[StructType], + path: Any, + options: Any, target_df: DataFrame, mocked_spark_read: Mock, ) -> None: @@ -48,26 +50,25 @@ def test_read( spark_client._session = mocked_spark_read # act - result_df = spark_client.read(format, options, schema, stream) + result_df = spark_client.read( + format=format, schema=schema, stream=stream, path=path, **options + ) # assert mocked_spark_read.format.assert_called_once_with(format) - mocked_spark_read.options.assert_called_once_with(**options) + mocked_spark_read.load.assert_called_once_with(path, **options) assert target_df.collect() == result_df.collect() @pytest.mark.parametrize( - "format, options", - [(None, {"path": "path/to/file"}), ("csv", "not a valid options")], + "format, path", [(None, "path/to/file"), ("csv", 123)], ) - def test_read_invalid_params( - self, format: Optional[str], options: Union[Dict[str, Any], str] - ) -> None: + def test_read_invalid_params(self, format: Optional[str], path: Any) -> None: # arrange spark_client = SparkClient() # act and assert with pytest.raises(ValueError): - spark_client.read(format, options) # type: ignore + spark_client.read(format=format, path=path) # type: ignore def test_sql(self, target_df: DataFrame) -> None: # arrange @@ -252,3 +253,43 @@ def test_create_temporary_view( # assert assert_dataframe_equality(target_df, result_df) + + def test_add_table_partitions(self, mock_spark_sql: Mock): + # arrange + target_command = ( + f"ALTER TABLE `db`.`table` ADD IF NOT EXISTS " + f"PARTITION ( year = 2020, month = 8, day = 14 ) " + f"PARTITION ( year = 2020, month = 8, day = 15 ) " + f"PARTITION ( year = 2020, month = 8, day = 16 )" + ) + + spark_client = SparkClient() + spark_client._session = mock_spark_sql + partitions = [ + {"year": 2020, "month": 8, "day": 14}, + {"year": 2020, "month": 8, "day": 15}, + {"year": 2020, "month": 8, "day": 16}, + ] + + # act + spark_client.add_table_partitions(partitions, "table", "db") + + # assert + mock_spark_sql.assert_called_once_with(target_command) + + @pytest.mark.parametrize( + "partition", + [ + [{"float_partition": 2.72}], + [{123: 2020}], + [{"date": datetime(year=2020, month=8, day=18)}], + ], + ) + def test_add_invalid_partitions(self, mock_spark_sql: Mock, partition): + # arrange + spark_client = SparkClient() + spark_client._session = mock_spark_sql + + # act and assert + with pytest.raises(ValueError): + spark_client.add_table_partitions(partition, "table", "db") diff --git a/tests/unit/butterfree/dataframe_service/conftest.py b/tests/unit/butterfree/dataframe_service/conftest.py index 867bc80a..09470c9a 100644 --- a/tests/unit/butterfree/dataframe_service/conftest.py +++ b/tests/unit/butterfree/dataframe_service/conftest.py @@ -25,3 +25,17 @@ def input_df(spark_context, spark_session): return spark_session.read.json( spark_context.parallelize(data, 1), schema="timestamp timestamp" ) + + +@pytest.fixture() +def test_partitioning_input_df(spark_context, spark_session): + data = [ + {"feature": 1, "year": 2009, "month": 8, "day": 20}, + {"feature": 2, "year": 2009, "month": 8, "day": 20}, + {"feature": 3, "year": 2020, "month": 8, "day": 20}, + {"feature": 4, "year": 2020, "month": 9, "day": 20}, + {"feature": 5, "year": 2020, "month": 9, "day": 20}, + {"feature": 6, "year": 2020, "month": 8, "day": 20}, + {"feature": 7, "year": 2020, "month": 8, "day": 21}, + ] + return 
spark_session.read.json(spark_context.parallelize(data, 1)) diff --git a/tests/unit/butterfree/dataframe_service/test_incremental_srategy.py b/tests/unit/butterfree/dataframe_service/test_incremental_srategy.py new file mode 100644 index 00000000..a140ceb3 --- /dev/null +++ b/tests/unit/butterfree/dataframe_service/test_incremental_srategy.py @@ -0,0 +1,70 @@ +from butterfree.dataframe_service import IncrementalStrategy + + +class TestIncrementalStrategy: + def test_from_milliseconds(self): + # arrange + incremental_strategy = IncrementalStrategy().from_milliseconds("ts") + target_expression = "date(from_unixtime(ts/ 1000.0)) >= date('2020-01-01')" + + # act + result_expression = incremental_strategy.get_expression(start_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_from_string(self): + # arrange + incremental_strategy = IncrementalStrategy().from_string( + "dt", mask="dd/MM/yyyy" + ) + target_expression = "date(to_date(dt, 'dd/MM/yyyy')) >= date('2020-01-01')" + + # act + result_expression = incremental_strategy.get_expression(start_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_from_year_month_day_partitions(self): + # arrange + incremental_strategy = IncrementalStrategy().from_year_month_day_partitions( + year_column="y", month_column="m", day_column="d" + ) + target_expression = ( + "date(concat(string(y), " + "'-', string(m), " + "'-', string(d))) >= date('2020-01-01')" + ) + + # act + result_expression = incremental_strategy.get_expression(start_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_get_expression_with_just_end_date(self): + # arrange + incremental_strategy = IncrementalStrategy(column="dt") + target_expression = "date(dt) <= date('2020-01-01')" + + # act + result_expression = incremental_strategy.get_expression(end_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_get_expression_with_start_and_end_date(self): + # arrange + incremental_strategy = IncrementalStrategy(column="dt") + target_expression = ( + "date(dt) >= date('2019-12-30') and date(dt) <= date('2020-01-01')" + ) + + # act + result_expression = incremental_strategy.get_expression( + start_date="2019-12-30", end_date="2020-01-01" + ) + + # assert + assert target_expression.split() == result_expression.split() diff --git a/tests/unit/butterfree/dataframe_service/test_partitioning.py b/tests/unit/butterfree/dataframe_service/test_partitioning.py new file mode 100644 index 00000000..3a6b5b40 --- /dev/null +++ b/tests/unit/butterfree/dataframe_service/test_partitioning.py @@ -0,0 +1,20 @@ +from butterfree.dataframe_service import extract_partition_values + + +class TestPartitioning: + def test_extract_partition_values(self, test_partitioning_input_df): + # arrange + target_values = [ + {"year": 2009, "month": 8, "day": 20}, + {"year": 2020, "month": 8, "day": 20}, + {"year": 2020, "month": 9, "day": 20}, + {"year": 2020, "month": 8, "day": 21}, + ] + + # act + result_values = extract_partition_values( + test_partitioning_input_df, partition_columns=["year", "month", "day"] + ) + + # assert + assert result_values == target_values diff --git a/tests/unit/butterfree/extract/conftest.py b/tests/unit/butterfree/extract/conftest.py index ab6f525c..3d0e763d 100644 --- a/tests/unit/butterfree/extract/conftest.py +++ b/tests/unit/butterfree/extract/conftest.py @@ -1,6 +1,7 @@ from 
unittest.mock import Mock import pytest +from pyspark.sql.functions import col, to_date from butterfree.constants.columns import TIMESTAMP_COLUMN @@ -17,6 +18,60 @@ def target_df(spark_context, spark_session): return spark_session.read.json(spark_context.parallelize(data, 1)) +@pytest.fixture() +def incremental_source_df(spark_context, spark_session): + data = [ + { + "id": 1, + "feature": 100, + "date_str": "28/07/2020", + "milliseconds": 1595894400000, + "year": 2020, + "month": 7, + "day": 28, + }, + { + "id": 1, + "feature": 110, + "date_str": "29/07/2020", + "milliseconds": 1595980800000, + "year": 2020, + "month": 7, + "day": 29, + }, + { + "id": 1, + "feature": 120, + "date_str": "30/07/2020", + "milliseconds": 1596067200000, + "year": 2020, + "month": 7, + "day": 30, + }, + { + "id": 2, + "feature": 150, + "date_str": "31/07/2020", + "milliseconds": 1596153600000, + "year": 2020, + "month": 7, + "day": 31, + }, + { + "id": 2, + "feature": 200, + "date_str": "01/08/2020", + "milliseconds": 1596240000000, + "year": 2020, + "month": 8, + "day": 1, + }, + ] + return spark_session.read.json(spark_context.parallelize(data, 1)).withColumn( + "date", to_date(col("date_str"), "dd/MM/yyyy") + ) + + @pytest.fixture() def spark_client(): return Mock() diff --git a/tests/unit/butterfree/extract/readers/test_file_reader.py b/tests/unit/butterfree/extract/readers/test_file_reader.py index d337d4fe..9e1c42bc 100644 --- a/tests/unit/butterfree/extract/readers/test_file_reader.py +++ b/tests/unit/butterfree/extract/readers/test_file_reader.py @@ -36,11 +36,11 @@ def test_consume( # act output_df = file_reader.consume(spark_client) - options = dict({"path": path}, **format_options if format_options else {}) + options = dict(format_options if format_options else {}) # assert spark_client.read.assert_called_once_with( - format=format, options=options, schema=schema, stream=False + format=format, schema=schema, stream=False, path=path, **options ) assert target_df.collect() == output_df.collect() @@ -51,7 +51,7 @@ def test_consume_with_stream_without_schema(self, spark_client, target_df): schema = None format_options = None stream = True - options = dict({"path": path}) + options = dict({}) spark_client.read.return_value = target_df file_reader = FileReader( @@ -64,11 +64,11 @@ def test_consume_with_stream_without_schema(self, spark_client, target_df): # assert # assert call for schema infer - spark_client.read.assert_any_call(format=format, options=options) + spark_client.read.assert_any_call(format=format, path=path, **options) # assert call for stream read # stream spark_client.read.assert_called_with( - format=format, options=options, schema=output_df.schema, stream=stream + format=format, schema=output_df.schema, stream=stream, path=path, **options ) assert target_df.collect() == output_df.collect() diff --git a/tests/unit/butterfree/extract/readers/test_reader.py b/tests/unit/butterfree/extract/readers/test_reader.py index c210a756..78160553 100644 --- a/tests/unit/butterfree/extract/readers/test_reader.py +++ b/tests/unit/butterfree/extract/readers/test_reader.py @@ -1,7 +1,9 @@ import pytest from pyspark.sql.functions import expr +from butterfree.dataframe_service import IncrementalStrategy from butterfree.extract.readers import FileReader +from butterfree.testing.dataframe import assert_dataframe_equality def add_value_transformer(df, column, value): @@ -152,3 +154,59 @@ def test_build_with_columns( # assert assert column_target_df.collect() == result_df.collect() + + def 
test_build_with_incremental_strategy( + self, incremental_source_df, spark_client, spark_session + ): + # arrange + readers = [ + # directly from column + FileReader( + id="test_1", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=IncrementalStrategy(column="date") + ), + # from milliseconds + FileReader( + id="test_2", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=IncrementalStrategy().from_milliseconds( + column_name="milliseconds" + ) + ), + # from str + FileReader( + id="test_3", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=IncrementalStrategy().from_string( + column_name="date_str", mask="dd/MM/yyyy" + ) + ), + # from year, month, day partitions + FileReader( + id="test_4", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=( + IncrementalStrategy().from_year_month_day_partitions() + ) + ), + ] + + spark_client.read.return_value = incremental_source_df + target_df = incremental_source_df.where( + "date >= date('2020-07-29') and date <= date('2020-07-31')" + ) + + # act + for reader in readers: + reader.build( + client=spark_client, start_date="2020-07-29", end_date="2020-07-31" + ) + + output_dfs = [ + spark_session.table(f"test_{i + 1}") for i, _ in enumerate(readers) + ] + + # assert + for output_df in output_dfs: + assert_dataframe_equality(output_df=output_df, target_df=target_df) diff --git a/tests/unit/butterfree/hooks/__init__.py b/tests/unit/butterfree/hooks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/hooks/schema_compatibility/__init__.py b/tests/unit/butterfree/hooks/schema_compatibility/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py b/tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py new file mode 100644 index 00000000..eccb8d8c --- /dev/null +++ b/tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py @@ -0,0 +1,49 @@ +from unittest.mock import MagicMock + +import pytest + +from butterfree.clients import CassandraClient +from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook + + +class TestCassandraTableSchemaCompatibilityHook: + def test_run_compatible_schema(self, spark_session): + cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") + + cassandra_client.sql = MagicMock( # type: ignore + return_value=[ + {"column_name": "feature1", "type": "text"}, + {"column_name": "feature2", "type": "int"}, + ] + ) + + table = "table" + + input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") + + hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) + + # act and assert + assert hook.run(input_dataframe) == input_dataframe + + def test_run_incompatible_schema(self, spark_session): + cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") + + cassandra_client.sql = MagicMock( # type: ignore + return_value=[ + {"column_name": "feature1", "type": "text"}, + {"column_name": "feature2", "type": "bigint"}, + ] + ) + + table = "table" + + input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") + + hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) + + # act and assert + with pytest.raises( + ValueError, 
match="There's a schema incompatibility between" + ): + hook.run(input_dataframe) diff --git a/tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py b/tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py new file mode 100644 index 00000000..3a31b600 --- /dev/null +++ b/tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py @@ -0,0 +1,53 @@ +import pytest + +from butterfree.clients import SparkClient +from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook + + +class TestSparkTableSchemaCompatibilityHook: + @pytest.mark.parametrize( + "table, database, target_table_expression", + [("table", "database", "`database`.`table`"), ("table", None, "`table`")], + ) + def test_build_table_expression(self, table, database, target_table_expression): + # arrange + spark_client = SparkClient() + + # act + result_table_expression = SparkTableSchemaCompatibilityHook( + spark_client, table, database + ).table_expression + + # assert + assert target_table_expression == result_table_expression + + def test_run_compatible_schema(self, spark_session): + # arrange + spark_client = SparkClient() + target_table = spark_session.sql( + "select 1 as feature_a, 'abc' as feature_b, true as other_feature" + ) + input_dataframe = spark_session.sql("select 1 as feature_a, 'abc' as feature_b") + target_table.registerTempTable("test") + + hook = SparkTableSchemaCompatibilityHook(spark_client, "test") + + # act and assert + assert hook.run(input_dataframe) == input_dataframe + + def test_run_incompatible_schema(self, spark_session): + # arrange + spark_client = SparkClient() + target_table = spark_session.sql( + "select 1 as feature_a, 'abc' as feature_b, true as other_feature" + ) + input_dataframe = spark_session.sql( + "select 1 as feature_a, 'abc' as feature_b, true as unregisted_column" + ) + target_table.registerTempTable("test") + + hook = SparkTableSchemaCompatibilityHook(spark_client, "test") + + # act and assert + with pytest.raises(ValueError, match="The dataframe has a schema incompatible"): + hook.run(input_dataframe) diff --git a/tests/unit/butterfree/hooks/test_hookable_component.py b/tests/unit/butterfree/hooks/test_hookable_component.py new file mode 100644 index 00000000..37e34e69 --- /dev/null +++ b/tests/unit/butterfree/hooks/test_hookable_component.py @@ -0,0 +1,107 @@ +import pytest +from pyspark.sql.functions import expr + +from butterfree.hooks import Hook, HookableComponent +from butterfree.testing.dataframe import assert_dataframe_equality + + +class TestComponent(HookableComponent): + def construct(self, dataframe): + pre_hook_df = self.run_pre_hooks(dataframe) + construct_df = pre_hook_df.withColumn("feature", expr("feature * feature")) + return self.run_post_hooks(construct_df) + + +class AddHook(Hook): + def __init__(self, value): + self.value = value + + def run(self, dataframe): + return dataframe.withColumn("feature", expr(f"feature + {self.value}")) + + +class TestHookableComponent: + def test_add_hooks(self): + # arrange + hook1 = AddHook(value=1) + hook2 = AddHook(value=2) + hook3 = AddHook(value=3) + hook4 = AddHook(value=4) + hookable_component = HookableComponent() + + # act + hookable_component.add_pre_hook(hook1, hook2) + hookable_component.add_post_hook(hook3, hook4) + + # assert + assert hookable_component.pre_hooks == [hook1, hook2] + assert hookable_component.post_hooks == [hook3, hook4] + + @pytest.mark.parametrize( + 
"enable_pre_hooks, enable_post_hooks", + [("not boolean", False), (False, "not boolean")], + ) + def test_invalid_enable_hook(self, enable_pre_hooks, enable_post_hooks): + # arrange + hookable_component = HookableComponent() + + # act and assert + with pytest.raises(ValueError): + hookable_component.enable_pre_hooks = enable_pre_hooks + hookable_component.enable_post_hooks = enable_post_hooks + + @pytest.mark.parametrize( + "pre_hooks, post_hooks", + [ + ([AddHook(1)], "not a list of hooks"), + ([AddHook(1)], [AddHook(1), 2, 3]), + ("not a list of hooks", [AddHook(1)]), + ([AddHook(1), 2, 3], [AddHook(1)]), + ], + ) + def test_invalid_hooks(self, pre_hooks, post_hooks): + # arrange + hookable_component = HookableComponent() + + # act and assert + with pytest.raises(ValueError): + hookable_component.pre_hooks = pre_hooks + hookable_component.post_hooks = post_hooks + + @pytest.mark.parametrize( + "pre_hook, enable_pre_hooks, post_hook, enable_post_hooks", + [ + (AddHook(value=1), False, AddHook(value=1), True), + (AddHook(value=1), True, AddHook(value=1), False), + ("not a pre-hook", True, AddHook(value=1), True), + (AddHook(value=1), True, "not a pre-hook", True), + ], + ) + def test_add_invalid_hooks( + self, pre_hook, enable_pre_hooks, post_hook, enable_post_hooks + ): + # arrange + hookable_component = HookableComponent() + hookable_component.enable_pre_hooks = enable_pre_hooks + hookable_component.enable_post_hooks = enable_post_hooks + + # act and assert + with pytest.raises(ValueError): + hookable_component.add_pre_hook(pre_hook) + hookable_component.add_post_hook(post_hook) + + def test_run_hooks(self, spark_session): + # arrange + input_dataframe = spark_session.sql("select 2 as feature") + test_component = ( + TestComponent() + .add_pre_hook(AddHook(value=1)) + .add_post_hook(AddHook(value=1)) + ) + target_table = spark_session.sql("select 10 as feature") + + # act + output_df = test_component.construct(input_dataframe) + + # assert + assert_dataframe_equality(output_df, target_table) diff --git a/tests/unit/butterfree/load/conftest.py b/tests/unit/butterfree/load/conftest.py index 7c2549c5..4dcf25c9 100644 --- a/tests/unit/butterfree/load/conftest.py +++ b/tests/unit/butterfree/load/conftest.py @@ -32,6 +32,31 @@ def feature_set(): ) +@fixture +def feature_set_incremental(): + key_features = [ + KeyFeature(name="id", description="Description", dtype=DataType.INTEGER) + ] + ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN) + features = [ + Feature( + name="feature", + description="test", + transformation=AggregatedTransform( + functions=[Function(functions.sum, DataType.INTEGER)] + ), + ), + ] + return AggregatedFeatureSet( + "feature_set", + "entity", + "description", + keys=key_features, + timestamp=ts_feature, + features=features, + ) + + @fixture def feature_set_dataframe(spark_context, spark_session): data = [ diff --git a/tests/unit/butterfree/load/test_sink.py b/tests/unit/butterfree/load/test_sink.py index 93b5e279..ef377f67 100644 --- a/tests/unit/butterfree/load/test_sink.py +++ b/tests/unit/butterfree/load/test_sink.py @@ -120,7 +120,7 @@ def test_flush_with_writers_list_empty(self): with pytest.raises(ValueError): Sink(writers=writer) - def test_flush_streaming_df(self, feature_set): + def test_flush_streaming_df(self, feature_set, mocker): """Testing the return of the streaming handlers by the sink.""" # arrange spark_client = SparkClient() @@ -136,10 +136,25 @@ def test_flush_streaming_df(self, feature_set): mocked_stream_df.start.return_value = 
Mock(spec=StreamingQuery) online_feature_store_writer = OnlineFeatureStoreWriter() + + online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") + online_feature_store_writer.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer.check_schema_hook.run.return_value = ( + mocked_stream_df + ) + online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) + online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( + "check_schema_hook" + ) + online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( + mocked_stream_df + ) + sink = Sink( writers=[ online_feature_store_writer, @@ -162,7 +177,7 @@ def test_flush_streaming_df(self, feature_set): assert isinstance(handler, StreamingQuery) def test_flush_with_multiple_online_writers( - self, feature_set, feature_set_dataframe + self, feature_set, feature_set_dataframe, mocker ): """Testing the flow of writing to a feature-set table and to an entity table.""" # arrange @@ -173,10 +188,25 @@ def test_flush_with_multiple_online_writers( feature_set.name = "my_feature_set" online_feature_store_writer = OnlineFeatureStoreWriter() + + online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") + online_feature_store_writer.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer.check_schema_hook.run.return_value = ( + feature_set_dataframe + ) + online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) + online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( + "check_schema_hook" + ) + online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( + feature_set_dataframe + ) + sink = Sink( writers=[online_feature_store_writer, online_feature_store_writer_on_entity] ) diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index 14c067f9..aac806f7 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -19,10 +19,15 @@ def test_write( feature_set, ): # given - spark_client = mocker.stub("spark_client") + spark_client = SparkClient() spark_client.write_table = mocker.stub("write_table") writer = HistoricalFeatureStoreWriter() + schema_dataframe = writer._create_partitions(feature_set_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = schema_dataframe + # when writer.write( feature_set=feature_set, @@ -41,7 +46,76 @@ def test_write( assert ( writer.PARTITION_BY == spark_client.write_table.call_args[1]["partition_by"] ) - assert feature_set.name == spark_client.write_table.call_args[1]["table_name"] + + def test_write_interval_mode( + self, + feature_set_dataframe, + historical_feature_set_dataframe, + mocker, + feature_set, + ): + # given + spark_client = SparkClient() + spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.conn.conf.set( + "spark.sql.sources.partitionOverwriteMode", "dynamic" + ) + writer = HistoricalFeatureStoreWriter(interval_mode=True) + + schema_dataframe = writer._create_partitions(feature_set_dataframe) + 
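+        # the writer appears to run check_schema_hook on the dataframe before
+        # writing, so the hook is stubbed here and run() simply returns the
+        # already partitioned dataframe, keeping the test focused on the write
+        # behaviour rather than on schema validation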
writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = schema_dataframe + + # when + writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) + result_df = spark_client.write_dataframe.call_args[1]["dataframe"] + + # then + assert_dataframe_equality(historical_feature_set_dataframe, result_df) + + assert ( + writer.db_config.format_ + == spark_client.write_dataframe.call_args[1]["format_"] + ) + assert ( + writer.db_config.mode == spark_client.write_dataframe.call_args[1]["mode"] + ) + assert ( + writer.PARTITION_BY + == spark_client.write_dataframe.call_args[1]["partitionBy"] + ) + + def test_write_interval_mode_invalid_partition_mode( + self, + feature_set_dataframe, + historical_feature_set_dataframe, + mocker, + feature_set, + ): + # given + spark_client = SparkClient() + spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "static") + + writer = HistoricalFeatureStoreWriter(interval_mode=True) + + schema_dataframe = writer._create_partitions(feature_set_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = schema_dataframe + + # when + with pytest.raises(RuntimeError): + _ = writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) def test_write_in_debug_mode( self, @@ -49,6 +123,7 @@ def test_write_in_debug_mode( historical_feature_set_dataframe, feature_set, spark_session, + mocker, ): # given spark_client = SparkClient() @@ -65,33 +140,75 @@ def test_write_in_debug_mode( # then assert_dataframe_equality(historical_feature_set_dataframe, result_df) - def test_validate(self, feature_set_dataframe, mocker, feature_set): + def test_write_in_debug_mode_with_interval_mode( + self, + feature_set_dataframe, + historical_feature_set_dataframe, + feature_set, + spark_session, + ): + # given + spark_client = SparkClient() + writer = HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True) + + # when + writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) + result_df = spark_session.table(f"historical_feature_store__{feature_set.name}") + + # then + assert_dataframe_equality(historical_feature_set_dataframe, result_df) + + def test_validate(self, historical_feature_set_dataframe, mocker, feature_set): # given spark_client = mocker.stub("spark_client") spark_client.read_table = mocker.stub("read_table") - spark_client.read_table.return_value = feature_set_dataframe + spark_client.read_table.return_value = historical_feature_set_dataframe writer = HistoricalFeatureStoreWriter() # when - writer.validate(feature_set, feature_set_dataframe, spark_client) + writer.validate(feature_set, historical_feature_set_dataframe, spark_client) # then spark_client.read_table.assert_called_once() - def test_validate_false(self, feature_set_dataframe, mocker, feature_set): + def test_validate_interval_mode( + self, historical_feature_set_dataframe, mocker, feature_set + ): # given spark_client = mocker.stub("spark_client") - spark_client.read_table = mocker.stub("read_table") + spark_client.read = mocker.stub("read") + spark_client.read.return_value = historical_feature_set_dataframe + + writer = HistoricalFeatureStoreWriter(interval_mode=True) + + # 
when + writer.validate(feature_set, historical_feature_set_dataframe, spark_client) + + # then + spark_client.read.assert_called_once() + + def test_validate_false( + self, historical_feature_set_dataframe, mocker, feature_set + ): + # given + spark_client = mocker.stub("spark_client") + spark_client.read = mocker.stub("read") # limiting df to 1 row, now the counts should'n be the same - spark_client.read_table.return_value = feature_set_dataframe.limit(1) + spark_client.read.return_value = historical_feature_set_dataframe.limit(1) - writer = HistoricalFeatureStoreWriter() + writer = HistoricalFeatureStoreWriter(interval_mode=True) # when with pytest.raises(AssertionError): - _ = writer.validate(feature_set, feature_set_dataframe, spark_client) + _ = writer.validate( + feature_set, historical_feature_set_dataframe, spark_client + ) def test__create_partitions(self, spark_session, spark_context): # arrange @@ -201,8 +318,15 @@ def test_write_with_transform( # given spark_client = mocker.stub("spark_client") spark_client.write_table = mocker.stub("write_table") + writer = HistoricalFeatureStoreWriter().with_(json_transform) + schema_dataframe = writer._create_partitions(feature_set_dataframe) + json_dataframe = writer._apply_transformations(schema_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = json_dataframe + # when writer.write( feature_set=feature_set, diff --git a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py index 87823c55..384ec152 100644 --- a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py @@ -68,6 +68,10 @@ def test_write( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -94,11 +98,16 @@ def test_write_in_debug_mode( latest_feature_set_dataframe, feature_set, spark_session, + mocker, ): # given spark_client = SparkClient() writer = OnlineFeatureStoreWriter(debug_mode=True) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write( feature_set=feature_set, @@ -110,9 +119,7 @@ def test_write_in_debug_mode( # then assert_dataframe_equality(latest_feature_set_dataframe, result_df) - def test_write_in_debug_and_stream_mode( - self, feature_set, spark_session, - ): + def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker): # arrange spark_client = SparkClient() @@ -125,6 +132,10 @@ def test_write_in_debug_and_stream_mode( writer = OnlineFeatureStoreWriter(debug_mode=True) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = mocked_stream_df + # act handler = writer.write( feature_set=feature_set, @@ -140,7 +151,7 @@ def test_write_in_debug_and_stream_mode( assert isinstance(handler, StreamingQuery) @pytest.mark.parametrize("has_checkpoint", [True, False]) - def 
test_write_stream(self, feature_set, has_checkpoint, monkeypatch): + def test_write_stream(self, feature_set, has_checkpoint, monkeypatch, mocker): # arrange spark_client = SparkClient() spark_client.write_stream = Mock() @@ -163,6 +174,10 @@ def test_write_stream(self, feature_set, has_checkpoint, monkeypatch): writer = OnlineFeatureStoreWriter(cassandra_config) writer.filter_latest = Mock() + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = dataframe + # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -186,7 +201,7 @@ def test_get_db_schema(self, cassandra_config, test_feature_set, expected_schema assert schema == expected_schema - def test_write_stream_on_entity(self, feature_set, monkeypatch): + def test_write_stream_on_entity(self, feature_set, monkeypatch, mocker): """Test write method with stream dataframe and write_to_entity enabled. The main purpose of this test is assert the correct setup of stream checkpoint @@ -209,6 +224,10 @@ def test_write_stream_on_entity(self, feature_set, monkeypatch): writer = OnlineFeatureStoreWriter(write_to_entity=True) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = dataframe + # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -237,6 +256,10 @@ def test_write_with_transform( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config).with_(json_transform) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -270,6 +293,10 @@ def test_write_with_kafka_config( kafka_config = KafkaConfig() writer = OnlineFeatureStoreWriter(kafka_config).with_(json_transform) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -293,6 +320,10 @@ def test_write_with_custom_kafka_config( json_transform ) + custom_writer.check_schema_hook = mocker.stub("check_schema_hook") + custom_writer.check_schema_hook.run = mocker.stub("run") + custom_writer.check_schema_hook.run.return_value = feature_set_dataframe + # when custom_writer.write(feature_set, feature_set_dataframe, spark_client) diff --git a/tests/unit/butterfree/pipelines/conftest.py b/tests/unit/butterfree/pipelines/conftest.py new file mode 100644 index 00000000..47e65efb --- /dev/null +++ b/tests/unit/butterfree/pipelines/conftest.py @@ -0,0 +1,63 @@ +from unittest.mock import Mock + +from pyspark.sql import functions +from pytest import fixture + +from butterfree.clients import SparkClient +from butterfree.constants import DataType +from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load import Sink +from butterfree.load.writers import HistoricalFeatureStoreWriter +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from 
butterfree.transform.transformations import SparkFunctionTransform +from butterfree.transform.utils import Function + + +@fixture() +def feature_set_pipeline(): + test_pipeline = FeatureSetPipeline( + spark_client=SparkClient(), + source=Mock( + spec=Source, + readers=[TableReader(id="source_a", database="db", table="table",)], + query="select * from source_a", + ), + feature_set=Mock( + spec=FeatureSet, + name="feature_set", + entity="entity", + description="description", + keys=[ + KeyFeature( + name="user_id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(from_column="ts"), + features=[ + Feature( + name="listing_page_viewed__rent_per_month", + description="Average of something.", + transformation=SparkFunctionTransform( + functions=[ + Function(functions.avg, DataType.FLOAT), + Function(functions.stddev_pop, DataType.FLOAT), + ], + ).with_window( + partition_by="user_id", + order_by=TIMESTAMP_COLUMN, + window_definition=["7 days", "2 weeks"], + mode="fixed_windows", + ), + ), + ], + ), + sink=Mock(spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)],), + ) + + return test_pipeline diff --git a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py index 1bc3c707..7bae6606 100644 --- a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py @@ -17,12 +17,8 @@ from butterfree.load.writers.writer import Writer from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline from butterfree.transform import FeatureSet -from butterfree.transform.aggregated_feature_set import AggregatedFeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature -from butterfree.transform.transformations import ( - AggregatedTransform, - SparkFunctionTransform, -) +from butterfree.transform.transformations import SparkFunctionTransform from butterfree.transform.utils import Function @@ -104,115 +100,29 @@ def test_feature_set_args(self): assert len(pipeline.sink.writers) == 2 assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers) - def test_run(self, spark_session): - test_pipeline = FeatureSetPipeline( - spark_client=SparkClient(), - source=Mock( - spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], - query="select * from source_a", - ), - feature_set=Mock( - spec=FeatureSet, - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="listing_page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.FLOAT), - ], - ).with_window( - partition_by="user_id", - order_by=TIMESTAMP_COLUMN, - window_definition=["7 days", "2 weeks"], - mode="fixed_windows", - ), - ), - ], - ), - sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], - ), - ) - + def test_run(self, spark_session, feature_set_pipeline): # feature_set need to return a real df for streaming validation sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) - test_pipeline.feature_set.construct.return_value = sample_df + 
feature_set_pipeline.feature_set.construct.return_value = sample_df - test_pipeline.run() + feature_set_pipeline.run() - test_pipeline.source.construct.assert_called_once() - test_pipeline.feature_set.construct.assert_called_once() - test_pipeline.sink.flush.assert_called_once() - test_pipeline.sink.validate.assert_called_once() - - def test_run_with_repartition(self, spark_session): - test_pipeline = FeatureSetPipeline( - spark_client=SparkClient(), - source=Mock( - spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], - query="select * from source_a", - ), - feature_set=Mock( - spec=FeatureSet, - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="listing_page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.FLOAT), - ], - ).with_window( - partition_by="user_id", - order_by=TIMESTAMP_COLUMN, - window_definition=["7 days", "2 weeks"], - mode="fixed_windows", - ), - ), - ], - ), - sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], - ), - ) + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() + def test_run_with_repartition(self, spark_session, feature_set_pipeline): # feature_set need to return a real df for streaming validation sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) - test_pipeline.feature_set.construct.return_value = sample_df + feature_set_pipeline.feature_set.construct.return_value = sample_df - test_pipeline.run(partition_by=["id"]) + feature_set_pipeline.run(partition_by=["id"]) - test_pipeline.source.construct.assert_called_once() - test_pipeline.feature_set.construct.assert_called_once() - test_pipeline.sink.flush.assert_called_once() - test_pipeline.sink.validate.assert_called_once() + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() def test_source_raise(self): with pytest.raises(ValueError, match="source must be a Source instance"): @@ -343,52 +253,26 @@ def test_sink_raise(self): sink=Mock(writers=[HistoricalFeatureStoreWriter(db_config=None)],), ) - def test_run_agg_with_end_date(self, spark_session): - test_pipeline = FeatureSetPipeline( - spark_client=SparkClient(), - source=Mock( - spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], - query="select * from source_a", - ), - feature_set=Mock( - spec=AggregatedFeatureSet, - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="listing_page_viewed__rent_per_month", - description="Average of something.", - transformation=AggregatedTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.FLOAT), - ], 
- ), - ), - ], - ), - sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], - ), - ) + def test_run_agg_with_end_date(self, spark_session, feature_set_pipeline): + # feature_set need to return a real df for streaming validation + sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) + feature_set_pipeline.feature_set.construct.return_value = sample_df + + feature_set_pipeline.run(end_date="2016-04-18") + + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() + def test_run_agg_with_start_date(self, spark_session, feature_set_pipeline): # feature_set need to return a real df for streaming validation sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) - test_pipeline.feature_set.construct.return_value = sample_df + feature_set_pipeline.feature_set.construct.return_value = sample_df - test_pipeline.run(end_date="2016-04-18") + feature_set_pipeline.run(start_date="2020-08-04") - test_pipeline.source.construct.assert_called_once() - test_pipeline.feature_set.construct.assert_called_once() - test_pipeline.sink.flush.assert_called_once() - test_pipeline.sink.validate.assert_called_once() + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index 2d7d3e50..febc8bbc 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -1,11 +1,19 @@ import json from unittest.mock import Mock +from pyspark.sql import functions from pytest import fixture from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.transform import FeatureSet +from butterfree.transform.aggregated_feature_set import AggregatedFeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from butterfree.transform.transformations import ( + AggregatedTransform, + SparkFunctionTransform, +) +from butterfree.transform.utils import Function def make_dataframe(spark_context, spark_session): @@ -297,3 +305,77 @@ def key_id(): @fixture def timestamp_c(): return TimestampFeature() + + +@fixture +def feature_set(): + feature_set = FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature1", + description="test", + transformation=SparkFunctionTransform( + functions=[ + Function(functions.avg, DataType.FLOAT), + Function(functions.stddev_pop, DataType.DOUBLE), + ] + ).with_window( + partition_by="id", + order_by=TIMESTAMP_COLUMN, + mode="fixed_windows", + window_definition=["2 minutes", "15 minutes"], + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ) + + return feature_set + + +@fixture +def agg_feature_set(): + feature_set = AggregatedFeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature1", + description="test", + transformation=AggregatedTransform( + functions=[ + Function(functions.avg, DataType.DOUBLE), + 
Function(functions.stddev_pop, DataType.FLOAT), + ], + ), + ), + Feature( + name="feature2", + description="test", + transformation=AggregatedTransform( + functions=[Function(functions.count, DataType.ARRAY_STRING)] + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ).with_windows(definitions=["1 week", "2 days"]) + + return feature_set diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 2c404fea..8025d6f8 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -89,7 +89,7 @@ def test_agg_feature_set_with_window( output_df = fs.construct(dataframe, spark_client, end_date="2016-05-01") assert_dataframe_equality(output_df, rolling_windows_agg_dataframe) - def test_get_schema(self): + def test_get_schema(self, agg_feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, @@ -125,40 +125,7 @@ def test_get_schema(self): }, ] - feature_set = AggregatedFeatureSet( - name="feature_set", - entity="entity", - description="description", - features=[ - Feature( - name="feature1", - description="test", - transformation=AggregatedTransform( - functions=[ - Function(functions.avg, DataType.DOUBLE), - Function(functions.stddev_pop, DataType.FLOAT), - ], - ), - ), - Feature( - name="feature2", - description="test", - transformation=AggregatedTransform( - functions=[Function(functions.count, DataType.ARRAY_STRING)] - ), - ), - ], - keys=[ - KeyFeature( - name="id", - description="The user's Main ID or device ID", - dtype=DataType.BIGINT, - ) - ], - timestamp=TimestampFeature(), - ).with_windows(definitions=["1 week", "2 days"]) - - schema = feature_set.get_schema() + schema = agg_feature_set.get_schema() assert schema == expected_schema @@ -389,3 +356,34 @@ def test_feature_transform_with_data_type_array(self, spark_context, spark_sessi # assert assert_dataframe_equality(target_df, output_df) + + def test_define_start_date(self, agg_feature_set): + start_date = agg_feature_set.define_start_date("2020-08-04") + + assert isinstance(start_date, str) + assert start_date == "2020-07-27" + + def test_feature_set_start_date( + self, timestamp_c, feature_set_with_distinct_dataframe, + ): + fs = AggregatedFeatureSet( + name="name", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=AggregatedTransform( + functions=[Function(functions.sum, DataType.INTEGER)] + ), + ), + ], + keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)], + timestamp=timestamp_c, + ).with_windows(["10 days", "3 weeks", "90 days"]) + + # assert + start_date = fs.define_start_date("2016-04-14") + + assert start_date == "2016-01-14" diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index bdb1ff7d..43d937be 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -12,13 +12,11 @@ from butterfree.clients import SparkClient from butterfree.constants import DataType -from butterfree.constants.columns import TIMESTAMP_COLUMN from butterfree.testing.dataframe import assert_dataframe_equality from 
butterfree.transform import FeatureSet -from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from butterfree.transform.features import Feature from butterfree.transform.transformations import ( AggregatedTransform, - SparkFunctionTransform, SQLExpressionTransform, ) from butterfree.transform.utils import Function @@ -341,7 +339,7 @@ def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe): timestamp=timestamp_c, ).construct(dataframe, spark_client) - def test_get_schema(self): + def test_get_schema(self, feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, @@ -367,37 +365,6 @@ def test_get_schema(self): }, ] - feature_set = FeatureSet( - name="feature_set", - entity="entity", - description="description", - features=[ - Feature( - name="feature1", - description="test", - transformation=SparkFunctionTransform( - functions=[ - Function(F.avg, DataType.FLOAT), - Function(F.stddev_pop, DataType.DOUBLE), - ] - ).with_window( - partition_by="id", - order_by=TIMESTAMP_COLUMN, - mode="fixed_windows", - window_definition=["2 minutes", "15 minutes"], - ), - ), - ], - keys=[ - KeyFeature( - name="id", - description="The user's Main ID or device ID", - dtype=DataType.BIGINT, - ) - ], - timestamp=TimestampFeature(), - ) - schema = feature_set.get_schema() assert schema == expected_schema @@ -421,3 +388,9 @@ def test_feature_without_datatype(self, key_id, timestamp_c, dataframe): keys=[key_id], timestamp=timestamp_c, ).construct(dataframe, spark_client) + + def test_define_start_date(self, feature_set): + start_date = feature_set.define_start_date("2020-08-04") + + assert isinstance(start_date, str) + assert start_date == "2020-08-04" From 8da89edac01510f31e8da33b0c5b474e93f2e5a4 Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Mon, 22 Feb 2021 09:22:28 -0300 Subject: [PATCH 12/86] Allow slide selection (#293) --- Makefile | 2 +- .../transform/aggregated_feature_set.py | 61 ++++++++++----- butterfree/transform/utils/window_spec.py | 11 +-- setup.py | 2 +- tests/unit/butterfree/transform/conftest.py | 57 +++++++++----- .../transform/test_aggregated_feature_set.py | 76 +++++++------------ 6 files changed, 116 insertions(+), 93 deletions(-) diff --git a/Makefile b/Makefile index e6de9baa..397d04bf 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,7 @@ checks: style-check quality-check type-check ## fix stylistic errors with black apply-style: @python -m black -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . 
- @python -m isort -rc butterfree/ tests/ + @python -m isort -rc --atomic butterfree/ tests/ .PHONY: clean ## clean unused artifacts diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index a19efb35..7a8656cd 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -301,7 +301,9 @@ def with_distinct(self, subset: List, keep: str = "last") -> "AggregatedFeatureS return self - def with_windows(self, definitions: List[str]) -> "AggregatedFeatureSet": + def with_windows( + self, definitions: List[str], slide: str = None + ) -> "AggregatedFeatureSet": """Create a list with windows defined.""" self._windows = [ Window( @@ -309,6 +311,7 @@ def with_windows(self, definitions: List[str]) -> "AggregatedFeatureSet": order_by=None, mode="rolling_windows", window_definition=definition, + slide=slide, ) for definition in definitions ] @@ -563,12 +566,6 @@ def construct( ) if self._windows and end_date is not None: - # prepare our left table, a cartesian product between distinct keys - # and dates in range for this feature set - base_df = self._get_base_dataframe( - client=client, dataframe=output_df, end_date=end_date - ) - # run aggregations for each window agg_list = [ self._aggregate( @@ -580,18 +577,44 @@ def construct( for w in self._windows ] - # left join each aggregation result to our base dataframe - output_df = reduce( - lambda left, right: self._dataframe_join( - left, - right, - on=self.keys_columns + [self.timestamp_column], - how="left", - num_processors=num_processors, - ), - agg_list, - base_df, - ) + # prepare our left table, a cartesian product between distinct keys + # and dates in range for this feature set + + # todo next versions won't use this logic anymore, + # leaving for the client to correct the usage of aggregations + # without events + + # keeping this logic to maintain the same behavior for already implemented + # feature sets + + if self._windows[0].slide == "1 day": + base_df = self._get_base_dataframe( + client=client, dataframe=output_df, end_date=end_date + ) + + # left join each aggregation result to our base dataframe + output_df = reduce( + lambda left, right: self._dataframe_join( + left, + right, + on=self.keys_columns + [self.timestamp_column], + how="left", + num_processors=num_processors, + ), + agg_list, + base_df, + ) + else: + output_df = reduce( + lambda left, right: self._dataframe_join( + left, + right, + on=self.keys_columns + [self.timestamp_column], + how="full outer", + num_processors=num_processors, + ), + agg_list, + ) else: output_df = self._aggregate(output_df, features=self.features) diff --git a/butterfree/transform/utils/window_spec.py b/butterfree/transform/utils/window_spec.py index a270fec0..53ecd2fd 100644 --- a/butterfree/transform/utils/window_spec.py +++ b/butterfree/transform/utils/window_spec.py @@ -62,7 +62,7 @@ class Window: Use the static methods in :class:`Window` to create a :class:`WindowSpec`. 
""" - SLIDE_DURATION: str = "1 day" + DEFAULT_SLIDE_DURATION: str = "1 day" def __init__( self, @@ -70,10 +70,12 @@ def __init__( partition_by: Optional[Union[Column, str, List[str]]] = None, order_by: Optional[Union[Column, str]] = None, mode: str = None, + slide: str = None, ): self.partition_by = partition_by self.order_by = order_by or TIMESTAMP_COLUMN self.frame_boundaries = FrameBoundaries(mode, window_definition) + self.slide = slide or self.DEFAULT_SLIDE_DURATION def get_name(self) -> str: """Return window suffix name based on passed criteria.""" @@ -89,15 +91,10 @@ def get_name(self) -> str: def get(self) -> Any: """Defines a common window to be used both in time and rows windows.""" if self.frame_boundaries.mode == "rolling_windows": - if int(self.frame_boundaries.window_definition.split()[0]) <= 0: - raise KeyError( - f"{self.frame_boundaries.window_definition} " - f"have negative element." - ) return functions.window( TIMESTAMP_COLUMN, self.frame_boundaries.window_definition, - slideDuration=self.SLIDE_DURATION, + slideDuration=self.slide, ) elif self.order_by == TIMESTAMP_COLUMN: w = sql.Window.partitionBy(self.partition_by).orderBy( # type: ignore diff --git a/setup.py b/setup.py index 4adcbce9..393fb0a0 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev0" +__version__ = "1.2.0.dev1" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index febc8bbc..bbef8b13 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -135,6 +135,35 @@ def make_rolling_windows_agg_dataframe(spark_context, spark_session): return df +def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): + data = [ + { + "id": 1, + "timestamp": "2016-04-11 12:00:00", + "feature1__avg_over_1_day_rolling_windows": 266.6666666666667, + "feature2__avg_over_1_day_rolling_windows": 300.0, + }, + { + "id": 1, + "timestamp": "2016-04-12 00:00:00", + "feature1__avg_over_1_day_rolling_windows": 300.0, + "feature2__avg_over_1_day_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-12 12:00:00", + "feature1__avg_over_1_day_rolling_windows": 400.0, + "feature2__avg_over_1_day_rolling_windows": 500.0, + }, + ] + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + def make_fs(spark_context, spark_session): df = make_dataframe(spark_context, spark_session) df = ( @@ -241,6 +270,11 @@ def rolling_windows_agg_dataframe(spark_context, spark_session): return make_rolling_windows_agg_dataframe(spark_context, spark_session) +@fixture +def rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): + return make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session) + + @fixture def feature_set_with_distinct_dataframe(spark_context, spark_session): return make_fs_dataframe_with_distinct(spark_context, spark_session) @@ -345,8 +379,8 @@ def feature_set(): @fixture def agg_feature_set(): - feature_set = AggregatedFeatureSet( - name="feature_set", + return AggregatedFeatureSet( + name="name", entity="entity", description="description", features=[ @@ -354,28 +388,17 @@ def agg_feature_set(): name="feature1", description="test", 
transformation=AggregatedTransform( - functions=[ - Function(functions.avg, DataType.DOUBLE), - Function(functions.stddev_pop, DataType.FLOAT), - ], + functions=[Function(functions.avg, DataType.DOUBLE)], ), ), Feature( name="feature2", description="test", transformation=AggregatedTransform( - functions=[Function(functions.count, DataType.ARRAY_STRING)] + functions=[Function(functions.avg, DataType.DOUBLE)] ), ), ], - keys=[ - KeyFeature( - name="id", - description="The user's Main ID or device ID", - dtype=DataType.BIGINT, - ) - ], + keys=[KeyFeature(name="id", description="description", dtype=DataType.BIGINT,)], timestamp=TimestampFeature(), - ).with_windows(definitions=["1 week", "2 days"]) - - return feature_set + ) diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 8025d6f8..458956f3 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -1,13 +1,6 @@ import pytest from pyspark.sql import functions -from pyspark.sql.types import ( - ArrayType, - DoubleType, - FloatType, - LongType, - StringType, - TimestampType, -) +from pyspark.sql.types import DoubleType, LongType, TimestampType from butterfree.clients import SparkClient from butterfree.constants import DataType @@ -51,33 +44,11 @@ def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe): ).construct(dataframe, spark_client) def test_agg_feature_set_with_window( - self, key_id, timestamp_c, dataframe, rolling_windows_agg_dataframe + self, dataframe, rolling_windows_agg_dataframe, agg_feature_set, ): spark_client = SparkClient() - fs = AggregatedFeatureSet( - name="name", - entity="entity", - description="description", - features=[ - Feature( - name="feature1", - description="unit test", - transformation=AggregatedTransform( - functions=[Function(functions.avg, DataType.FLOAT)] - ), - ), - Feature( - name="feature2", - description="unit test", - transformation=AggregatedTransform( - functions=[Function(functions.avg, DataType.FLOAT)] - ), - ), - ], - keys=[key_id], - timestamp=timestamp_c, - ).with_windows(definitions=["1 week"]) + fs = agg_feature_set.with_windows(definitions=["1 week"]) # raises without end date with pytest.raises(ValueError): @@ -89,6 +60,21 @@ def test_agg_feature_set_with_window( output_df = fs.construct(dataframe, spark_client, end_date="2016-05-01") assert_dataframe_equality(output_df, rolling_windows_agg_dataframe) + def test_agg_feature_set_with_smaller_slide( + self, dataframe, rolling_windows_hour_slide_agg_dataframe, agg_feature_set, + ): + spark_client = SparkClient() + + fs = agg_feature_set.with_windows(definitions=["1 day"], slide="12 hours") + + # raises without end date + with pytest.raises(ValueError): + _ = fs.construct(dataframe, spark_client) + + # filters with date smaller then mocked max + output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17") + assert_dataframe_equality(output_df, rolling_windows_hour_slide_agg_dataframe) + def test_get_schema(self, agg_feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, @@ -104,28 +90,20 @@ def test_get_schema(self, agg_feature_set): "primary_key": False, }, { - "column_name": "feature1__stddev_pop_over_1_week_rolling_windows", - "type": FloatType(), - "primary_key": False, - }, - { - "column_name": "feature1__stddev_pop_over_2_days_rolling_windows", - "type": FloatType(), - 
"primary_key": False, - }, - { - "column_name": "feature2__count_over_1_week_rolling_windows", - "type": ArrayType(StringType(), True), + "column_name": "feature2__avg_over_1_week_rolling_windows", + "type": DoubleType(), "primary_key": False, }, { - "column_name": "feature2__count_over_2_days_rolling_windows", - "type": ArrayType(StringType(), True), + "column_name": "feature2__avg_over_2_days_rolling_windows", + "type": DoubleType(), "primary_key": False, }, ] - schema = agg_feature_set.get_schema() + schema = agg_feature_set.with_windows( + definitions=["1 week", "2 days"] + ).get_schema() assert schema == expected_schema @@ -358,7 +336,9 @@ def test_feature_transform_with_data_type_array(self, spark_context, spark_sessi assert_dataframe_equality(target_df, output_df) def test_define_start_date(self, agg_feature_set): - start_date = agg_feature_set.define_start_date("2020-08-04") + start_date = agg_feature_set.with_windows( + definitions=["1 week", "2 days"] + ).define_start_date("2020-08-04") assert isinstance(start_date, str) assert start_date == "2020-07-27" From 0df07aebded2b2cb2a35370d22d3dda6f2f8713a Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Fri, 26 Feb 2021 10:36:18 -0300 Subject: [PATCH 13/86] Fix Slide Duration Typo (#295) --- .../transform/aggregated_feature_set.py | 2 +- setup.py | 2 +- tests/unit/butterfree/transform/conftest.py | 56 +++++++++++++++++++ .../transform/test_aggregated_feature_set.py | 22 ++++++++ 4 files changed, 80 insertions(+), 2 deletions(-) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 7a8656cd..133195d7 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -610,7 +610,7 @@ def construct( left, right, on=self.keys_columns + [self.timestamp_column], - how="full outer", + how="full_outer", num_processors=num_processors, ), agg_list, diff --git a/setup.py b/setup.py index 393fb0a0..7f65117a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev1" +__version__ = "1.2.0.dev2" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index bbef8b13..ab760640 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -164,6 +164,55 @@ def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): return df +def make_multiple_rolling_windows_hour_slide_agg_dataframe( + spark_context, spark_session +): + data = [ + { + "id": 1, + "timestamp": "2016-04-11 12:00:00", + "feature1__avg_over_2_days_rolling_windows": 266.6666666666667, + "feature1__avg_over_3_days_rolling_windows": 266.6666666666667, + "feature2__avg_over_2_days_rolling_windows": 300.0, + "feature2__avg_over_3_days_rolling_windows": 300.0, + }, + { + "id": 1, + "timestamp": "2016-04-12 00:00:00", + "feature1__avg_over_2_days_rolling_windows": 300.0, + "feature1__avg_over_3_days_rolling_windows": 300.0, + "feature2__avg_over_2_days_rolling_windows": 350.0, + "feature2__avg_over_3_days_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-13 12:00:00", + "feature1__avg_over_2_days_rolling_windows": 400.0, + "feature1__avg_over_3_days_rolling_windows": 300.0, + 
"feature2__avg_over_2_days_rolling_windows": 500.0, + "feature2__avg_over_3_days_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-14 00:00:00", + "feature1__avg_over_3_days_rolling_windows": 300.0, + "feature2__avg_over_3_days_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-14 12:00:00", + "feature1__avg_over_3_days_rolling_windows": 400.0, + "feature2__avg_over_3_days_rolling_windows": 500.0, + }, + ] + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + def make_fs(spark_context, spark_session): df = make_dataframe(spark_context, spark_session) df = ( @@ -275,6 +324,13 @@ def rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): return make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session) +@fixture +def multiple_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): + return make_multiple_rolling_windows_hour_slide_agg_dataframe( + spark_context, spark_session + ) + + @fixture def feature_set_with_distinct_dataframe(spark_context, spark_session): return make_fs_dataframe_with_distinct(spark_context, spark_session) diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 458956f3..73320cf5 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -75,6 +75,28 @@ def test_agg_feature_set_with_smaller_slide( output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17") assert_dataframe_equality(output_df, rolling_windows_hour_slide_agg_dataframe) + def test_agg_feature_set_with_smaller_slide_and_multiple_windows( + self, + dataframe, + multiple_rolling_windows_hour_slide_agg_dataframe, + agg_feature_set, + ): + spark_client = SparkClient() + + fs = agg_feature_set.with_windows( + definitions=["2 days", "3 days"], slide="12 hours" + ) + + # raises without end date + with pytest.raises(ValueError): + _ = fs.construct(dataframe, spark_client) + + # filters with date smaller then mocked max + output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17") + assert_dataframe_equality( + output_df, multiple_rolling_windows_hour_slide_agg_dataframe + ) + def test_get_schema(self, agg_feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, From aeb79998ecc6b72f3214c82ca993a1ca7aad48e7 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Mar 2021 17:53:29 -0300 Subject: [PATCH 14/86] [MLOP-637] Implement diff method (#292) --- butterfree/migrations/__init__.py | 7 +- butterfree/migrations/cassandra_migration.py | 23 ---- .../migrations/database_migration/__init__.py | 11 ++ .../database_migration/cassandra_migration.py | 36 ++++++ .../database_migration/database_migration.py | 119 ++++++++++++++++++ .../database_migration/metastore_migration.py | 31 +++++ butterfree/migrations/metastore_migration.py | 23 ---- butterfree/migrations/migrate.py | 41 ++++++ butterfree/migrations/migration.py | 62 --------- setup.cfg | 2 +- tests/unit/butterfree/migrations/__init__.py | 0 .../migrations/database_migration/__init__.py | 0 .../migrations/database_migration/conftest.py | 20 +++ .../test_database_migration.py | 56 +++++++++ 14 files changed, 317 insertions(+), 114 deletions(-) 
delete mode 100644 butterfree/migrations/cassandra_migration.py create mode 100644 butterfree/migrations/database_migration/__init__.py create mode 100644 butterfree/migrations/database_migration/cassandra_migration.py create mode 100644 butterfree/migrations/database_migration/database_migration.py create mode 100644 butterfree/migrations/database_migration/metastore_migration.py delete mode 100644 butterfree/migrations/metastore_migration.py create mode 100644 butterfree/migrations/migrate.py delete mode 100644 butterfree/migrations/migration.py create mode 100644 tests/unit/butterfree/migrations/__init__.py create mode 100644 tests/unit/butterfree/migrations/database_migration/__init__.py create mode 100644 tests/unit/butterfree/migrations/database_migration/conftest.py create mode 100644 tests/unit/butterfree/migrations/database_migration/test_database_migration.py diff --git a/butterfree/migrations/__init__.py b/butterfree/migrations/__init__.py index 5f709bfe..39cabfb7 100644 --- a/butterfree/migrations/__init__.py +++ b/butterfree/migrations/__init__.py @@ -1,7 +1,4 @@ """Holds available migrations.""" +from butterfree.migrations.migrate import Migrate -from butterfree.migrations.cassandra_migration import CassandraMigration -from butterfree.migrations.metastore_migration import MetastoreMigration -from butterfree.migrations.migration import DatabaseMigration - -__all__ = ["DatabaseMigration", "CassandraMigration", "MetastoreMigration"] +__all__ = ["Migrate"] diff --git a/butterfree/migrations/cassandra_migration.py b/butterfree/migrations/cassandra_migration.py deleted file mode 100644 index e9cecdc7..00000000 --- a/butterfree/migrations/cassandra_migration.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Cassandra Migration entity.""" - -from typing import Any, Dict, List - -from butterfree.migrations import DatabaseMigration - - -class CassandraMigration(DatabaseMigration): - """Cassandra class for Migrations.""" - - def create_query( - self, - fs_schema: List[Dict[str, Any]], - db_schema: List[Dict[str, Any]], - table_name: str, - ) -> Any: - """Create a query regarding Cassandra. - - Returns: - Schema object. 
- - """ - pass diff --git a/butterfree/migrations/database_migration/__init__.py b/butterfree/migrations/database_migration/__init__.py new file mode 100644 index 00000000..7138c445 --- /dev/null +++ b/butterfree/migrations/database_migration/__init__.py @@ -0,0 +1,11 @@ +"""Holds available database migrations.""" + +from butterfree.migrations.database_migration.cassandra_migration import ( + CassandraMigration, +) +from butterfree.migrations.database_migration.database_migration import Diff +from butterfree.migrations.database_migration.metastore_migration import ( + MetastoreMigration, +) + +__all__ = ["CassandraMigration", "MetastoreMigration", "Diff"] diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py new file mode 100644 index 00000000..c4943c8e --- /dev/null +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -0,0 +1,36 @@ +"""Cassandra Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.clients import CassandraClient +from butterfree.configs.db import CassandraConfig +from butterfree.migrations.database_migration.database_migration import ( + DatabaseMigration, +) + + +class CassandraMigration(DatabaseMigration): + """Cassandra class for Migrations.""" + + def __init__(self) -> None: + self._db_config = CassandraConfig() + self._client = CassandraClient( + host=[self._db_config.host], + keyspace=self._db_config.keyspace, # type: ignore + user=self._db_config.username, + password=self._db_config.password, + ) + + def create_query( + self, + table_name: str, + db_schema: List[Dict[str, Any]] = None, + diff_schema: List[Dict[str, Any]] = None, + ) -> Any: + """Create a query regarding Cassandra. + + Returns: + Schema object. + + """ + pass diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py new file mode 100644 index 00000000..032a7acc --- /dev/null +++ b/butterfree/migrations/database_migration/database_migration.py @@ -0,0 +1,119 @@ +"""Migration entity.""" +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any, Dict, List, Set + +from butterfree.transform import FeatureSet + + +@dataclass +class Diff: + """DataClass to help identifying different types of diff between schemas.""" + + class Kind(Enum): + """Mapping actions to take given a difference between columns of a schema.""" + + ADD = auto() + DROP = auto() + ALTER_TYPE = auto() + ALTER_KEY = auto() + + column: str + kind: Kind + value: Any + + def __hash__(self) -> int: + return hash((self.column, self.kind, self.value)) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, type(self)): + raise NotImplementedError + return ( + self.column == other.column + and self.kind == other.kind + and self.value == other.value + ) + + +class DatabaseMigration(ABC): + """Abstract base class for Migrations.""" + + @abstractmethod + def create_query( + self, + table_name: str, + db_schema: List[Dict[str, Any]] = None, + diff_schema: List[Dict[str, Any]] = None, + ) -> Any: + """Create a query regarding a data source. + + Returns: + The desired query for the given database. 
+ + """ + + def _apply_migration(self, feature_set: FeatureSet) -> None: + """Apply the migration in the respective database.""" + pass + + @staticmethod + def _get_diff( + fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]], + ) -> Set[Diff]: + """Gets schema difference between feature set and the table of a given db. + + Args: + fs_schema: object that contains feature set's schemas. + db_schema: object that contains the table of a given db schema. + + """ + db_columns = set(item.get("column_name") for item in db_schema) + fs_columns = set(item.get("column_name") for item in fs_schema) + + add_columns = fs_columns - db_columns + drop_columns = db_columns - fs_columns + + # This could be way easier to write (and to read) if the schemas were a simple + # Dict[str, Any] where each key would be the column name itself... + # but changing that could break things so: + # TODO version 2 change get schema to return a dict(columns, properties) + alter_type_columns = dict() + alter_key_columns = dict() + for fs_item in fs_schema: + for db_item in db_schema: + if fs_item.get("column_name") == db_item.get("column_name"): + if fs_item.get("type") != db_item.get("type"): + alter_type_columns.update( + {fs_item.get("column_name"): fs_item.get("type")} + ) + if fs_item.get("primary_key") != db_item.get("primary_key"): + alter_key_columns.update( + {fs_item.get("column_name"): fs_item.get("primary_key")} + ) + break + + schema_diff = set( + Diff(str(col), kind=Diff.Kind.ADD, value=None) for col in add_columns + ) + schema_diff |= set( + Diff(str(col), kind=Diff.Kind.DROP, value=None) for col in drop_columns + ) + schema_diff |= set( + Diff(str(col), kind=Diff.Kind.ALTER_TYPE, value=value) + for col, value in alter_type_columns.items() + ) + schema_diff |= set( + Diff(str(col), kind=Diff.Kind.ALTER_KEY, value=value) + for col, value in alter_key_columns.items() + ) + return schema_diff + + def run(self, feature_set: FeatureSet) -> None: + """Runs the migrations. + + Args: + feature_set: the feature set. + + """ + pass diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py new file mode 100644 index 00000000..ae0dd182 --- /dev/null +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -0,0 +1,31 @@ +"""Metastore Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.clients import SparkClient +from butterfree.configs.db import MetastoreConfig +from butterfree.migrations.database_migration.database_migration import ( + DatabaseMigration, +) + + +class MetastoreMigration(DatabaseMigration): + """Metastore class for Migrations.""" + + def __init__(self) -> None: + self._db_config = MetastoreConfig() + self._client = SparkClient() + + def create_query( + self, + table_name: str, + db_schema: List[Dict[str, Any]] = None, + diff_schema: List[Dict[str, Any]] = None, + ) -> Any: + """Create a query regarding Metastore. + + Returns: + Schema object. 
+ + """ + pass diff --git a/butterfree/migrations/metastore_migration.py b/butterfree/migrations/metastore_migration.py deleted file mode 100644 index bb208f2a..00000000 --- a/butterfree/migrations/metastore_migration.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Metastore Migration entity.""" - -from typing import Any, Dict, List - -from butterfree.migrations import DatabaseMigration - - -class MetastoreMigration(DatabaseMigration): - """Metastore class for Migrations.""" - - def create_query( - self, - fs_schema: List[Dict[str, Any]], - db_schema: List[Dict[str, Any]], - table_name: str, - ) -> Any: - """Create a query regarding Metastore. - - Returns: - Schema object. - - """ - pass diff --git a/butterfree/migrations/migrate.py b/butterfree/migrations/migrate.py new file mode 100644 index 00000000..f128dee1 --- /dev/null +++ b/butterfree/migrations/migrate.py @@ -0,0 +1,41 @@ +"""Holds the Migrator Class.""" + +from typing import Callable, List, Tuple + +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet + + +class Migrate: + """Execute migration operations in a Database based on pipeline Writer. + + Attributes: + pipelines: list of Feature Set Pipelines to use to migration. + + """ + + def __init__(self, pipelines: List[FeatureSetPipeline]) -> None: + self.pipelines = pipelines + + def _parse_feature_set_pipeline( + self, pipeline: FeatureSetPipeline + ) -> List[Tuple[Callable, FeatureSet]]: + feature_set = pipeline.feature_set + migrations = [ + writer.db_config._migration_class for writer in pipeline.sink.writers + ] + + return [(migrate, feature_set) for migrate in migrations] + + def _send_logs_to_s3(self) -> None: + """Send all migration logs to S3.""" + pass + + def migration(self) -> None: + """Construct and apply the migrations.""" + migration_list = [ + self._parse_feature_set_pipeline(pipeline) for pipeline in self.pipelines + ] + + for migration, fs in migration_list: + migration.run(fs) diff --git a/butterfree/migrations/migration.py b/butterfree/migrations/migration.py deleted file mode 100644 index c53945bf..00000000 --- a/butterfree/migrations/migration.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Migration entity.""" - -from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, List - -from butterfree.pipelines import FeatureSetPipeline - - -class DatabaseMigration(ABC): - """Abstract base class for Migrations.""" - - @abstractmethod - def create_query( - self, - fs_schema: List[Dict[str, Any]], - db_schema: List[Dict[str, Any]], - table_name: str, - ) -> Any: - """Create a query regarding a data source. - - Returns: - The desired query for the given database. - - """ - - def _validate_schema( - self, fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]] - ) -> Any: - """Provides schema validation for feature sets. - - Compares the schema of your local feature set to the - corresponding table in a given database. - - Args: - fs_schema: object that contains feature set's schemas. - db_schema: object that contains the table og a given db schema. - - """ - - def _get_schema(self, db_client: Callable, table_name: str) -> List[Dict[str, Any]]: - """Get a table schema in the respective database. - - Returns: - Schema object. 
- """ - pass - - def _apply_migration(self, query: str, db_client: Callable) -> None: - """Apply the migration in the respective database.""" - - def _send_logs_to_s3(self) -> None: - """Send all migration logs to S3.""" - pass - - def run(self, pipelines: List[FeatureSetPipeline]) -> None: - """Runs the migrations. - - Args: - pipelines: the feature set pipelines. - - """ - pass diff --git a/setup.cfg b/setup.cfg index 7b1c62bd..255fff84 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ docstring-convention = google max-line-length = 88 max-complexity = 12 -ignore = W503, E203, D203, D401, D107, S101 +ignore = W503, E203, D203, D401, D107, S101, D105 exclude = dist/*,build/*,.pytest_cache/*,.git/*,pip/* per-file-ignores = # We will not check for docstrings or the use of asserts in tests diff --git a/tests/unit/butterfree/migrations/__init__.py b/tests/unit/butterfree/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/migrations/database_migration/__init__.py b/tests/unit/butterfree/migrations/database_migration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py new file mode 100644 index 00000000..f737c4dc --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -0,0 +1,20 @@ +from pyspark.sql.types import DoubleType, LongType, TimestampType +from pytest import fixture + + +@fixture +def db_schema(): + return [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + { + "column_name": "feature1__avg_over_2_days_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + ] diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py new file mode 100644 index 00000000..aa272317 --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -0,0 +1,56 @@ +from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType + +from butterfree.migrations.database_migration import CassandraMigration, Diff + + +class TestDatabaseMigration: + def test__get_diff_empty(self, mocker, db_schema): + fs_schema = [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + { + "column_name": "feature1__avg_over_2_days_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + ] + m = CassandraMigration() + m._client = mocker.stub("client") + diff = m._get_diff(fs_schema, db_schema) + assert not diff + + def test__get_diff(self, mocker, db_schema): + fs_schema = [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, + {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": FloatType(), + "primary_key": False, + }, + ] + expected_diff = { + Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=True), 
+ Diff("new_feature", kind=Diff.Kind.ADD, value=None), + Diff( + "feature1__avg_over_2_days_rolling_windows", + kind=Diff.Kind.DROP, + value=None, + ), + Diff( + "feature1__avg_over_1_week_rolling_windows", + kind=Diff.Kind.ALTER_TYPE, + value=FloatType(), + ), + } + + m = CassandraMigration() + m._client = mocker.stub("client") + diff = m._get_diff(fs_schema, db_schema) + assert diff == expected_diff From 9afc39c242403510501a1be5d29b63984b13c950 Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Mon, 15 Mar 2021 09:43:29 -0300 Subject: [PATCH 15/86] [MLOP-640] Create CLI with migrate command (#298) --- butterfree/_cli/__init__.py | 10 ++ butterfree/_cli/main.py | 9 ++ butterfree/_cli/migrate.py | 104 ++++++++++++++++++ requirements.txt | 2 + setup.py | 2 + tests/mocks/__init__.py | 0 tests/mocks/entities/__init__.py | 0 tests/mocks/entities/first/__init__.py | 3 + tests/mocks/entities/first/first_pipeline.py | 43 ++++++++ tests/mocks/entities/second/__init__.py | 0 .../mocks/entities/second/deeper/__init__.py | 3 + .../entities/second/deeper/second_pipeline.py | 45 ++++++++ tests/unit/butterfree/_cli/__init__.py | 0 tests/unit/butterfree/_cli/test_migrate.py | 8 ++ 14 files changed, 229 insertions(+) create mode 100644 butterfree/_cli/__init__.py create mode 100644 butterfree/_cli/main.py create mode 100644 butterfree/_cli/migrate.py create mode 100644 tests/mocks/__init__.py create mode 100644 tests/mocks/entities/__init__.py create mode 100644 tests/mocks/entities/first/__init__.py create mode 100644 tests/mocks/entities/first/first_pipeline.py create mode 100644 tests/mocks/entities/second/__init__.py create mode 100644 tests/mocks/entities/second/deeper/__init__.py create mode 100644 tests/mocks/entities/second/deeper/second_pipeline.py create mode 100644 tests/unit/butterfree/_cli/__init__.py create mode 100644 tests/unit/butterfree/_cli/test_migrate.py diff --git a/butterfree/_cli/__init__.py b/butterfree/_cli/__init__.py new file mode 100644 index 00000000..ec8a1792 --- /dev/null +++ b/butterfree/_cli/__init__.py @@ -0,0 +1,10 @@ +import logging + + +def __logger(name: str) -> logging.Logger: + format_ = "%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >" + logging.basicConfig(format=format_, level=logging.INFO) + return logging.getLogger(name) + + +cli_logger = __logger("butterfree") diff --git a/butterfree/_cli/main.py b/butterfree/_cli/main.py new file mode 100644 index 00000000..e340bc1b --- /dev/null +++ b/butterfree/_cli/main.py @@ -0,0 +1,9 @@ +import typer + +from butterfree._cli import migrate + +app = typer.Typer() +app.add_typer(migrate.app) + +if __name__ == "__main__": + app() diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py new file mode 100644 index 00000000..ee083f73 --- /dev/null +++ b/butterfree/_cli/migrate.py @@ -0,0 +1,104 @@ +import importlib +import inspect +import pkgutil +import sys +from typing import Set + +import setuptools +import typer + +from butterfree._cli import cli_logger +from butterfree.pipelines import FeatureSetPipeline + +app = typer.Typer() + + +def __find_modules(path: str) -> Set[str]: + modules = set() + for pkg in setuptools.find_packages(path): + modules.add(pkg) + pkg_path = path + "/" + pkg.replace(".", "/") + + # different usage for older python3 versions + if sys.version_info.minor < 6: + for _, name, is_pkg in pkgutil.iter_modules([pkg_path]): + if not is_pkg: + modules.add(pkg + "." 
+ name) + else: + for info in pkgutil.iter_modules([pkg_path]): + if not info.ispkg: + modules.add(pkg + "." + info.name) + return modules + + +def __fs_objects(path: str) -> Set[FeatureSetPipeline]: + cli_logger.info(f"Looking for python modules under {path}...") + modules = __find_modules(path) + if not modules: + return set() + + cli_logger.info(f"Importing modules...") + package = ".".join(path.strip("/").split("/")) + imported = set( + importlib.import_module(f".{name}", package=package) for name in modules + ) + + cli_logger.info(f"Scanning modules...") + content = { + module: set( + filter( + lambda x: not x.startswith("__"), # filter "__any__" attributes + set(item for item in dir(module)), + ) + ) + for module in imported + } + + instances = set() + for module, items in content.items(): + for item in items: + value = getattr(module, item) + if not value: + continue + + # filtering non-classes + if not inspect.isclass(value): + continue + + # filtering abstractions + if inspect.isabstract(value): + continue + + # filtering classes that doesn't inherit from FeatureSetPipeline + if not issubclass(value, FeatureSetPipeline): + continue + + # filtering FeatureSetPipeline itself + if value == FeatureSetPipeline: + continue + + instances.add(value) + + cli_logger.info("Creating instances...") + return set(value() for value in instances) + + +PATH = typer.Argument( + ..., help="Full or relative path to where feature set pipelines are being defined.", +) + + +@app.callback() +def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: + """Scan and run database migrations for feature set pipelines defined under PATH. + + Butterfree will scan a given path for classes that inherit from its + FeatureSetPipeline and create dry instances of it to extract schema and writer + information. By doing this, Butterfree can compare all defined feature set schemas + to their current state on each sink being used. + + All pipelines must be under python modules inside path, so we can dynamically + import and instantiate them. 
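An illustrative invocation, mirroring the unit test shipped with this patch; it assumes the repository's test environment, since collecting pipelines instantiates their readers and writers.

```python
# Mirrors tests/unit/butterfree/_cli/test_migrate.py from this patch: the
# callback scans a path and returns the FeatureSetPipeline instances it found.
from butterfree._cli import migrate

pipelines = migrate.migrate("tests/mocks/entities/")
print(sorted(p.feature_set.name for p in pipelines))  # ['first', 'second']
```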
+ """ + # TODO call the Migration actor with all feature set pipeline objects + return __fs_objects(path) diff --git a/requirements.txt b/requirements.txt index e55289f4..bac7f2c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ mdutils>=1.2.2,<2.0 pandas>=0.24,<1.1 parameters-validation>=1.1.5,<2.0 pyspark==3.* +typer>=0.3,<0.4 +setuptools>=41,<42 \ No newline at end of file diff --git a/setup.py b/setup.py index 7f65117a..d211098c 100644 --- a/setup.py +++ b/setup.py @@ -36,4 +36,6 @@ install_requires=requirements, extras_require={"h3": ["cmake==3.16.3", "h3==3.4.2"]}, python_requires=">=3.7, <4", + entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, + include_package_data=True, ) diff --git a/tests/mocks/__init__.py b/tests/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/mocks/entities/__init__.py b/tests/mocks/entities/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/mocks/entities/first/__init__.py b/tests/mocks/entities/first/__init__.py new file mode 100644 index 00000000..e69592de --- /dev/null +++ b/tests/mocks/entities/first/__init__.py @@ -0,0 +1,3 @@ +from .first_pipeline import FirstPipeline + +__all__ = ["FirstPipeline"] diff --git a/tests/mocks/entities/first/first_pipeline.py b/tests/mocks/entities/first/first_pipeline.py new file mode 100644 index 00000000..90cfba96 --- /dev/null +++ b/tests/mocks/entities/first/first_pipeline.py @@ -0,0 +1,43 @@ +from butterfree.constants.data_type import DataType +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load import Sink +from butterfree.load.writers import ( + HistoricalFeatureStoreWriter, + OnlineFeatureStoreWriter, +) +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature + + +class FirstPipeline(FeatureSetPipeline): + def __init__(self): + super(FirstPipeline, self).__init__( + source=Source( + readers=[TableReader(id="t", database="db", table="table",)], + query=f"select * from t", # noqa + ), + feature_set=FeatureSet( + name="first", + entity="entity", + description="description", + features=[ + Feature(name="feature1", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature2", + description="another test", + dtype=DataType.STRING, + ), + ], + keys=[ + KeyFeature( + name="id", description="identifier", dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ), + sink=Sink( + writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] + ), + ) diff --git a/tests/mocks/entities/second/__init__.py b/tests/mocks/entities/second/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/mocks/entities/second/deeper/__init__.py b/tests/mocks/entities/second/deeper/__init__.py new file mode 100644 index 00000000..9f70be75 --- /dev/null +++ b/tests/mocks/entities/second/deeper/__init__.py @@ -0,0 +1,3 @@ +from .second_pipeline import SecondPipeline + +__all__ = ["SecondPipeline"] diff --git a/tests/mocks/entities/second/deeper/second_pipeline.py b/tests/mocks/entities/second/deeper/second_pipeline.py new file mode 100644 index 00000000..12c53cf3 --- /dev/null +++ b/tests/mocks/entities/second/deeper/second_pipeline.py @@ -0,0 +1,45 @@ +from butterfree.constants.data_type import DataType +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load 
import Sink +from butterfree.load.writers import ( + HistoricalFeatureStoreWriter, + OnlineFeatureStoreWriter, +) +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature + + +class SecondPipeline(FeatureSetPipeline): + def __init__(self): + super(SecondPipeline, self).__init__( + source=Source( + readers=[TableReader(id="t", database="db", table="table",)], + query=f"select * from t", # noqa + ), + feature_set=FeatureSet( + name="second", + entity="entity", + description="description", + features=[ + Feature( + name="feature1", description="test", dtype=DataType.STRING, + ), + Feature( + name="feature2", + description="another test", + dtype=DataType.FLOAT, + ), + ], + keys=[ + KeyFeature( + name="id", description="identifier", dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ), + sink=Sink( + writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] + ), + ) diff --git a/tests/unit/butterfree/_cli/__init__.py b/tests/unit/butterfree/_cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py new file mode 100644 index 00000000..6a63453f --- /dev/null +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -0,0 +1,8 @@ +from butterfree._cli import migrate +from butterfree.pipelines import FeatureSetPipeline + + +def test_migrate_success(): + all_fs = migrate.migrate("tests/mocks/entities/") + assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) + assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] From bf204f2a38afc098fd4e1cc1a96f5a58b7951164 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Mon, 15 Mar 2021 17:16:15 -0300 Subject: [PATCH 16/86] [MLOP-645] Implement query method, cassandra (#291) --- .../database_migration/cassandra_migration.py | 181 ++++++++++++++++-- .../database_migration/database_migration.py | 59 +++++- .../database_migration/metastore_migration.py | 3 +- .../migrations/database_migration/conftest.py | 16 +- .../test_cassandra_migration.py | 41 ++++ .../test_database_migration.py | 4 +- 6 files changed, 283 insertions(+), 21 deletions(-) create mode 100644 tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index c4943c8e..4141a7d5 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -1,16 +1,35 @@ """Cassandra Migration entity.""" -from typing import Any, Dict, List +import logging +from typing import Any, Dict, List, Set from butterfree.clients import CassandraClient from butterfree.configs.db import CassandraConfig from butterfree.migrations.database_migration.database_migration import ( DatabaseMigration, + Diff, ) class CassandraMigration(DatabaseMigration): - """Cassandra class for Migrations.""" + """Cassandra class for performing migrations. + + This class implements some methods of the parent DatabaseMigration class and + has specific methods for query building. + + The CassandraMigration class will be used, as the name suggests, for applying + changes to a given Cassandra table. 
There are, however, some remarks that need + to be highlighted: + - If an existing feature has its type changed, then it's extremely important to + make sure that this conversion would not result in data loss; + - If new features are added to your feature set, then they're going to be added + to the corresponding Cassandra table; + - Since feature sets can be written both to a feature set and an entity table, + we're not going to automatically drop features when using entity tables, since + it means that some features belong to a different feature set. In summary, if + data is being loaded into an entity table, then users can drop columns manually. + + """ def __init__(self) -> None: self._db_config = CassandraConfig() @@ -21,16 +40,156 @@ def __init__(self) -> None: password=self._db_config.password, ) - def create_query( - self, - table_name: str, - db_schema: List[Dict[str, Any]] = None, - diff_schema: List[Dict[str, Any]] = None, - ) -> Any: - """Create a query regarding Cassandra. + @staticmethod + def _get_parsed_columns(columns: List[Diff]) -> List[str]: + """Parse columns from a list of Diff objects. + + Args: + columns: list of Diff objects. + + Returns: + Parsed columns. + + """ + parsed_columns = [] + for col in columns: + parsed_columns.append(f"{col.column} {col.value}") + + parsed_columns = ", ".join(parsed_columns) # type: ignore + + return parsed_columns + + def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str: + """Creates CQL statement to add columns to a table. + + Args: + columns: list of Diff objects with ADD kind. + table_name: table name. Returns: - Schema object. + Alter table query. """ - pass + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} ADD ({parsed_columns});" + + def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + """Creates CQL statement to alter columns' types. + + Args: + columns: list of Diff objects with ALTER_TYPE kind. + table_name: table name. + + Returns: + Alter column type query. + + """ + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} ALTER ({parsed_columns});" + + @staticmethod + def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str: + """Creates CQL statement to create a table. + + Args: + columns: object that contains column's schemas. + table_name: table name. + + Returns: + Create table query. + + """ + parsed_columns = [] + primary_keys = [] + + for col in columns: + col_str = f"{col['column_name']} {col['type']}" + if col["primary_key"]: + primary_keys.append(col["column_name"]) + parsed_columns.append(col_str) + + joined_parsed_columns = ", ".join(parsed_columns) + + if len(primary_keys) > 0: + joined_primary_keys = ", ".join(primary_keys) + columns_str = ( + f"{joined_parsed_columns}, PRIMARY KEY ({joined_primary_keys})" + ) + else: + columns_str = joined_parsed_columns + + keyspace = CassandraConfig().keyspace + + return f"CREATE TABLE {keyspace}.{table_name} " f"({columns_str});" + + def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: + """Creates CQL statement to drop columns from a table. + + Args: + columns: list of Diff objects with DROP kind. + table_name: table name. + + Returns: + Drop columns from a given table query. 
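A hedged illustration of the statements these builders produce; the values below are made up, and instantiating the class assumes `CassandraConfig` can read its connection settings from the environment.

```python
# Editor's sketch, not patch code: the CQL strings built by the helpers above.
from butterfree.migrations.database_migration import CassandraMigration, Diff

migration = CassandraMigration()

add = [Diff("new_feature", kind=Diff.Kind.ADD, value="float")]
print(migration._get_alter_table_add_query(add, "my_feature_set"))
# ALTER TABLE my_feature_set ADD (new_feature float);

alter = [Diff("feature1", kind=Diff.Kind.ALTER_TYPE, value="double")]
print(migration._get_alter_column_type_query(alter, "my_feature_set"))
# ALTER TABLE my_feature_set ALTER (feature1 double);
```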
+ + """ + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} DROP ({parsed_columns});" + + def _get_queries( + self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None + ) -> List[str]: + """Create the desired queries for migration. + + Args: + schema_diff: list of Diff objects. + table_name: table name. + + Returns: + List of queries. + + """ + add_items = [] + drop_items = [] + alter_type_items = [] + alter_key_items = [] + + for diff in schema_diff: + if diff.kind == Diff.Kind.ADD: + add_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_TYPE: + alter_type_items.append(diff) + elif diff.kind == Diff.Kind.DROP: + drop_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_KEY: + alter_key_items.append(diff) + + queries = [] + if add_items: + alter_table_add_query = self._get_alter_table_add_query( + add_items, table_name + ) + queries.append(alter_table_add_query) + if drop_items: + if write_on_entity: + logging.info( + "Features will not be dropped automatically " + "when data is loaded to an entity table" + ) + else: + drop_columns_query = self._get_alter_table_drop_query( + drop_items, table_name + ) + queries.append(drop_columns_query) + if alter_type_items: + alter_column_types_query = self._get_alter_column_type_query( + alter_type_items, table_name + ) + queries.append(alter_column_types_query) + if alter_key_items: + logging.info("This operation is not supported by Cassandra DB.") + + return queries diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 032a7acc..a2106f3c 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -15,9 +15,9 @@ class Kind(Enum): """Mapping actions to take given a difference between columns of a schema.""" ADD = auto() - DROP = auto() - ALTER_TYPE = auto() ALTER_KEY = auto() + ALTER_TYPE = auto() + DROP = auto() column: str kind: Kind @@ -40,18 +40,56 @@ class DatabaseMigration(ABC): """Abstract base class for Migrations.""" @abstractmethod + def _get_create_table_query( + self, columns: List[Dict[str, Any]], table_name: str + ) -> Any: + """Creates desired statement to create a table. + + Args: + columns: object that contains column's schemas. + table_name: table name. + + Returns: + Create table query. + + """ + pass + + @abstractmethod + def _get_queries( + self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None + ) -> Any: + """Create the desired queries for migration. + + Args: + schema_diff: list of Diff objects. + table_name: table name. + + Returns: + List of queries. + + """ + pass + def create_query( self, + fs_schema: List[Dict[str, Any]], table_name: str, db_schema: List[Dict[str, Any]] = None, - diff_schema: List[Dict[str, Any]] = None, + write_on_entity: bool = None, ) -> Any: """Create a query regarding a data source. Returns: - The desired query for the given database. + The desired queries for the given database. """ + if not db_schema: + return [self._get_create_table_query(fs_schema, table_name)] + + schema_diff = self._get_diff(fs_schema, db_schema) + + return self._get_queries(schema_diff, table_name, write_on_entity) def _apply_migration(self, feature_set: FeatureSet) -> None: """Apply the migration in the respective database.""" @@ -67,6 +105,9 @@ def _get_diff( fs_schema: object that contains feature set's schemas. 
db_schema: object that contains the table of a given db schema. + Returns: + Object with schema differences. + """ db_columns = set(item.get("column_name") for item in db_schema) fs_columns = set(item.get("column_name") for item in fs_schema) @@ -78,9 +119,14 @@ def _get_diff( # Dict[str, Any] where each key would be the column name itself... # but changing that could break things so: # TODO version 2 change get schema to return a dict(columns, properties) + add_type_columns = dict() alter_type_columns = dict() alter_key_columns = dict() for fs_item in fs_schema: + if fs_item.get("column_name") in add_columns: + add_type_columns.update( + {fs_item.get("column_name"): fs_item.get("type")} + ) for db_item in db_schema: if fs_item.get("column_name") == db_item.get("column_name"): if fs_item.get("type") != db_item.get("type"): @@ -94,7 +140,8 @@ def _get_diff( break schema_diff = set( - Diff(str(col), kind=Diff.Kind.ADD, value=None) for col in add_columns + Diff(str(col), kind=Diff.Kind.ADD, value=value) + for col, value in add_type_columns.items() ) schema_diff |= set( Diff(str(col), kind=Diff.Kind.DROP, value=None) for col in drop_columns @@ -104,7 +151,7 @@ def _get_diff( for col, value in alter_type_columns.items() ) schema_diff |= set( - Diff(str(col), kind=Diff.Kind.ALTER_KEY, value=value) + Diff(str(col), kind=Diff.Kind.ALTER_KEY, value=None) for col, value in alter_key_columns.items() ) return schema_diff diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index ae0dd182..4f51ddf2 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -18,9 +18,10 @@ def __init__(self) -> None: def create_query( self, + fs_schema: List[Dict[str, Any]], table_name: str, db_schema: List[Dict[str, Any]] = None, - diff_schema: List[Dict[str, Any]] = None, + write_on_entity: bool = None, ) -> Any: """Create a query regarding Metastore. 
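Before the tests, a short sketch of the `create_query` dispatch added to `database_migration.py` above: with no `db_schema` a single CREATE TABLE statement is returned, otherwise the schema diff drives ALTER statements. Type names are simplified to plain strings here, and the keyspace is whatever `CassandraConfig` resolves from the environment.

```python
# Editor's sketch (simplified string types): create_query with no db_schema
# builds a CREATE TABLE statement; with a db_schema it returns ALTER statements.
from butterfree.migrations.database_migration import CassandraMigration

fs_schema = [
    {"column_name": "id", "type": "bigint", "primary_key": True},
    {"column_name": "feature1", "type": "float", "primary_key": False},
]

queries = CassandraMigration().create_query(fs_schema, "my_feature_set")
# -> ['CREATE TABLE <keyspace>.my_feature_set '
#     '(id bigint, feature1 float, PRIMARY KEY (id));']
print(queries)
```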
diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index f737c4dc..bcf7f7f3 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -1,4 +1,4 @@ -from pyspark.sql.types import DoubleType, LongType, TimestampType +from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType from pytest import fixture @@ -18,3 +18,17 @@ def db_schema(): "primary_key": False, }, ] + + +@fixture +def fs_schema(): + return [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, + {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": FloatType(), + "primary_key": False, + }, + ] diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py new file mode 100644 index 00000000..8f16a1d2 --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -0,0 +1,41 @@ +from butterfree.migrations.database_migration import CassandraMigration + + +class TestCassandraMigration: + def test_queries(self, fs_schema, db_schema): + cassandra_migration = CassandraMigration() + expected_query = [ + "ALTER TABLE table_name ADD (new_feature FloatType);", + "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", + "ALTER TABLE table_name ALTER " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) + + assert query, expected_query + + def test_queries_on_entity(self, fs_schema, db_schema): + cassandra_migration = CassandraMigration() + expected_query = [ + "ALTER TABLE table_name ADD (new_feature FloatType);", + "ALTER TABLE table_name ALTER " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + query = cassandra_migration.create_query( + fs_schema, "table_name", db_schema, True + ) + + assert query, expected_query + + def test_create_table_query(self, fs_schema): + + cassandra_migration = CassandraMigration() + expected_query = [ + "CREATE TABLE test.table_name " + "(id LongType, timestamp TimestampType, new_feature FloatType, " + "feature1__avg_over_1_week_rolling_windows FloatType, " + "PRIMARY KEY (id, timestamp));" + ] + query = cassandra_migration.create_query(fs_schema, "table_name") + + assert query, expected_query diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py index aa272317..30277992 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -36,8 +36,8 @@ def test__get_diff(self, mocker, db_schema): }, ] expected_diff = { - Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=True), - Diff("new_feature", kind=Diff.Kind.ADD, value=None), + Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=None), + Diff("new_feature", kind=Diff.Kind.ADD, value=FloatType()), Diff( "feature1__avg_over_2_days_rolling_windows", kind=Diff.Kind.DROP, From b518dbcc132b3833a8eb1bde5bdc202687bf1db7 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade 
<45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Tue, 16 Mar 2021 14:49:58 -0300 Subject: [PATCH 17/86] [MLOP-671] Implement get_schema on Spark client (#301) --- butterfree/clients/spark_client.py | 66 ++++++++++++++++++- .../butterfree/clients/test_spark_client.py | 33 ++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 0f0113e2..09a1bcd9 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -1,5 +1,6 @@ """SparkClient entity.""" +import json from typing import Any, Dict, List, Optional, Union from pyspark.sql import DataFrame, DataFrameReader, SparkSession @@ -216,7 +217,8 @@ def write_table( **options, ) - def create_temporary_view(self, dataframe: DataFrame, name: str) -> Any: + @staticmethod + def create_temporary_view(dataframe: DataFrame, name: str) -> Any: """Create a temporary view from a given dataframe. Args: @@ -271,3 +273,65 @@ def add_table_partitions( ) self.conn.sql(command) + + @staticmethod + def _filter_schema(schema: DataFrame) -> List[str]: + """Returns filtered schema with the desired information. + + Attributes: + schema: desired table. + + Returns: + A list of strings in the format + ['{"column_name": "example1", type: "Spark_type"}', ...] + + """ + return ( + schema.filter( + ~schema.col_name.isin( + ["# Partition Information", "# col_name", "year", "month", "day"] + ) + ) + .toJSON() + .collect() + ) + + def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]: + """Returns schema with the desired information. + + Attributes: + schema: desired table. + + Returns: + A list of dictionaries in the format + [{"column_name": "example1", type: "Spark_type"}, ...] + + """ + schema_list = self._filter_schema(schema) + converted_schema = [] + for row in schema_list: + converted_schema.append(json.loads(row)) + + return converted_schema + + def get_schema(self, table: str, database: str) -> List[Dict[str, str]]: + """Returns desired table schema. + + Attributes: + table: desired table. + + Returns: + A list of dictionaries in the format + [{"column_name": "example1", type: "Spark_type"}, ...] 
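A usage sketch mirroring the unit test added with this patch; the table must already exist in the metastore, and the dictionary keys follow Spark's DESCRIBE output.

```python
# Mirrors the get_schema unit test from this patch.
from butterfree.clients import SparkClient

schema = SparkClient().get_schema(table="test_table", database="test_db")
# e.g. [{"col_name": "col1", "data_type": "string"},
#       {"col_name": "col2", "data_type": "bigint"}]
print(schema)
```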
+ + """ + query = f"DESCRIBE {database}.{table} " # noqa + + response = self.sql(query) + + if not response: + raise RuntimeError( + f"No columns found for table: {table}" f"in database: {database}" + ) + + return self._convert_schema(response) diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index 9f641506..dc40841c 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -15,6 +15,15 @@ def create_temp_view(dataframe: DataFrame, name: str) -> None: dataframe.createOrReplaceTempView(name) +def create_db_and_table(spark, database, table, view): + spark.sql(f"create database if not exists {database}") + spark.sql(f"use {database}") + spark.sql( + f"create table if not exists {database}.{table} " # noqa + f"as select * from {view}" # noqa + ) + + class TestSparkClient: def test_conn(self) -> None: # arrange @@ -293,3 +302,27 @@ def test_add_invalid_partitions(self, mock_spark_sql: Mock, partition): # act and assert with pytest.raises(ValueError): spark_client.add_table_partitions(partition, "table", "db") + + def test_get_schema( + self, target_df: DataFrame, spark_session: SparkSession + ) -> None: + # arrange + spark_client = SparkClient() + create_temp_view(dataframe=target_df, name="temp_view") + create_db_and_table( + spark=spark_session, + database="test_db", + table="test_table", + view="temp_view", + ) + + expected_schema = [ + {"col_name": "col1", "data_type": "string"}, + {"col_name": "col2", "data_type": "bigint"}, + ] + + # act + schema = spark_client.get_schema(table="test_table", database="test_db") + + # assert + assert schema, expected_schema From 5fe4c40777c6258cfe7361552179f89f46e510c3 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Tue, 16 Mar 2021 16:49:20 -0300 Subject: [PATCH 18/86] [MLOP-648] Implement query method, metastore (#294) --- butterfree/constants/migrations.py | 8 ++ .../database_migration/cassandra_migration.py | 58 +------- .../database_migration/database_migration.py | 84 ++++++++++- .../database_migration/metastore_migration.py | 131 +++++++++++++++--- .../test_metastore_migration.py | 50 +++++++ 5 files changed, 257 insertions(+), 74 deletions(-) create mode 100644 butterfree/constants/migrations.py create mode 100644 tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py diff --git a/butterfree/constants/migrations.py b/butterfree/constants/migrations.py new file mode 100644 index 00000000..b1c0947d --- /dev/null +++ b/butterfree/constants/migrations.py @@ -0,0 +1,8 @@ +"""Migrations' Constants.""" +from butterfree.constants import columns + +PARTITION_BY = [ + {"column_name": columns.PARTITION_YEAR, "type": "INT"}, + {"column_name": columns.PARTITION_MONTH, "type": "INT"}, + {"column_name": columns.PARTITION_DAY, "type": "INT"}, +] diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index 4141a7d5..3d26673f 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -1,7 +1,6 @@ """Cassandra Migration entity.""" -import logging -from typing import Any, Dict, List, Set +from typing import Any, Dict, List from butterfree.clients import CassandraClient from butterfree.configs.db import CassandraConfig @@ -138,58 +137,3 @@ def 
_get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> s parsed_columns = self._get_parsed_columns(columns) return f"ALTER TABLE {table_name} DROP ({parsed_columns});" - - def _get_queries( - self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None - ) -> List[str]: - """Create the desired queries for migration. - - Args: - schema_diff: list of Diff objects. - table_name: table name. - - Returns: - List of queries. - - """ - add_items = [] - drop_items = [] - alter_type_items = [] - alter_key_items = [] - - for diff in schema_diff: - if diff.kind == Diff.Kind.ADD: - add_items.append(diff) - elif diff.kind == Diff.Kind.ALTER_TYPE: - alter_type_items.append(diff) - elif diff.kind == Diff.Kind.DROP: - drop_items.append(diff) - elif diff.kind == Diff.Kind.ALTER_KEY: - alter_key_items.append(diff) - - queries = [] - if add_items: - alter_table_add_query = self._get_alter_table_add_query( - add_items, table_name - ) - queries.append(alter_table_add_query) - if drop_items: - if write_on_entity: - logging.info( - "Features will not be dropped automatically " - "when data is loaded to an entity table" - ) - else: - drop_columns_query = self._get_alter_table_drop_query( - drop_items, table_name - ) - queries.append(drop_columns_query) - if alter_type_items: - alter_column_types_query = self._get_alter_column_type_query( - alter_type_items, table_name - ) - queries.append(alter_column_types_query) - if alter_key_items: - logging.info("This operation is not supported by Cassandra DB.") - - return queries diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index a2106f3c..160f6728 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,4 +1,5 @@ """Migration entity.""" +import logging from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto @@ -56,6 +57,47 @@ def _get_create_table_query( pass @abstractmethod + def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str: + """Creates desired statement to add columns to a table. + + Args: + columns: list of Diff objects with ADD kind. + table_name: table name. + + Returns: + Alter table query. + + """ + pass + + @abstractmethod + def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: + """Creates desired statement to drop columns from a table. + + Args: + columns: list of Diff objects with DROP kind. + table_name: table name. + + Returns: + Drop columns from a given table query. + + """ + pass + + @abstractmethod + def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + """Creates desired statement to alter columns' types. + + Args: + columns: list of Diff objects with ALTER_TYPE kind. + table_name: table name. + + Returns: + Alter column type query. + + """ + pass + def _get_queries( self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None ) -> Any: @@ -69,7 +111,47 @@ def _get_queries( List of queries. 
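One behaviour worth calling out, sketched against the routing implemented just after this docstring: DROP diffs are only logged, not executed, when `write_on_entity` is set. The example assumes Cassandra connection settings resolve from the environment.

```python
# Editor's sketch of the write_on_entity guard: dropping columns is skipped
# (only logged) when data is being loaded into an entity table.
from butterfree.migrations.database_migration import CassandraMigration, Diff

schema_diff = {Diff("stale_feature", kind=Diff.Kind.DROP, value=None)}
migration = CassandraMigration()

print(migration._get_queries(schema_diff, "entity_table", write_on_entity=True))
# [] (the drop is only logged)
print(migration._get_queries(schema_diff, "entity_table"))
# ['ALTER TABLE entity_table DROP (stale_feature None);']
```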
""" - pass + add_items = [] + drop_items = [] + alter_type_items = [] + alter_key_items = [] + + for diff in schema_diff: + if diff.kind == Diff.Kind.ADD: + add_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_TYPE: + alter_type_items.append(diff) + elif diff.kind == Diff.Kind.DROP: + drop_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_KEY: + alter_key_items.append(diff) + + queries = [] + if add_items: + alter_table_add_query = self._get_alter_table_add_query( + add_items, table_name + ) + queries.append(alter_table_add_query) + if drop_items: + if write_on_entity: + logging.info( + "Features will not be dropped automatically " + "when data is loaded to an entity table" + ) + else: + drop_columns_query = self._get_alter_table_drop_query( + drop_items, table_name + ) + queries.append(drop_columns_query) + if alter_type_items: + alter_column_types_query = self._get_alter_column_type_query( + alter_type_items, table_name + ) + queries.append(alter_column_types_query) + if alter_key_items: + logging.info("This operation is not supported by Spark.") + + return queries def create_query( self, diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 4f51ddf2..89017374 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -2,31 +2,130 @@ from typing import Any, Dict, List -from butterfree.clients import SparkClient -from butterfree.configs.db import MetastoreConfig +from butterfree.configs import environment +from butterfree.constants.migrations import PARTITION_BY from butterfree.migrations.database_migration.database_migration import ( DatabaseMigration, + Diff, ) class MetastoreMigration(DatabaseMigration): - """Metastore class for Migrations.""" + """MetastoreMigration class for performing migrations. - def __init__(self) -> None: - self._db_config = MetastoreConfig() - self._client = SparkClient() + This class implements some methods of the parent DatabaseMigration class and + has specific methods for query building. + The MetastoreMigration class will be used, as the name suggests, for applying + changes to a given Metastore table. There are, however, some remarks that need + to be highlighted: + - If an existing feature has its type changed, then it's extremely important to + make sure that this conversion would not result in data loss; + - If new features are added to your feature set, then they're going to be added + to the corresponding Metastore table; + - Since feature sets can be written both to a feature set and an entity table, + we're not going to automatically drop features when using entity tables, since + it means that some features belong to a different feature set. In summary, if + data is being loaded into an entity table, then users can drop columns manually. + """ - def create_query( - self, - fs_schema: List[Dict[str, Any]], - table_name: str, - db_schema: List[Dict[str, Any]] = None, - write_on_entity: bool = None, - ) -> Any: - """Create a query regarding Metastore. + def __init__( + self, database: str = None, + ): + self.database = database or environment.get_variable( + "FEATURE_STORE_HISTORICAL_DATABASE" + ) + + @staticmethod + def _get_parsed_columns(columns: List[Diff]) -> List[str]: + """Parse columns from a list of Diff objects. + + Args: + columns: list of Diff objects. + + Returns: + Parsed columns. 
+ + """ + parsed_columns = [] + for col in columns: + parsed_columns.append(f"{col.column} {col.value}") + + parsed_columns = ", ".join(parsed_columns) # type: ignore + + return parsed_columns + + def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str: + """Creates SQL statement to add columns to a table. + + Args: + columns: list of Diff objects with ADD kind. + table_name: table name. + + Returns: + Alter table query. + + """ + parsed_columns = self._get_parsed_columns(columns) + + return ( + f"ALTER TABLE {self.database}.{table_name} " + f"ADD IF NOT EXISTS columns ({parsed_columns});" + ) + + def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + """Creates SQL statement to alter columns' types. + + Args: + columns: list of Diff objects with ALTER_TYPE kind. + table_name: table name. Returns: - Schema object. + Alter column type query. """ - pass + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} ALTER COLUMN ({parsed_columns});" + + def _get_create_table_query( + self, columns: List[Dict[str, Any]], table_name: str + ) -> str: + """Creates SQL statement to create a table. + + Args: + columns: object that contains column's schemas. + table_name: table name. + + Returns: + Create table query. + + """ + columns.extend(PARTITION_BY) + + parsed_columns = [] + for col in columns: + parsed_columns.append(f"{col['column_name']} {col['type']}") + parsed_columns = ", ".join(parsed_columns) # type: ignore + + return ( + f"CREATE TABLE IF NOT EXISTS " + f"{self.database}.{table_name} ({parsed_columns}) " + f"PARTITIONED BY ({PARTITION_BY[0]['column_name']}, " + f"{PARTITION_BY[1]['column_name']}, " + f"{PARTITION_BY[2]['column_name']});" + ) + + def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: + """Creates SQL statement to drop columns from a table. + + Args: + columns: list of Diff objects with DROP kind. + table_name: table name. + + Returns: + Drop columns from a given table query. 
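An illustration, as a sketch only, of the Hive-style DDL built by `_get_create_table_query` above; passing `database` explicitly avoids the environment lookup, and the single statement is wrapped here for readability.

```python
# Editor's sketch, not patch code: Metastore CREATE TABLE with the partition
# columns appended from butterfree.constants.migrations.PARTITION_BY.
from butterfree.migrations.database_migration import MetastoreMigration

migration = MetastoreMigration(database="feature_store")
schema = [
    {"column_name": "id", "type": "BIGINT"},
    {"column_name": "feature1", "type": "FLOAT"},
]
print(migration._get_create_table_query(schema, "my_feature_set"))
# CREATE TABLE IF NOT EXISTS feature_store.my_feature_set
#   (id BIGINT, feature1 FLOAT, year INT, month INT, day INT)
#   PARTITIONED BY (year, month, day);
```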
+ + """ + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} DROP IF EXISTS ({parsed_columns});" diff --git a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py new file mode 100644 index 00000000..fd1dfad8 --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py @@ -0,0 +1,50 @@ +from butterfree.migrations.database_migration import MetastoreMigration + + +class TestMetastoreMigration: + def test_queries(self, fs_schema, db_schema): + metastore_migration = MetastoreMigration() + + expected_query = [ + "ALTER TABLE test.table_name ADD IF NOT EXISTS " + "columns (new_feature FloatType);", + "ALTER TABLE table_name DROP IF EXISTS " + "(feature1__avg_over_2_days_rolling_windows None);", + "ALTER TABLE table_name ALTER COLUMN " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + + query = metastore_migration.create_query(fs_schema, "table_name", db_schema) + + assert query, expected_query + + def test_queries_on_entity(self, fs_schema, db_schema): + metastore_migration = MetastoreMigration() + + expected_query = [ + "ALTER TABLE test.table_name ADD IF NOT EXISTS " + "columns (new_feature FloatType);", + "ALTER TABLE table_name ALTER COLUMN " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + + query = metastore_migration.create_query( + fs_schema, "table_name", db_schema, True + ) + + assert query, expected_query + + def test_create_table_query(self, fs_schema): + + metastore_migration = MetastoreMigration() + + expected_query = [ + "CREATE TABLE IF NOT EXISTS test.table_name " + "(id LongType, timestamp TimestampType, new_feature FloatType, " + "feature1__avg_over_1_week_rolling_windows FloatType, year INT, " + "month INT, day INT) PARTITIONED BY (year, month, day);" + ] + + query = metastore_migration.create_query(fs_schema, "table_name") + + assert query, expected_query From e8fc0dadd6f17c64b2f77c2c3311b7e294012436 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Mon, 22 Mar 2021 14:20:58 -0300 Subject: [PATCH 19/86] Fix Validation Step (#302) --- butterfree/configs/db/metastore_config.py | 2 +- .../historical_feature_store_writer.py | 9 +---- .../writers/online_feature_store_writer.py | 18 +-------- setup.py | 2 +- .../integration/butterfree/load/test_sink.py | 11 +----- .../pipelines/test_feature_set_pipeline.py | 22 +---------- tests/unit/butterfree/load/test_sink.py | 32 +--------------- .../test_historical_feature_store_writer.py | 22 ----------- .../test_online_feature_store_writer.py | 38 ++----------------- 9 files changed, 12 insertions(+), 144 deletions(-) diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index a3b315d5..a3013de9 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -117,4 +117,4 @@ def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List: def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Translate feature set spark schema to the corresponding database.""" - pass + return schema diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 456d9e6b..6274840c 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ 
b/butterfree/load/writers/historical_feature_store_writer.py @@ -140,14 +140,7 @@ def write( """ dataframe = self._create_partitions(dataframe) - partition_df = self._apply_transformations(dataframe) - - if self.debug_mode: - dataframe = partition_df - else: - dataframe = self.check_schema( - spark_client, partition_df, feature_set.name, self.database - ) + dataframe = self._apply_transformations(dataframe) if self.interval_mode: if self.debug_mode: diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index fade3789..310f54cc 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -7,7 +7,7 @@ from pyspark.sql.functions import col, row_number from pyspark.sql.streaming import StreamingQuery -from butterfree.clients import CassandraClient, SparkClient +from butterfree.clients import SparkClient from butterfree.configs.db import AbstractWriteConfig, CassandraConfig from butterfree.constants.columns import TIMESTAMP_COLUMN from butterfree.hooks import Hook @@ -180,22 +180,6 @@ def write( """ table_name = feature_set.entity if self.write_to_entity else feature_set.name - if not self.debug_mode: - config = ( - self.db_config - if self.db_config == CassandraConfig - else CassandraConfig() - ) - - cassandra_client = CassandraClient( - host=[config.host], - keyspace=config.keyspace, - user=config.username, - password=config.password, - ) - - dataframe = self.check_schema(cassandra_client, dataframe, table_name) - if dataframe.isStreaming: dataframe = self._apply_transformations(dataframe) if self.debug_mode: diff --git a/setup.py b/setup.py index d211098c..2dece452 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev2" +__version__ = "1.2.0.dev3" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/integration/butterfree/load/test_sink.py b/tests/integration/butterfree/load/test_sink.py index f507a335..b5f97879 100644 --- a/tests/integration/butterfree/load/test_sink.py +++ b/tests/integration/butterfree/load/test_sink.py @@ -9,7 +9,7 @@ ) -def test_sink(input_dataframe, feature_set, mocker): +def test_sink(input_dataframe, feature_set): # arrange client = SparkClient() client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") @@ -34,11 +34,6 @@ def test_sink(input_dataframe, feature_set, mocker): db_config=s3config, interval_mode=True ) - schema_dataframe = historical_writer._create_partitions(feature_set_df) - historical_writer.check_schema_hook = mocker.stub("check_schema_hook") - historical_writer.check_schema_hook.run = mocker.stub("run") - historical_writer.check_schema_hook.run.return_value = schema_dataframe - # setup online writer # TODO: Change for CassandraConfig when Cassandra for test is ready online_config = Mock() @@ -49,10 +44,6 @@ def test_sink(input_dataframe, feature_set, mocker): ) online_writer = OnlineFeatureStoreWriter(db_config=online_config) - online_writer.check_schema_hook = mocker.stub("check_schema_hook") - online_writer.check_schema_hook.run = mocker.stub("run") - online_writer.check_schema_hook.run.return_value = feature_set_df - writers = [historical_writer, online_writer] sink = Sink(writers) diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py 
b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index a302dc9e..753dfe7c 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -4,7 +4,6 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as F -from butterfree.clients import SparkClient from butterfree.configs import environment from butterfree.configs.db import MetastoreConfig from butterfree.constants import DataType @@ -75,11 +74,7 @@ def create_ymd(dataframe): class TestFeatureSetPipeline: def test_feature_set_pipeline( - self, - mocked_df, - spark_session, - fixed_windows_output_feature_set_dataframe, - mocker, + self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe, ): # arrange table_reader_id = "a_source" @@ -93,11 +88,6 @@ def test_feature_set_pipeline( table_reader_table=table_reader_table, ) - spark_client = SparkClient() - spark_client.conn.conf.set( - "spark.sql.sources.partitionOverwriteMode", "dynamic" - ) - dbconfig = Mock() dbconfig.mode = "overwrite" dbconfig.format_ = "parquet" @@ -107,12 +97,6 @@ def test_feature_set_pipeline( historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig) - historical_writer.check_schema_hook = mocker.stub("check_schema_hook") - historical_writer.check_schema_hook.run = mocker.stub("run") - historical_writer.check_schema_hook.run.return_value = ( - fixed_windows_output_feature_set_dataframe - ) - # act test_pipeline = FeatureSetPipeline( source=Source( @@ -187,7 +171,6 @@ def test_feature_set_pipeline_with_dates( spark_session, fixed_windows_output_feature_set_date_dataframe, feature_set_pipeline, - mocker, ): # arrange table_reader_table = "b_table" @@ -211,7 +194,6 @@ def test_feature_set_pipeline_with_execution_date( spark_session, fixed_windows_output_feature_set_date_dataframe, feature_set_pipeline, - mocker, ): # arrange table_reader_table = "b_table" @@ -233,7 +215,7 @@ def test_feature_set_pipeline_with_execution_date( # assert assert_dataframe_equality(df, target_df) - def test_pipeline_with_hooks(self, spark_session, mocker): + def test_pipeline_with_hooks(self, spark_session): # arrange hook1 = AddHook(value=1) diff --git a/tests/unit/butterfree/load/test_sink.py b/tests/unit/butterfree/load/test_sink.py index ef377f67..517f651e 100644 --- a/tests/unit/butterfree/load/test_sink.py +++ b/tests/unit/butterfree/load/test_sink.py @@ -120,7 +120,7 @@ def test_flush_with_writers_list_empty(self): with pytest.raises(ValueError): Sink(writers=writer) - def test_flush_streaming_df(self, feature_set, mocker): + def test_flush_streaming_df(self, feature_set): """Testing the return of the streaming handlers by the sink.""" # arrange spark_client = SparkClient() @@ -137,24 +137,10 @@ def test_flush_streaming_df(self, feature_set, mocker): online_feature_store_writer = OnlineFeatureStoreWriter() - online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") - online_feature_store_writer.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer.check_schema_hook.run.return_value = ( - mocked_stream_df - ) - online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) - online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( - "check_schema_hook" - ) - online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( - mocked_stream_df - ) - sink = 
Sink( writers=[ online_feature_store_writer, @@ -177,7 +163,7 @@ def test_flush_streaming_df(self, feature_set, mocker): assert isinstance(handler, StreamingQuery) def test_flush_with_multiple_online_writers( - self, feature_set, feature_set_dataframe, mocker + self, feature_set, feature_set_dataframe ): """Testing the flow of writing to a feature-set table and to an entity table.""" # arrange @@ -189,24 +175,10 @@ def test_flush_with_multiple_online_writers( online_feature_store_writer = OnlineFeatureStoreWriter() - online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") - online_feature_store_writer.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer.check_schema_hook.run.return_value = ( - feature_set_dataframe - ) - online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) - online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( - "check_schema_hook" - ) - online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( - feature_set_dataframe - ) - sink = Sink( writers=[online_feature_store_writer, online_feature_store_writer_on_entity] ) diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index aac806f7..8bab23ba 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -23,11 +23,6 @@ def test_write( spark_client.write_table = mocker.stub("write_table") writer = HistoricalFeatureStoreWriter() - schema_dataframe = writer._create_partitions(feature_set_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = schema_dataframe - # when writer.write( feature_set=feature_set, @@ -62,11 +57,6 @@ def test_write_interval_mode( ) writer = HistoricalFeatureStoreWriter(interval_mode=True) - schema_dataframe = writer._create_partitions(feature_set_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = schema_dataframe - # when writer.write( feature_set=feature_set, @@ -104,11 +94,6 @@ def test_write_interval_mode_invalid_partition_mode( writer = HistoricalFeatureStoreWriter(interval_mode=True) - schema_dataframe = writer._create_partitions(feature_set_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = schema_dataframe - # when with pytest.raises(RuntimeError): _ = writer.write( @@ -123,7 +108,6 @@ def test_write_in_debug_mode( historical_feature_set_dataframe, feature_set, spark_session, - mocker, ): # given spark_client = SparkClient() @@ -321,12 +305,6 @@ def test_write_with_transform( writer = HistoricalFeatureStoreWriter().with_(json_transform) - schema_dataframe = writer._create_partitions(feature_set_dataframe) - json_dataframe = writer._apply_transformations(schema_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = json_dataframe - # when writer.write( feature_set=feature_set, diff --git 
a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py index 384ec152..78f6862e 100644 --- a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py @@ -68,10 +68,6 @@ def test_write( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -104,10 +100,6 @@ def test_write_in_debug_mode( spark_client = SparkClient() writer = OnlineFeatureStoreWriter(debug_mode=True) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write( feature_set=feature_set, @@ -119,7 +111,7 @@ def test_write_in_debug_mode( # then assert_dataframe_equality(latest_feature_set_dataframe, result_df) - def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker): + def test_write_in_debug_and_stream_mode(self, feature_set, spark_session): # arrange spark_client = SparkClient() @@ -132,10 +124,6 @@ def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker writer = OnlineFeatureStoreWriter(debug_mode=True) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = mocked_stream_df - # act handler = writer.write( feature_set=feature_set, @@ -151,7 +139,7 @@ def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker assert isinstance(handler, StreamingQuery) @pytest.mark.parametrize("has_checkpoint", [True, False]) - def test_write_stream(self, feature_set, has_checkpoint, monkeypatch, mocker): + def test_write_stream(self, feature_set, has_checkpoint, monkeypatch): # arrange spark_client = SparkClient() spark_client.write_stream = Mock() @@ -174,10 +162,6 @@ def test_write_stream(self, feature_set, has_checkpoint, monkeypatch, mocker): writer = OnlineFeatureStoreWriter(cassandra_config) writer.filter_latest = Mock() - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = dataframe - # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -201,7 +185,7 @@ def test_get_db_schema(self, cassandra_config, test_feature_set, expected_schema assert schema == expected_schema - def test_write_stream_on_entity(self, feature_set, monkeypatch, mocker): + def test_write_stream_on_entity(self, feature_set, monkeypatch): """Test write method with stream dataframe and write_to_entity enabled. 
The main purpose of this test is assert the correct setup of stream checkpoint @@ -224,10 +208,6 @@ def test_write_stream_on_entity(self, feature_set, monkeypatch, mocker): writer = OnlineFeatureStoreWriter(write_to_entity=True) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = dataframe - # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -256,10 +236,6 @@ def test_write_with_transform( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config).with_(json_transform) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -293,10 +269,6 @@ def test_write_with_kafka_config( kafka_config = KafkaConfig() writer = OnlineFeatureStoreWriter(kafka_config).with_(json_transform) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -320,10 +292,6 @@ def test_write_with_custom_kafka_config( json_transform ) - custom_writer.check_schema_hook = mocker.stub("check_schema_hook") - custom_writer.check_schema_hook.run = mocker.stub("run") - custom_writer.check_schema_hook.run.return_value = feature_set_dataframe - # when custom_writer.write(feature_set, feature_set_dataframe, spark_client) From 3d93a098f110108c50a1273abc382c1ad7a8b99f Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Tue, 23 Mar 2021 14:15:57 -0300 Subject: [PATCH 20/86] [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. 
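Usage sketch (illustrative only, not code from this patch; `my_pipeline` stands in for an existing FeatureSetPipeline instance): with every writer config now exposing a `database` name, the migration for each writer is resolved through ALLOWED_DATABASE and applied to the pipeline's feature set:

    from butterfree._cli.migrate import Migrate
    from butterfree.migrations.database_migration import ALLOWED_DATABASE

    # Apply the matching migration for every writer configured in the sink.
    Migrate(pipelines={my_pipeline}).run()

    # run() boils down to this loop:
    for writer in my_pipeline.sink.writers:
        migration = ALLOWED_DATABASE[writer.db_config.database]  # "cassandra" or "metastore"
        migration.apply_migration(my_pipeline.feature_set, writer)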
Co-authored-by: Mayara Moromisato --- butterfree/_cli/migrate.py | 28 ++++++++++- butterfree/configs/db/abstract_config.py | 5 ++ butterfree/configs/db/cassandra_config.py | 5 ++ butterfree/configs/db/kafka_config.py | 5 ++ butterfree/configs/db/metastore_config.py | 5 ++ .../historical_feature_store_writer.py | 9 ++-- .../writers/online_feature_store_writer.py | 8 +-- butterfree/load/writers/writer.py | 11 ++++- butterfree/migrations/__init__.py | 3 -- .../migrations/database_migration/__init__.py | 6 +++ .../database_migration/cassandra_migration.py | 12 +++-- .../database_migration/database_migration.py | 49 ++++++++++++++++--- .../database_migration/metastore_migration.py | 8 +-- butterfree/migrations/migrate.py | 41 ---------------- tests/unit/butterfree/_cli/test_migrate.py | 23 ++++++++- .../migrations/database_migration/conftest.py | 22 +++++++++ .../test_database_migration.py | 11 +++++ 17 files changed, 179 insertions(+), 72 deletions(-) delete mode 100644 butterfree/migrations/migrate.py diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index ee083f73..10b6310b 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -8,6 +8,7 @@ import typer from butterfree._cli import cli_logger +from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline app = typer.Typer() @@ -88,6 +89,28 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: ) +class Migrate: + """Execute migration operations in a Database based on pipeline Writer. + + Attributes: + pipelines: list of Feature Set Pipelines to use to migration. + """ + + def __init__(self, pipelines: Set[FeatureSetPipeline]) -> None: + self.pipelines = pipelines + + def _send_logs_to_s3(self) -> None: + """Send all migration logs to S3.""" + pass + + def run(self) -> None: + """Construct and apply the migrations.""" + for pipeline in self.pipelines: + for writer in pipeline.sink.writers: + migration = ALLOWED_DATABASE[writer.db_config.database] + migration.apply_migration(pipeline.feature_set, writer) + + @app.callback() def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: """Scan and run database migrations for feature set pipelines defined under PATH. @@ -100,5 +123,6 @@ def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: All pipelines must be under python modules inside path, so we can dynamically import and instantiate them. 
""" - # TODO call the Migration actor with all feature set pipeline objects - return __fs_objects(path) + pipe_set = __fs_objects(path) + Migrate(pipe_set).run() + return pipe_set diff --git a/butterfree/configs/db/abstract_config.py b/butterfree/configs/db/abstract_config.py index 8e98aab6..fbd48c53 100644 --- a/butterfree/configs/db/abstract_config.py +++ b/butterfree/configs/db/abstract_config.py @@ -7,6 +7,11 @@ class AbstractWriteConfig(ABC): """Abstract class for database write configurations with spark.""" + @property + @abstractmethod + def database(self) -> str: + """Database name.""" + @property @abstractmethod def mode(self) -> Any: diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index b58a2e0a..e9329c5d 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -50,6 +50,11 @@ def __init__( self.stream_output_mode = stream_output_mode self.stream_checkpoint_path = stream_checkpoint_path + @property + def database(self) -> str: + """Database name.""" + return "cassandra" + @property def username(self) -> Optional[str]: """Username used in connection to Cassandra DB.""" diff --git a/butterfree/configs/db/kafka_config.py b/butterfree/configs/db/kafka_config.py index 67b2dc57..79cad15b 100644 --- a/butterfree/configs/db/kafka_config.py +++ b/butterfree/configs/db/kafka_config.py @@ -41,6 +41,11 @@ def __init__( self.stream_output_mode = stream_output_mode self.stream_checkpoint_path = stream_checkpoint_path + @property + def database(self) -> str: + """Database name.""" + return "kafka" + @property def kafka_topic(self) -> Optional[str]: """Kafka topic name.""" diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index a3013de9..97a999c2 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -35,6 +35,11 @@ def __init__( self.format_ = format_ self.file_system = file_system + @property + def database(self) -> str: + """Database name.""" + return "metastore" + @property def path(self) -> Optional[str]: """Bucket name.""" diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 6274840c..e3e9b9b7 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Historical Feature Store writer class.""" import os -from typing import Any, Union +from typing import Any from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import dayofmonth, month, year @@ -106,7 +106,7 @@ class HistoricalFeatureStoreWriter(Writer): def __init__( self, - db_config: Union[AbstractWriteConfig, MetastoreConfig] = None, + db_config: AbstractWriteConfig = None, database: str = None, num_partitions: int = None, validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD, @@ -114,8 +114,9 @@ def __init__( interval_mode: bool = False, check_schema_hook: Hook = None, ): - super(HistoricalFeatureStoreWriter, self).__init__(debug_mode, interval_mode) - self.db_config = db_config or MetastoreConfig() + super(HistoricalFeatureStoreWriter, self).__init__( + db_config or MetastoreConfig(), debug_mode, interval_mode + ) self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" ) diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py 
index 310f54cc..b51d9923 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -80,15 +80,15 @@ class OnlineFeatureStoreWriter(Writer): def __init__( self, - db_config: Union[AbstractWriteConfig, CassandraConfig] = None, + db_config: AbstractWriteConfig = None, debug_mode: bool = False, write_to_entity: bool = False, interval_mode: bool = False, check_schema_hook: Hook = None, ): - super(OnlineFeatureStoreWriter, self).__init__(debug_mode, interval_mode) - self.db_config = db_config or CassandraConfig() - self.write_to_entity = write_to_entity + super(OnlineFeatureStoreWriter, self).__init__( + db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity + ) self.check_schema_hook = check_schema_hook @staticmethod diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 7e0f9018..e12a4317 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -7,6 +7,7 @@ from pyspark.sql.dataframe import DataFrame from butterfree.clients import SparkClient +from butterfree.configs.db import AbstractWriteConfig from butterfree.hooks import HookableComponent from butterfree.transform import FeatureSet @@ -19,11 +20,19 @@ class Writer(ABC, HookableComponent): """ - def __init__(self, debug_mode: bool = False, interval_mode: bool = False) -> None: + def __init__( + self, + db_config: AbstractWriteConfig, + debug_mode: bool = False, + interval_mode: bool = False, + write_to_entity: bool = False, + ) -> None: super().__init__() + self.db_config = db_config self.transformations: List[Dict[str, Any]] = [] self.debug_mode = debug_mode self.interval_mode = interval_mode + self.write_to_entity = write_to_entity def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any diff --git a/butterfree/migrations/__init__.py b/butterfree/migrations/__init__.py index 39cabfb7..791f5fef 100644 --- a/butterfree/migrations/__init__.py +++ b/butterfree/migrations/__init__.py @@ -1,4 +1 @@ """Holds available migrations.""" -from butterfree.migrations.migrate import Migrate - -__all__ = ["Migrate"] diff --git a/butterfree/migrations/database_migration/__init__.py b/butterfree/migrations/database_migration/__init__.py index 7138c445..e3180088 100644 --- a/butterfree/migrations/database_migration/__init__.py +++ b/butterfree/migrations/database_migration/__init__.py @@ -9,3 +9,9 @@ ) __all__ = ["CassandraMigration", "MetastoreMigration", "Diff"] + + +ALLOWED_DATABASE = { + "cassandra": CassandraMigration(), + "metastore": MetastoreMigration(), +} diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index 3d26673f..c511479b 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -32,11 +32,13 @@ class CassandraMigration(DatabaseMigration): def __init__(self) -> None: self._db_config = CassandraConfig() - self._client = CassandraClient( - host=[self._db_config.host], - keyspace=self._db_config.keyspace, # type: ignore - user=self._db_config.username, - password=self._db_config.password, + super(CassandraMigration, self).__init__( + CassandraClient( + host=[self._db_config.host], + keyspace=self._db_config.keyspace, # type: ignore + user=self._db_config.username, + password=self._db_config.password, + ) ) @staticmethod diff --git 
a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 160f6728..fb2b9e7f 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -5,6 +5,8 @@ from enum import Enum, auto from typing import Any, Dict, List, Set +from butterfree.clients import AbstractClient +from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -40,6 +42,9 @@ def __eq__(self, other: object) -> bool: class DatabaseMigration(ABC): """Abstract base class for Migrations.""" + def __init__(self, client: AbstractClient) -> None: + self._client = client + @abstractmethod def _get_create_table_query( self, columns: List[Dict[str, Any]], table_name: str @@ -173,10 +178,6 @@ def create_query( return self._get_queries(schema_diff, table_name, write_on_entity) - def _apply_migration(self, feature_set: FeatureSet) -> None: - """Apply the migration in the respective database.""" - pass - @staticmethod def _get_diff( fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]], @@ -238,11 +239,43 @@ def _get_diff( ) return schema_diff - def run(self, feature_set: FeatureSet) -> None: - """Runs the migrations. + def _get_schema(self, table_name: str) -> List[Dict[str, Any]]: + """Get a table schema in the respective database. Args: - feature_set: the feature set. + table_name: Table name to get schema. + Returns: + Schema object. """ - pass + try: + db_schema = self._client.get_schema(table_name) + except Exception: # noqa + db_schema = [] + return db_schema + + def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: + """Apply the migration in the respective database. + + Args: + feature_set: the feature set. + writer: the writer being used to load the feature set. + """ + logging.info(f"Migrating feature set: {feature_set.name}") + + table_name = ( + feature_set.name if not writer.write_to_entity else feature_set.entity + ) + + fs_schema = writer.db_config.translate(feature_set.get_schema()) + db_schema = self._get_schema(table_name) + + queries = self.create_query( + fs_schema, table_name, db_schema, writer.write_to_entity + ) + + for q in queries: + logging.info(f"Applying {q}...") + self._client.sql(q) + + logging.info(f"Feature Set migration finished successfully.") diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 89017374..1c5667db 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -2,7 +2,9 @@ from typing import Any, Dict, List +from butterfree.clients import SparkClient from butterfree.configs import environment +from butterfree.configs.db import MetastoreConfig from butterfree.constants.migrations import PARTITION_BY from butterfree.migrations.database_migration.database_migration import ( DatabaseMigration, @@ -28,12 +30,12 @@ class MetastoreMigration(DatabaseMigration): data is being loaded into an entity table, then users can drop columns manually. 
""" - def __init__( - self, database: str = None, - ): + def __init__(self, database: str = None,) -> None: + self._db_config = MetastoreConfig() self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" ) + super(MetastoreMigration, self).__init__(SparkClient()) @staticmethod def _get_parsed_columns(columns: List[Diff]) -> List[str]: diff --git a/butterfree/migrations/migrate.py b/butterfree/migrations/migrate.py deleted file mode 100644 index f128dee1..00000000 --- a/butterfree/migrations/migrate.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Holds the Migrator Class.""" - -from typing import Callable, List, Tuple - -from butterfree.pipelines import FeatureSetPipeline -from butterfree.transform import FeatureSet - - -class Migrate: - """Execute migration operations in a Database based on pipeline Writer. - - Attributes: - pipelines: list of Feature Set Pipelines to use to migration. - - """ - - def __init__(self, pipelines: List[FeatureSetPipeline]) -> None: - self.pipelines = pipelines - - def _parse_feature_set_pipeline( - self, pipeline: FeatureSetPipeline - ) -> List[Tuple[Callable, FeatureSet]]: - feature_set = pipeline.feature_set - migrations = [ - writer.db_config._migration_class for writer in pipeline.sink.writers - ] - - return [(migrate, feature_set) for migrate in migrations] - - def _send_logs_to_s3(self) -> None: - """Send all migration logs to S3.""" - pass - - def migration(self) -> None: - """Construct and apply the migrations.""" - migration_list = [ - self._parse_feature_set_pipeline(pipeline) for pipeline in self.pipelines - ] - - for migration, fs in migration_list: - migration.run(fs) diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 6a63453f..2e4b2db0 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -1,8 +1,29 @@ +from unittest.mock import call + from butterfree._cli import migrate +from butterfree.migrations.database_migration import ( + CassandraMigration, + MetastoreMigration, +) from butterfree.pipelines import FeatureSetPipeline -def test_migrate_success(): +def test_migrate_success(mocker): + mocker.patch.object(migrate.Migrate, "run") all_fs = migrate.migrate("tests/mocks/entities/") assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] + + +def test_migrate_all_pairs(mocker): + mocker.patch.object(MetastoreMigration, "apply_migration") + mocker.patch.object(CassandraMigration, "apply_migration") + all_fs = migrate.migrate("tests/mocks/entities/") + + assert MetastoreMigration.apply_migration.call_count == 2 + assert CassandraMigration.apply_migration.call_count == 2 + + metastore_pairs = [call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs] + cassandra_pairs = [call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs] + MetastoreMigration.apply_migration.assert_has_calls(metastore_pairs, any_order=True) + CassandraMigration.apply_migration.assert_has_calls(cassandra_pairs, any_order=True) diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index bcf7f7f3..dcd96714 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -1,6 +1,10 @@ from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType from pytest import fixture 
+from butterfree.constants import DataType +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature + @fixture def db_schema(): @@ -32,3 +36,21 @@ def fs_schema(): "primary_key": False, }, ] + + +@fixture +def feature_set(): + feature_set = FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature(name="feature_float", description="test", dtype=DataType.FLOAT,), + ], + keys=[ + KeyFeature(name="id", description="The device ID", dtype=DataType.BIGINT,) + ], + timestamp=TimestampFeature(), + ) + + return feature_set diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py index 30277992..befb55a3 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -54,3 +54,14 @@ def test__get_diff(self, mocker, db_schema): m._client = mocker.stub("client") diff = m._get_diff(fs_schema, db_schema) assert diff == expected_diff + + def test_apply_migration(self, feature_set, mocker): + # given + m = CassandraMigration() + m.apply_migration = mocker.stub("apply_migration") + + # when + m.apply_migration(feature_set) + + # then + m.apply_migration.assert_called_once() From 0d309327258c1a61f0d1aeb484da2f67f97b7a88 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 30 Mar 2021 14:58:08 -0300 Subject: [PATCH 21/86] [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. --- butterfree/load/writers/historical_feature_store_writer.py | 5 +++-- setup.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index e3e9b9b7..5defb00b 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -225,7 +225,6 @@ def validate( Raises: AssertionError: if count of written data doesn't match count in current feature set dataframe. 
- """ table_name = ( f"{feature_set.name}" @@ -240,7 +239,9 @@ def validate( written_count = ( spark_client.read( self.db_config.format_, - path=self.db_config.get_path_with_partitions(table_name, dataframe), + path=self.db_config.get_path_with_partitions( + table_name, self._create_partitions(dataframe) + ), ).count() if self.interval_mode and not self.debug_mode else spark_client.read_table(table_name).count() diff --git a/setup.py b/setup.py index 2dece452..a86ee049 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev3" +__version__ = "1.2.0.dev4" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From d607297e004f2e2fa1b625d1218bf79dfe929aa8 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 30 Mar 2021 16:09:07 -0300 Subject: [PATCH 22/86] [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> --- butterfree/load/writers/historical_feature_store_writer.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 5defb00b..c4344041 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -227,7 +227,7 @@ def validate( feature set dataframe. """ table_name = ( - f"{feature_set.name}" + os.path.join("historical", feature_set.entity, feature_set.name) if self.interval_mode and not self.debug_mode else ( f"{self.database}.{feature_set.name}" diff --git a/setup.py b/setup.py index a86ee049..3ff18737 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev4" +__version__ = "1.2.0.dev5" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 3dcd975d9b6d4ed2380bd3a7701160e546014cd2 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Thu, 1 Apr 2021 11:10:03 -0300 Subject: [PATCH 23/86] [FIX] Add Partition types for Metastore (#305) --- .../migrations/database_migration/metastore_migration.py | 9 ++++----- .../database_migration/test_metastore_migration.py | 5 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 1c5667db..8b7c6af0 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -102,8 +102,6 @@ def _get_create_table_query( Create table query. 
""" - columns.extend(PARTITION_BY) - parsed_columns = [] for col in columns: parsed_columns.append(f"{col['column_name']} {col['type']}") @@ -112,9 +110,10 @@ def _get_create_table_query( return ( f"CREATE TABLE IF NOT EXISTS " f"{self.database}.{table_name} ({parsed_columns}) " - f"PARTITIONED BY ({PARTITION_BY[0]['column_name']}, " - f"{PARTITION_BY[1]['column_name']}, " - f"{PARTITION_BY[2]['column_name']});" + f"PARTITIONED BY (" + f"{PARTITION_BY[0]['column_name']} {PARTITION_BY[0]['type']}, " + f"{PARTITION_BY[1]['column_name']} {PARTITION_BY[1]['type']}, " + f"{PARTITION_BY[2]['column_name']} {PARTITION_BY[2]['type']});" ) def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: diff --git a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py index fd1dfad8..5bac9352 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py @@ -40,9 +40,8 @@ def test_create_table_query(self, fs_schema): expected_query = [ "CREATE TABLE IF NOT EXISTS test.table_name " - "(id LongType, timestamp TimestampType, new_feature FloatType, " - "feature1__avg_over_1_week_rolling_windows FloatType, year INT, " - "month INT, day INT) PARTITIONED BY (year, month, day);" + "(id LongType, timestamp TimestampType, new_feature FloatType) " + "PARTITIONED BY (year INT, month INT, day INT);" ] query = metastore_migration.create_query(fs_schema, "table_name") From 8077d8656fc4b6f5259f991c9797b90fe8e9b67c Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Thu, 1 Apr 2021 14:46:53 -0300 Subject: [PATCH 24/86] [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. 
--- butterfree/__init__.py | 7 +++ butterfree/_cli/__init__.py | 10 ---- butterfree/_cli/main.py | 2 +- butterfree/_cli/migrate.py | 56 ++++++++++++++----- butterfree/configs/db/metastore_config.py | 27 ++++++++- butterfree/constants/data_type.py | 29 +++++----- butterfree/logging.conf | 52 +++++++++++++++++ .../database_migration/database_migration.py | 10 ++-- logs/logging.json | 0 requirements.txt | 3 +- setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 41 +++++++++----- .../test_database_migration.py | 3 +- 13 files changed, 181 insertions(+), 61 deletions(-) create mode 100644 butterfree/logging.conf create mode 100644 logs/logging.json diff --git a/butterfree/__init__.py b/butterfree/__init__.py index 18759b03..25b955c6 100644 --- a/butterfree/__init__.py +++ b/butterfree/__init__.py @@ -1 +1,8 @@ """Module docstring example, following Google's docstring style.""" +import logging.config +import os +import sys + +sys.path.insert(0, os.path.abspath(".")) + +logging.config.fileConfig(fname="butterfree/logging.conf") diff --git a/butterfree/_cli/__init__.py b/butterfree/_cli/__init__.py index ec8a1792..e69de29b 100644 --- a/butterfree/_cli/__init__.py +++ b/butterfree/_cli/__init__.py @@ -1,10 +0,0 @@ -import logging - - -def __logger(name: str) -> logging.Logger: - format_ = "%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >" - logging.basicConfig(format=format_, level=logging.INFO) - return logging.getLogger(name) - - -cli_logger = __logger("butterfree") diff --git a/butterfree/_cli/main.py b/butterfree/_cli/main.py index e340bc1b..636fdb25 100644 --- a/butterfree/_cli/main.py +++ b/butterfree/_cli/main.py @@ -3,7 +3,7 @@ from butterfree._cli import migrate app = typer.Typer() -app.add_typer(migrate.app) +app.add_typer(migrate.app, name="migrate") if __name__ == "__main__": app() diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 10b6310b..6e3e9b59 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,5 +1,7 @@ import importlib import inspect +import logging +import os import pkgutil import sys from typing import Set @@ -7,11 +9,15 @@ import setuptools import typer -from butterfree._cli import cli_logger +from butterfree.clients import SparkClient +from butterfree.configs import environment +from butterfree.extract.readers import FileReader from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline -app = typer.Typer() +app = typer.Typer(help="Apply the automatic migrations in a database.") + +logger = logging.getLogger("migrate") def __find_modules(path: str) -> Set[str]: @@ -33,18 +39,18 @@ def __find_modules(path: str) -> Set[str]: def __fs_objects(path: str) -> Set[FeatureSetPipeline]: - cli_logger.info(f"Looking for python modules under {path}...") + logger.info(f"Looking for python modules under {path}...") modules = __find_modules(path) if not modules: return set() - cli_logger.info(f"Importing modules...") + logger.info(f"Importing modules...") package = ".".join(path.strip("/").split("/")) imported = set( importlib.import_module(f".{name}", package=package) for name in modules ) - cli_logger.info(f"Scanning modules...") + logger.info(f"Scanning modules...") content = { module: set( filter( @@ -80,7 +86,7 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: instances.add(value) - cli_logger.info("Creating instances...") + logger.info("Creating instances...") return set(value() for value in instances) @@ -88,6 +94,10 @@ def __fs_objects(path: str) 
-> Set[FeatureSetPipeline]: ..., help="Full or relative path to where feature set pipelines are being defined.", ) +GENERATE_LOGS = typer.Option( + False, help="To generate the logs in local file 'logging.json'." +) + class Migrate: """Execute migration operations in a Database based on pipeline Writer. @@ -96,23 +106,43 @@ class Migrate: pipelines: list of Feature Set Pipelines to use to migration. """ - def __init__(self, pipelines: Set[FeatureSetPipeline]) -> None: + def __init__( + self, pipelines: Set[FeatureSetPipeline], spark_client: SparkClient = None + ) -> None: self.pipelines = pipelines + self.spark_client = spark_client or SparkClient() - def _send_logs_to_s3(self) -> None: + def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - pass + file_reader = FileReader(id="name", path="logs/logging.json", format="json") + df = file_reader.consume(self.spark_client) + + path = environment.get_variable("FEATURE_STORE_S3_BUCKET") - def run(self) -> None: + self.spark_client.write_dataframe( + dataframe=df, + format_="json", + mode="append", + **{"path": f"s3a://{path}/logging"}, + ) + + if not file_local: + os.rmdir("logs/logging.json") + + def run(self, generate_logs: bool) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: migration = ALLOWED_DATABASE[writer.db_config.database] migration.apply_migration(pipeline.feature_set, writer) + self._send_logs_to_s3(generate_logs) + -@app.callback() -def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: +@app.command("apply") +def migrate( + path: str = PATH, generate_logs: bool = GENERATE_LOGS, +) -> Set[FeatureSetPipeline]: """Scan and run database migrations for feature set pipelines defined under PATH. Butterfree will scan a given path for classes that inherit from its @@ -124,5 +154,5 @@ def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: import and instantiate them. 
""" pipe_set = __fs_objects(path) - Migrate(pipe_set).run() + Migrate(pipe_set).run(generate_logs) return pipe_set diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index 97a999c2..ff7ed01d 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -122,4 +122,29 @@ def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List: def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Translate feature set spark schema to the corresponding database.""" - return schema + spark_sql_mapping = { + "TimestampType": "TIMESTAMP", + "BinaryType": "BINARY", + "BooleanType": "BOOLEAN", + "DateType": "DATE", + "DecimalType": "DECIMAL", + "DoubleType": "DOUBLE", + "FloatType": "FLOAT", + "IntegerType": "INT", + "LongType": "BIGINT", + "StringType": "STRING", + "ArrayType(LongType,true)": "ARRAY", + "ArrayType(StringType,true)": "ARRAY", + "ArrayType(FloatType,true)": "ARRAY", + } + sql_schema = [] + for features in schema: + sql_schema.append( + { + "column_name": features["column_name"], + "type": spark_sql_mapping[str(features["type"])], + "primary_key": features["primary_key"], + } + ) + + return sql_schema diff --git a/butterfree/constants/data_type.py b/butterfree/constants/data_type.py index 157d4a1f..e99525f7 100644 --- a/butterfree/constants/data_type.py +++ b/butterfree/constants/data_type.py @@ -21,20 +21,21 @@ class DataType(Enum): """Holds constants for data types within Butterfree.""" - TIMESTAMP = (TimestampType(), "timestamp") - BINARY = (BinaryType(), "boolean") - BOOLEAN = (BooleanType(), "boolean") - DATE = (DateType(), "timestamp") - DECIMAL = (DecimalType(), "decimal") - DOUBLE = (DoubleType(), "double") - FLOAT = (FloatType(), "float") - INTEGER = (IntegerType(), "int") - BIGINT = (LongType(), "bigint") - STRING = (StringType(), "text") - ARRAY_BIGINT = (ArrayType(LongType()), "frozen>") - ARRAY_STRING = (ArrayType(StringType()), "frozen>") - ARRAY_FLOAT = (ArrayType(FloatType()), "frozen>") + TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP") + BINARY = (BinaryType(), "boolean", "BINARY") + BOOLEAN = (BooleanType(), "boolean", "BOOLEAN") + DATE = (DateType(), "timestamp", "DATE") + DECIMAL = (DecimalType(), "decimal", "DECIMAL") + DOUBLE = (DoubleType(), "double", "DOUBLE") + FLOAT = (FloatType(), "float", "FLOAT") + INTEGER = (IntegerType(), "int", "INT") + BIGINT = (LongType(), "bigint", "BIGINT") + STRING = (StringType(), "text", "STRING") + ARRAY_BIGINT = (ArrayType(LongType()), "frozen>", "ARRAY") + ARRAY_STRING = (ArrayType(StringType()), "frozen>", "ARRAY") + ARRAY_FLOAT = (ArrayType(FloatType()), "frozen>", "ARRAY") - def __init__(self, spark: PySparkDataType, cassandra: str) -> None: + def __init__(self, spark: PySparkDataType, cassandra: str, spark_sql: str) -> None: self.spark = spark self.cassandra = cassandra + self.spark_sql = spark_sql diff --git a/butterfree/logging.conf b/butterfree/logging.conf new file mode 100644 index 00000000..1ee6da86 --- /dev/null +++ b/butterfree/logging.conf @@ -0,0 +1,52 @@ +[loggers] +keys=root,cli,migrate,database_migrate + +[handlers] +keys=consoleHandler,file + +[formatters] +keys=simpleFormatter,jsonFormatter + +[logger_root] +level=DEBUG +handlers=consoleHandler + +[logger_cli] +level=DEBUG +handlers=file +qualname=cli +propagate=0 + +[logger_migrate] +level=DEBUG +handlers=file +qualname=migrate +propagate=0 + +[logger_database_migrate] +level=DEBUG +handlers=file +qualname=database_migrate 
+propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=simpleFormatter +args=(sys.stdout,) + +[handler_file] +class=FileHandler +level=DEBUG +formatter=jsonFormatter +args=('logs/logging.json', "a") + +[formatter_simpleFormatter] +format=%(name)s:%(asctime)-15s:%(levelname)s:%(message)s +datefmt= +class=logging.Formatter + +[formatter_jsonFormatter] +format={"name": "%(name)s", "timestamp": "%(asctime)-15s", "level": "%(levelname)s", "message": "%(message)s"} +datefmt= +class=logging.Formatter \ No newline at end of file diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index fb2b9e7f..28f4f06c 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -9,6 +9,8 @@ from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet +logger = logging.getLogger("database_migrate") + @dataclass class Diff: @@ -154,7 +156,7 @@ def _get_queries( ) queries.append(alter_column_types_query) if alter_key_items: - logging.info("This operation is not supported by Spark.") + logger.info("This operation is not supported by Spark.") return queries @@ -261,7 +263,7 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: feature_set: the feature set. writer: the writer being used to load the feature set. """ - logging.info(f"Migrating feature set: {feature_set.name}") + logger.info(f"Migrating feature set: {feature_set.name}") table_name = ( feature_set.name if not writer.write_to_entity else feature_set.entity @@ -275,7 +277,7 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: ) for q in queries: - logging.info(f"Applying {q}...") + logger.info(f"Applying this query: {q} ...") self._client.sql(q) - logging.info(f"Feature Set migration finished successfully.") + logger.info(f"Feature Set migration finished successfully.") diff --git a/logs/logging.json b/logs/logging.json new file mode 100644 index 00000000..e69de29b diff --git a/requirements.txt b/requirements.txt index bac7f2c7..82a99d7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pandas>=0.24,<1.1 parameters-validation>=1.1.5,<2.0 pyspark==3.* typer>=0.3,<0.4 -setuptools>=41,<42 \ No newline at end of file +setuptools>=41,<42 +typing-extensions==3.7.4.3 \ No newline at end of file diff --git a/setup.py b/setup.py index 3ff18737..abd56ccb 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev5" +__version__ = "1.2.0.dev6" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 2e4b2db0..aa2c86db 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -8,22 +8,33 @@ from butterfree.pipelines import FeatureSetPipeline -def test_migrate_success(mocker): - mocker.patch.object(migrate.Migrate, "run") - all_fs = migrate.migrate("tests/mocks/entities/") - assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) - assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] +class TestMigrate: + def test_migrate_success(self, mocker): + mocker.patch.object(migrate.Migrate, "run") + all_fs = migrate.migrate("tests/mocks/entities/") 
+ assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) + assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] + def test_migrate_all_pairs(self, mocker): + mocker.patch.object(MetastoreMigration, "apply_migration") + mocker.patch.object(CassandraMigration, "apply_migration") + mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") -def test_migrate_all_pairs(mocker): - mocker.patch.object(MetastoreMigration, "apply_migration") - mocker.patch.object(CassandraMigration, "apply_migration") - all_fs = migrate.migrate("tests/mocks/entities/") + all_fs = migrate.migrate("tests/mocks/entities/") - assert MetastoreMigration.apply_migration.call_count == 2 - assert CassandraMigration.apply_migration.call_count == 2 + assert MetastoreMigration.apply_migration.call_count == 2 + assert CassandraMigration.apply_migration.call_count == 2 - metastore_pairs = [call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs] - cassandra_pairs = [call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs] - MetastoreMigration.apply_migration.assert_has_calls(metastore_pairs, any_order=True) - CassandraMigration.apply_migration.assert_has_calls(cassandra_pairs, any_order=True) + metastore_pairs = [ + call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs + ] + cassandra_pairs = [ + call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs + ] + MetastoreMigration.apply_migration.assert_has_calls( + metastore_pairs, any_order=True + ) + CassandraMigration.apply_migration.assert_has_calls( + cassandra_pairs, any_order=True + ) + migrate.Migrate._send_logs_to_s3.assert_called_once() diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py index befb55a3..ea7ce815 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -1,5 +1,6 @@ from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType +from butterfree.load.writers import HistoricalFeatureStoreWriter from butterfree.migrations.database_migration import CassandraMigration, Diff @@ -61,7 +62,7 @@ def test_apply_migration(self, feature_set, mocker): m.apply_migration = mocker.stub("apply_migration") # when - m.apply_migration(feature_set) + m.apply_migration(feature_set, HistoricalFeatureStoreWriter()) # then m.apply_migration.assert_called_once() From 6d2a8f9897ddd665a68602ca12713dd3d0249f4b Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 6 Apr 2021 09:09:46 -0300 Subject: [PATCH 25/86] [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. 
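Usage sketch (illustrative; "my_job" is a placeholder logger name): the new helper in butterfree/configs/logger.py replaces logging.conf, so modules fetch a configured logger with

    from butterfree.configs.logger import __logger

    # file_logs=True also writes JSON lines to ../logging.json (the file the
    # migrate CLI uploads to S3); with the default False, records only go to
    # the console. basicConfig takes effect on the first call in the process.
    logger = __logger("my_job", file_logs=True)
    logger.info("migration step finished")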
--- Makefile | 1 + butterfree/__init__.py | 7 --- butterfree/_cli/migrate.py | 14 ++--- butterfree/configs/logger.py | 24 +++++++++ butterfree/logging.conf | 52 ------------------- .../database_migration/database_migration.py | 3 +- logs/logging.json | 0 setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 14 +++++ 9 files changed, 50 insertions(+), 67 deletions(-) create mode 100644 butterfree/configs/logger.py delete mode 100644 butterfree/logging.conf delete mode 100644 logs/logging.json diff --git a/Makefile b/Makefile index 397d04bf..95cc6e3a 100644 --- a/Makefile +++ b/Makefile @@ -122,6 +122,7 @@ clean: @find ./ -type f -name 'coverage.xml' -exec rm -f {} \; @find ./ -type f -name '.coverage*' -exec rm -f {} \; @find ./ -type f -name '*derby.log' -exec rm -f {} \; + @find ./ -type f -name 'logging.json' -exec rm -f {} \; @find ./ -name '*.pyc' -exec rm -f {} \; @find ./ -name '*.pyo' -exec rm -f {} \; @find ./ -name '*~' -exec rm -f {} \; diff --git a/butterfree/__init__.py b/butterfree/__init__.py index 25b955c6..18759b03 100644 --- a/butterfree/__init__.py +++ b/butterfree/__init__.py @@ -1,8 +1 @@ """Module docstring example, following Google's docstring style.""" -import logging.config -import os -import sys - -sys.path.insert(0, os.path.abspath(".")) - -logging.config.fileConfig(fname="butterfree/logging.conf") diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 6e3e9b59..f3c533d7 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,6 +1,5 @@ import importlib import inspect -import logging import os import pkgutil import sys @@ -11,13 +10,14 @@ from butterfree.clients import SparkClient from butterfree.configs import environment +from butterfree.configs.logger import __logger from butterfree.extract.readers import FileReader from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline app = typer.Typer(help="Apply the automatic migrations in a database.") -logger = logging.getLogger("migrate") +logger = __logger("migrate", True) def __find_modules(path: str) -> Set[str]: @@ -114,7 +114,9 @@ def __init__( def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - file_reader = FileReader(id="name", path="logs/logging.json", format="json") + log_path = "../logging.json" + + file_reader = FileReader(id="name", path=log_path, format="json") df = file_reader.consume(self.spark_client) path = environment.get_variable("FEATURE_STORE_S3_BUCKET") @@ -126,10 +128,10 @@ def _send_logs_to_s3(self, file_local: bool) -> None: **{"path": f"s3a://{path}/logging"}, ) - if not file_local: - os.rmdir("logs/logging.json") + if not file_local and os.path.exists(log_path): + os.remove(log_path) - def run(self, generate_logs: bool) -> None: + def run(self, generate_logs: bool = False) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: diff --git a/butterfree/configs/logger.py b/butterfree/configs/logger.py new file mode 100644 index 00000000..60dab67c --- /dev/null +++ b/butterfree/configs/logger.py @@ -0,0 +1,24 @@ +"""Logger funcion.""" + +import logging + + +def __config(json_file_logs: bool = False) -> None: + + if json_file_logs: + return logging.basicConfig( + format='{"name": "%(name)s", "timestamp": "%(asctime)-15s", ' + '"level": "%(levelname)s", "message": "%(message)s"}', + level=logging.INFO, + filename="../logging.json", + ) + return logging.basicConfig( + 
format="%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >", + level=logging.INFO, + ) + + +def __logger(name: str, file_logs: bool = False) -> logging.Logger: + + __config(file_logs) + return logging.getLogger(name) diff --git a/butterfree/logging.conf b/butterfree/logging.conf deleted file mode 100644 index 1ee6da86..00000000 --- a/butterfree/logging.conf +++ /dev/null @@ -1,52 +0,0 @@ -[loggers] -keys=root,cli,migrate,database_migrate - -[handlers] -keys=consoleHandler,file - -[formatters] -keys=simpleFormatter,jsonFormatter - -[logger_root] -level=DEBUG -handlers=consoleHandler - -[logger_cli] -level=DEBUG -handlers=file -qualname=cli -propagate=0 - -[logger_migrate] -level=DEBUG -handlers=file -qualname=migrate -propagate=0 - -[logger_database_migrate] -level=DEBUG -handlers=file -qualname=database_migrate -propagate=0 - -[handler_consoleHandler] -class=StreamHandler -level=DEBUG -formatter=simpleFormatter -args=(sys.stdout,) - -[handler_file] -class=FileHandler -level=DEBUG -formatter=jsonFormatter -args=('logs/logging.json', "a") - -[formatter_simpleFormatter] -format=%(name)s:%(asctime)-15s:%(levelname)s:%(message)s -datefmt= -class=logging.Formatter - -[formatter_jsonFormatter] -format={"name": "%(name)s", "timestamp": "%(asctime)-15s", "level": "%(levelname)s", "message": "%(message)s"} -datefmt= -class=logging.Formatter \ No newline at end of file diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 28f4f06c..2ceca0b8 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -6,10 +6,11 @@ from typing import Any, Dict, List, Set from butterfree.clients import AbstractClient +from butterfree.configs.logger import __logger from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet -logger = logging.getLogger("database_migrate") +logger = __logger("database_migrate", True) @dataclass diff --git a/logs/logging.json b/logs/logging.json deleted file mode 100644 index e69de29b..00000000 diff --git a/setup.py b/setup.py index abd56ccb..5122a831 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev6" +__version__ = "1.2.0.dev7" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index aa2c86db..75487bed 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -1,12 +1,17 @@ from unittest.mock import call +from typer.testing import CliRunner + from butterfree._cli import migrate +from butterfree._cli.main import app from butterfree.migrations.database_migration import ( CassandraMigration, MetastoreMigration, ) from butterfree.pipelines import FeatureSetPipeline +runner = CliRunner() + class TestMigrate: def test_migrate_success(self, mocker): @@ -38,3 +43,12 @@ def test_migrate_all_pairs(self, mocker): cassandra_pairs, any_order=True ) migrate.Migrate._send_logs_to_s3.assert_called_once() + + def test_app_cli(self): + result = runner.invoke(app, "migrate") + assert result.exit_code == 0 + + def test_app_migrate(self, mocker): + mocker.patch.object(migrate.Migrate, "run") + result = runner.invoke(app, ["migrate", "apply", "tests/mocks/entities/"]) + assert result.exit_code 
== 0 From d2c5d39b0748e68ca44603df25c309427cd5c7e8 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Thu, 8 Apr 2021 14:07:12 -0300 Subject: [PATCH 26/86] Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 --- butterfree/_cli/migrate.py | 45 +++++++++++++++++++++----------------- requirements.txt | 3 ++- setup.py | 2 +- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index f3c533d7..42b3fb4a 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,3 +1,4 @@ +import datetime import importlib import inspect import os @@ -5,13 +6,13 @@ import sys from typing import Set +import boto3 import setuptools import typer +from botocore.exceptions import ClientError -from butterfree.clients import SparkClient from butterfree.configs import environment from butterfree.configs.logger import __logger -from butterfree.extract.readers import FileReader from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline @@ -106,30 +107,34 @@ class Migrate: pipelines: list of Feature Set Pipelines to use to migration. """ - def __init__( - self, pipelines: Set[FeatureSetPipeline], spark_client: SparkClient = None - ) -> None: + def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: self.pipelines = pipelines - self.spark_client = spark_client or SparkClient() def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - log_path = "../logging.json" - - file_reader = FileReader(id="name", path=log_path, format="json") - df = file_reader.consume(self.spark_client) - - path = environment.get_variable("FEATURE_STORE_S3_BUCKET") - - self.spark_client.write_dataframe( - dataframe=df, - format_="json", - mode="append", - **{"path": f"s3a://{path}/logging"}, + s3_client = boto3.client("s3") + + file_name = "../logging.json" + timestamp = datetime.datetime.now() + object_name = ( + f"logs/migrate/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" ) + bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") + + try: + s3_client.upload_file( + file_name, + bucket, + object_name, + ExtraArgs={"ACL": "bucket-owner-full-control"}, + ) + except ClientError: + raise - if not file_local and os.path.exists(log_path): - os.remove(log_path) + if not file_local and os.path.exists(file_name): + os.remove(file_name) def run(self, generate_logs: bool = False) -> None: """Construct and apply the migrations.""" diff --git a/requirements.txt b/requirements.txt index 82a99d7f..9548edb3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ parameters-validation>=1.1.5,<2.0 pyspark==3.* typer>=0.3,<0.4 setuptools>=41,<42 -typing-extensions==3.7.4.3 \ No newline at end of file +typing-extensions==3.7.4.3 +boto3==1.17.* \ No newline at end of file diff --git a/setup.py b/setup.py index 5122a831..348e5f98 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev7" +__version__ = "1.2.0.dev8" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 43392f444d1ad7bb50136f665fd00a2fe940857f Mon Sep 17 00:00:00 2001 From: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Date: Tue, 13 Apr 2021 16:02:37 
-0300 Subject: [PATCH 27/86] Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira --- butterfree/clients/cassandra_client.py | 22 +++++++--- butterfree/configs/db/cassandra_config.py | 30 +++++++++++++ butterfree/configs/environment.py | 2 + .../configs/db/test_cassandra_config.py | 44 +++++++++++++++++++ 4 files changed, 93 insertions(+), 5 deletions(-) diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 938d4e4d..a4605362 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -3,9 +3,15 @@ from typing import Dict, List, Optional from cassandra.auth import PlainTextAuthProvider -from cassandra.cluster import Cluster, ResponseFuture, Session -from cassandra.policies import RoundRobinPolicy -from cassandra.query import dict_factory +from cassandra.cluster import ( + EXEC_PROFILE_DEFAULT, + Cluster, + ExecutionProfile, + ResponseFuture, + Session, +) +from cassandra.policies import DCAwareRoundRobinPolicy +from cassandra.query import ConsistencyLevel, dict_factory from typing_extensions import TypedDict from butterfree.clients import AbstractClient @@ -70,14 +76,20 @@ def conn(self, *, ssl_path: str = None) -> Session: # type: ignore else None ) + execution_profiles = { + EXEC_PROFILE_DEFAULT: ExecutionProfile( + load_balancing_policy=DCAwareRoundRobinPolicy(), + consistency_level=ConsistencyLevel.LOCAL_QUORUM, + row_factory=dict_factory, + ) + } cluster = Cluster( contact_points=self.host, auth_provider=auth_provider, ssl_options=ssl_opts, - load_balancing_policy=RoundRobinPolicy(), + execution_profiles=execution_profiles, ) self._session = cluster.connect(self.keyspace) - self._session.row_factory = dict_factory return self._session def sql(self, query: str) -> ResponseFuture: diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index e9329c5d..3f9e129d 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -21,6 +21,8 @@ class CassandraConfig(AbstractWriteConfig): stream_processing_time: processing time interval for streaming jobs. stream_output_mode: specify the mode from writing streaming data. stream_checkpoint_path: path on S3 to save checkpoints for the stream job. + read_consistency_level: read consistency level used in connection. + write_consistency_level: write consistency level used in connection. 
More information about processing_time, output_mode and checkpoint_path can be found in Spark documentation: @@ -39,6 +41,8 @@ def __init__( stream_processing_time: str = None, stream_output_mode: str = None, stream_checkpoint_path: str = None, + read_consistency_level: str = None, + write_consistency_level: str = None, ): self.username = username self.password = password @@ -49,6 +53,8 @@ def __init__( self.stream_processing_time = stream_processing_time self.stream_output_mode = stream_output_mode self.stream_checkpoint_path = stream_checkpoint_path + self.read_consistency_level = read_consistency_level + self.write_consistency_level = write_consistency_level @property def database(self) -> str: @@ -150,6 +156,28 @@ def stream_checkpoint_path(self, value: str) -> None: "STREAM_CHECKPOINT_PATH" ) + @property + def read_consistency_level(self) -> Optional[str]: + """Read consistency level for Cassandra.""" + return self.__read_consistency_level + + @read_consistency_level.setter + def read_consistency_level(self, value: str) -> None: + self.__read_consistency_level = value or environment.get_variable( + "CASSANDRA_READ_CONSISTENCY_LEVEL", "LOCAL_ONE" + ) + + @property + def write_consistency_level(self) -> Optional[str]: + """Write consistency level for Cassandra.""" + return self.__write_consistency_level + + @write_consistency_level.setter + def write_consistency_level(self, value: str) -> None: + self.__write_consistency_level = value or environment.get_variable( + "CASSANDRA_WRITE_CONSISTENCY_LEVEL", "LOCAL_QUORUM" + ) + def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: """Get options for connect to Cassandra DB. @@ -169,6 +197,8 @@ def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: "spark.cassandra.auth.username": self.username, "spark.cassandra.auth.password": self.password, "spark.cassandra.connection.host": self.host, + "spark.cassandra.input.consistency.level": self.read_consistency_level, + "spark.cassandra.output.consistency.level": self.write_consistency_level, } def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index f98a7a01..5d8bb4e9 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -12,6 +12,8 @@ "FEATURE_STORE_HISTORICAL_DATABASE": "test", "KAFKA_CONSUMER_CONNECTION_STRING": "test_host:1234,test_host2:1234", "STREAM_CHECKPOINT_PATH": None, + "CASSANDRA_READ_CONSISTENCY_LEVEL": None, + "CASSANDRA_WRITE_CONSISTENCY_LEVEL": None, } diff --git a/tests/unit/butterfree/configs/db/test_cassandra_config.py b/tests/unit/butterfree/configs/db/test_cassandra_config.py index f51ffe8c..9af4c42b 100644 --- a/tests/unit/butterfree/configs/db/test_cassandra_config.py +++ b/tests/unit/butterfree/configs/db/test_cassandra_config.py @@ -159,6 +159,50 @@ def test_stream_checkpoint_path_custom(self, cassandra_config): # then assert cassandra_config.stream_checkpoint_path == value + def test_read_consistency_level(self, cassandra_config): + # expecting + default = "LOCAL_ONE" + assert cassandra_config.read_consistency_level == default + + def test_read_consistency_level_custom(self, cassandra_config): + # given + value = "Custom Config" + cassandra_config.read_consistency_level = value + + # then + assert cassandra_config.read_consistency_level == value + + def test_read_consistency_level_custom_env_var(self, mocker, cassandra_config): + # given + value = "Custom Config" + 
mocker.patch("butterfree.configs.environment.get_variable", return_value=value) + cassandra_config.read_consistency_level = value + + # then + assert cassandra_config.read_consistency_level == value + + def test_write_consistency_level(self, cassandra_config): + # expecting + default = "LOCAL_QUORUM" + assert cassandra_config.write_consistency_level == default + + def test_write_consistency_level_custom(self, cassandra_config): + # given + value = "Custom Config" + cassandra_config.write_consistency_level = value + + # then + assert cassandra_config.write_consistency_level == value + + def test_write_consistency_level_custom_env_var(self, mocker, cassandra_config): + # given + value = "Custom Config" + mocker.patch("butterfree.configs.environment.get_variable", return_value=value) + cassandra_config.write_consistency_level = value + + # then + assert cassandra_config.write_consistency_level == value + def test_set_credentials_on_instantiation(self): cassandra_config = CassandraConfig( # noqa: S106 username="username", password="password", host="host", keyspace="keyspace" From 0f31164b8d3a20689a31669bd28a1c54d6085022 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 14 Apr 2021 13:48:03 -0300 Subject: [PATCH 28/86] Fix kafka reader. (#310) --- butterfree/extract/readers/kafka_reader.py | 2 +- setup.py | 2 +- tests/unit/butterfree/extract/readers/test_kafka_reader.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/butterfree/extract/readers/kafka_reader.py b/butterfree/extract/readers/kafka_reader.py index 8cac4c19..1b8042bc 100644 --- a/butterfree/extract/readers/kafka_reader.py +++ b/butterfree/extract/readers/kafka_reader.py @@ -174,7 +174,7 @@ def consume(self, client: SparkClient) -> DataFrame: """ # read using client and cast key and value columns from binary to string raw_df = ( - client.read(format="kafka", options=self.options, stream=self.stream) + client.read(format="kafka", stream=self.stream, **self.options) .withColumn("key", col("key").cast("string")) .withColumn("value", col("value").cast("string")) ) diff --git a/setup.py b/setup.py index 348e5f98..c015e1e1 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev8" +__version__ = "1.2.0.dev9" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/extract/readers/test_kafka_reader.py b/tests/unit/butterfree/extract/readers/test_kafka_reader.py index 5a07cbdd..f1ea82ae 100644 --- a/tests/unit/butterfree/extract/readers/test_kafka_reader.py +++ b/tests/unit/butterfree/extract/readers/test_kafka_reader.py @@ -99,7 +99,7 @@ def test_consume( # assert spark_client.read.assert_called_once_with( - format="kafka", options=options, stream=kafka_reader.stream + format="kafka", stream=kafka_reader.stream, **options ) assert_dataframe_equality(target_df, output_df) From e6f67e9a5ff42b4987c8d476a8ebf6df6cfa1aac Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 14 Apr 2021 18:13:03 -0300 Subject: [PATCH 29/86] Fix path validate. 
(#311) --- butterfree/clients/spark_client.py | 4 ++-- setup.py | 2 +- tests/unit/butterfree/clients/test_spark_client.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 09a1bcd9..d5caec9c 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -58,7 +58,7 @@ def read( """ if not isinstance(format, str): raise ValueError("format needs to be a string with the desired read format") - if not isinstance(path, (str, list)): + if path and not isinstance(path, (str, list)): raise ValueError("path needs to be a string or a list of string") df_reader: Union[ @@ -67,7 +67,7 @@ def read( df_reader = df_reader.schema(schema) if schema else df_reader - return df_reader.format(format).load(path, **options) # type: ignore + return df_reader.format(format).load(path=path, **options) # type: ignore def read_table(self, table: str, database: str = None) -> DataFrame: """Use the SparkSession.read interface to read a metastore table. diff --git a/setup.py b/setup.py index c015e1e1..2f04f794 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev9" +__version__ = "1.2.0.dev10" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index dc40841c..12d8ac9d 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -65,7 +65,7 @@ def test_read( # assert mocked_spark_read.format.assert_called_once_with(format) - mocked_spark_read.load.assert_called_once_with(path, **options) + mocked_spark_read.load.assert_called_once_with(path=path, **options) assert target_df.collect() == result_df.collect() @pytest.mark.parametrize( From baa594ba9543daa021451470a6b713e7895b7726 Mon Sep 17 00:00:00 2001 From: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Date: Fri, 16 Apr 2021 12:21:28 -0300 Subject: [PATCH 30/86] Add local dc property (#312) * add local dc property * update version --- butterfree/configs/db/cassandra_config.py | 12 ++++++++++ butterfree/configs/environment.py | 1 + setup.py | 2 +- .../configs/db/test_cassandra_config.py | 22 +++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index 3f9e129d..3d94e756 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -43,6 +43,7 @@ def __init__( stream_checkpoint_path: str = None, read_consistency_level: str = None, write_consistency_level: str = None, + local_dc: str = None, ): self.username = username self.password = password @@ -55,6 +56,7 @@ def __init__( self.stream_checkpoint_path = stream_checkpoint_path self.read_consistency_level = read_consistency_level self.write_consistency_level = write_consistency_level + self.local_dc = local_dc @property def database(self) -> str: @@ -178,6 +180,15 @@ def write_consistency_level(self, value: str) -> None: "CASSANDRA_WRITE_CONSISTENCY_LEVEL", "LOCAL_QUORUM" ) + @property + def local_dc(self) -> Optional[str]: + """Local DC for Cassandra connection.""" + return self.__local_dc + + @local_dc.setter + def local_dc(self, value: str) -> None: + self.__local_dc = value or 
environment.get_variable("CASSANDRA_LOCAL_DC") + def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: """Get options for connect to Cassandra DB. @@ -197,6 +208,7 @@ def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: "spark.cassandra.auth.username": self.username, "spark.cassandra.auth.password": self.password, "spark.cassandra.connection.host": self.host, + "spark.cassandra.connection.localDC": self.local_dc, "spark.cassandra.input.consistency.level": self.read_consistency_level, "spark.cassandra.output.consistency.level": self.write_consistency_level, } diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index 5d8bb4e9..f56efc5d 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -14,6 +14,7 @@ "STREAM_CHECKPOINT_PATH": None, "CASSANDRA_READ_CONSISTENCY_LEVEL": None, "CASSANDRA_WRITE_CONSISTENCY_LEVEL": None, + "CASSANDRA_LOCAL_DC": None, } diff --git a/setup.py b/setup.py index 2f04f794..264d9e0d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev10" +__version__ = "1.2.0.dev11" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/configs/db/test_cassandra_config.py b/tests/unit/butterfree/configs/db/test_cassandra_config.py index 9af4c42b..d34c8e9f 100644 --- a/tests/unit/butterfree/configs/db/test_cassandra_config.py +++ b/tests/unit/butterfree/configs/db/test_cassandra_config.py @@ -203,6 +203,28 @@ def test_write_consistency_level_custom_env_var(self, mocker, cassandra_config): # then assert cassandra_config.write_consistency_level == value + def test_local_dc(self, cassandra_config): + # expecting + default = None + assert cassandra_config.local_dc == default + + def test_local_dc_custom(self, cassandra_config): + # given + value = "VPC_1" + cassandra_config.local_dc = value + + # then + assert cassandra_config.local_dc == value + + def test_local_dc_custom_env_var(self, mocker, cassandra_config): + # given + value = "VPC_1" + mocker.patch("butterfree.configs.environment.get_variable", return_value=value) + cassandra_config.local_dc = value + + # then + assert cassandra_config.local_dc == value + def test_set_credentials_on_instantiation(self): cassandra_config = CassandraConfig( # noqa: S106 username="username", password="password", host="host", keyspace="keyspace" From a74f098d972f9dfdf4f07a37863b8ab0baadaba3 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 20 Apr 2021 09:08:20 -0300 Subject: [PATCH 31/86] Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. 
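Before the next diff, a quick illustrative aside on the Cassandra connection options added in #309 and #312 above. The sketch below only builds the Spark Cassandra connector settings via `CassandraConfig.get_options`; the credentials, host, keyspace and datacenter values are placeholders, not values from this repository.

```python
# Minimal sketch (not part of any patch): how the new consistency levels and
# local DC end up in the Spark Cassandra connector options.
from butterfree.configs.db.cassandra_config import CassandraConfig

config = CassandraConfig(
    username="user",                         # placeholder credentials
    password="pass",                         # noqa: S106
    host="cassandra-host",
    keyspace="feature_store",
    read_consistency_level="LOCAL_ONE",      # or set CASSANDRA_READ_CONSISTENCY_LEVEL
    write_consistency_level="LOCAL_QUORUM",  # or set CASSANDRA_WRITE_CONSISTENCY_LEVEL
    local_dc="my-datacenter",                # or set CASSANDRA_LOCAL_DC
)

options = config.get_options(table="my_feature_set")
print(options["spark.cassandra.connection.localDC"])        # my-datacenter
print(options["spark.cassandra.input.consistency.level"])   # LOCAL_ONE
print(options["spark.cassandra.output.consistency.level"])  # LOCAL_QUORUM
```

When these arguments are omitted, the setters fall back to the `CASSANDRA_*` environment variables, with `LOCAL_ONE` and `LOCAL_QUORUM` as the read and write defaults and no local DC set.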
--- butterfree/_cli/migrate.py | 55 ++++++++++++------- butterfree/clients/abstract_client.py | 14 +++++ butterfree/clients/cassandra_client.py | 2 +- butterfree/clients/spark_client.py | 2 +- .../writers/online_feature_store_writer.py | 2 + .../database_migration/database_migration.py | 8 ++- setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 13 +---- 8 files changed, 59 insertions(+), 39 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 42b3fb4a..2eebe733 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,6 +1,7 @@ import datetime import importlib import inspect +import json import os import pkgutil import sys @@ -43,6 +44,7 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: logger.info(f"Looking for python modules under {path}...") modules = __find_modules(path) if not modules: + logger.error(f"Path: {path} not found!") return set() logger.info(f"Importing modules...") @@ -112,36 +114,47 @@ def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - s3_client = boto3.client("s3") - file_name = "../logging.json" - timestamp = datetime.datetime.now() - object_name = ( - f"logs/migrate/" - f"{timestamp.strftime('%Y-%m-%d')}" - f"/logging-{timestamp.strftime('%H:%M:%S')}.json" - ) - bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") - - try: - s3_client.upload_file( - file_name, - bucket, - object_name, - ExtraArgs={"ACL": "bucket-owner-full-control"}, - ) - except ClientError: - raise if not file_local and os.path.exists(file_name): + s3_client = boto3.client("s3") + + timestamp = datetime.datetime.now() + object_name = ( + f"logs/migrate/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" + ) + bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") + + try: + s3_client.upload_file( + file_name, + bucket, + object_name, + ExtraArgs={"ACL": "bucket-owner-full-control"}, + ) + except ClientError: + raise + os.remove(file_name) + else: + with open(file_name, "r") as json_f: + json_data = json.load(json_f) + print(json_data) def run(self, generate_logs: bool = False) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: - migration = ALLOWED_DATABASE[writer.db_config.database] - migration.apply_migration(pipeline.feature_set, writer) + db = writer.db_config.database + if db != "metastore": + migration = ALLOWED_DATABASE[db] + migration.apply_migration(pipeline.feature_set, writer) + else: + logger.warning( + "Butterfree not supporting Metastore Migrations yet." + ) self._send_logs_to_s3(generate_logs) diff --git a/butterfree/clients/abstract_client.py b/butterfree/clients/abstract_client.py index 265706e6..ce5d33b6 100644 --- a/butterfree/clients/abstract_client.py +++ b/butterfree/clients/abstract_client.py @@ -23,3 +23,17 @@ def sql(self, query: str) -> Any: Set of records. """ pass + + @abstractmethod + def get_schema(self, table: str, database: str = None) -> Any: + """Returns desired table schema. + + Attributes: + table: desired table. + + Returns: + A list of dictionaries in the format + [{"column_name": "example1", type: "Spark_type"}, ...] 
+ + """ + pass diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index a4605362..00a3d497 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -103,7 +103,7 @@ def sql(self, query: str) -> ResponseFuture: raise RuntimeError("There's no session available for this query.") return self._session.execute(query) - def get_schema(self, table: str) -> List[Dict[str, str]]: + def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index d5caec9c..bfa31d2a 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -314,7 +314,7 @@ def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]: return converted_schema - def get_schema(self, table: str, database: str) -> List[Dict[str, str]]: + def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index b51d9923..17dc8af4 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -81,6 +81,7 @@ class OnlineFeatureStoreWriter(Writer): def __init__( self, db_config: AbstractWriteConfig = None, + database: str = None, debug_mode: bool = False, write_to_entity: bool = False, interval_mode: bool = False, @@ -90,6 +91,7 @@ def __init__( db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity ) self.check_schema_hook = check_schema_hook + self.database = database @staticmethod def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame: diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 2ceca0b8..6df9ce95 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -242,7 +242,9 @@ def _get_diff( ) return schema_diff - def _get_schema(self, table_name: str) -> List[Dict[str, Any]]: + def _get_schema( + self, table_name: str, database: str = None + ) -> List[Dict[str, Any]]: """Get a table schema in the respective database. Args: @@ -252,7 +254,7 @@ def _get_schema(self, table_name: str) -> List[Dict[str, Any]]: Schema object. 
""" try: - db_schema = self._client.get_schema(table_name) + db_schema = self._client.get_schema(table_name, database) except Exception: # noqa db_schema = [] return db_schema @@ -271,7 +273,7 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: ) fs_schema = writer.db_config.translate(feature_set.get_schema()) - db_schema = self._get_schema(table_name) + db_schema = self._get_schema(table_name, writer.database) queries = self.create_query( fs_schema, table_name, db_schema, writer.write_to_entity diff --git a/setup.py b/setup.py index 264d9e0d..c1295ee3 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev11" +__version__ = "1.2.0.dev12" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 75487bed..475db15f 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -4,10 +4,7 @@ from butterfree._cli import migrate from butterfree._cli.main import app -from butterfree.migrations.database_migration import ( - CassandraMigration, - MetastoreMigration, -) +from butterfree.migrations.database_migration import CassandraMigration from butterfree.pipelines import FeatureSetPipeline runner = CliRunner() @@ -21,24 +18,16 @@ def test_migrate_success(self, mocker): assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] def test_migrate_all_pairs(self, mocker): - mocker.patch.object(MetastoreMigration, "apply_migration") mocker.patch.object(CassandraMigration, "apply_migration") mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") all_fs = migrate.migrate("tests/mocks/entities/") - assert MetastoreMigration.apply_migration.call_count == 2 assert CassandraMigration.apply_migration.call_count == 2 - metastore_pairs = [ - call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs - ] cassandra_pairs = [ call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs ] - MetastoreMigration.apply_migration.assert_has_calls( - metastore_pairs, any_order=True - ) CassandraMigration.apply_migration.assert_has_calls( cassandra_pairs, any_order=True ) From 378f3a55dc14914d6a58a820a803d17a3b61f1fb Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 20 Apr 2021 17:32:22 -0300 Subject: [PATCH 32/86] Fix link in our docs. (#315) --- docs/source/extract.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/extract.md b/docs/source/extract.md index 2d9f9fab..2b4f2e52 100644 --- a/docs/source/extract.md +++ b/docs/source/extract.md @@ -53,4 +53,4 @@ source = Source( ) ``` -It's important to state that we have some pre-processing methods as well, such as filter and pivot. Feel free to check them [here](https://github.com/quintoandar/butterfree/tree/staging/butterfree/core/extract/pre_processing). \ No newline at end of file +It's important to state that we have some pre-processing methods as well, such as filter and pivot. Feel free to check them [here](https://github.com/quintoandar/butterfree/tree/master/butterfree/extract/pre_processing). 
\ No newline at end of file From 3b18b5a98746e0b45e50ec4f9c7080fe32697500 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Fri, 23 Apr 2021 09:48:33 -0300 Subject: [PATCH 33/86] [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. --- butterfree/clients/cassandra_client.py | 59 +++++++++---------- setup.py | 2 +- .../clients/test_cassandra_client.py | 30 ---------- 3 files changed, 30 insertions(+), 61 deletions(-) diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 00a3d497..4c6f96fe 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -61,35 +61,36 @@ def __init__( @property def conn(self, *, ssl_path: str = None) -> Session: # type: ignore """Establishes a Cassandra connection.""" - auth_provider = ( - PlainTextAuthProvider(username=self.user, password=self.password) - if self.user is not None - else None - ) - ssl_opts = ( - { - "ca_certs": ssl_path, - "ssl_version": PROTOCOL_TLSv1, - "cert_reqs": CERT_REQUIRED, - } - if ssl_path is not None - else None - ) + if not self._session: + auth_provider = ( + PlainTextAuthProvider(username=self.user, password=self.password) + if self.user is not None + else None + ) + ssl_opts = ( + { + "ca_certs": ssl_path, + "ssl_version": PROTOCOL_TLSv1, + "cert_reqs": CERT_REQUIRED, + } + if ssl_path is not None + else None + ) - execution_profiles = { - EXEC_PROFILE_DEFAULT: ExecutionProfile( - load_balancing_policy=DCAwareRoundRobinPolicy(), - consistency_level=ConsistencyLevel.LOCAL_QUORUM, - row_factory=dict_factory, + execution_profiles = { + EXEC_PROFILE_DEFAULT: ExecutionProfile( + load_balancing_policy=DCAwareRoundRobinPolicy(), + consistency_level=ConsistencyLevel.LOCAL_QUORUM, + row_factory=dict_factory, + ) + } + cluster = Cluster( + contact_points=self.host, + auth_provider=auth_provider, + ssl_options=ssl_opts, + execution_profiles=execution_profiles, ) - } - cluster = Cluster( - contact_points=self.host, - auth_provider=auth_provider, - ssl_options=ssl_opts, - execution_profiles=execution_profiles, - ) - self._session = cluster.connect(self.keyspace) + self._session = cluster.connect(self.keyspace) return self._session def sql(self, query: str) -> ResponseFuture: @@ -99,9 +100,7 @@ def sql(self, query: str) -> ResponseFuture: query: desired query. """ - if not self._session: - raise RuntimeError("There's no session available for this query.") - return self._session.execute(query) + return self.conn.execute(query) def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: """Returns desired table schema. 
diff --git a/setup.py b/setup.py index c1295ee3..8d56a022 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev12" +__version__ = "1.2.0.dev13" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/clients/test_cassandra_client.py b/tests/unit/butterfree/clients/test_cassandra_client.py index aa52e6f8..0356e43f 100644 --- a/tests/unit/butterfree/clients/test_cassandra_client.py +++ b/tests/unit/butterfree/clients/test_cassandra_client.py @@ -1,8 +1,6 @@ from typing import Any, Dict, List from unittest.mock import MagicMock -import pytest - from butterfree.clients import CassandraClient from butterfree.clients.cassandra_client import CassandraColumn @@ -88,31 +86,3 @@ def test_cassandra_create_table( query = cassandra_client.sql.call_args[0][0] assert sanitize_string(query) == sanitize_string(expected_query) - - def test_cassandra_without_session(self, cassandra_client: CassandraClient) -> None: - cassandra_client = cassandra_client - - with pytest.raises( - RuntimeError, match="There's no session available for this query." - ): - cassandra_client.sql( - query="select feature1, feature2 from cassandra_feature_set" - ) - with pytest.raises( - RuntimeError, match="There's no session available for this query." - ): - cassandra_client.create_table( - [ - {"column_name": "id", "type": "int", "primary_key": True}, - { - "column_name": "rent_per_month", - "type": "float", - "primary_key": False, - }, - ], - "test", - ) - with pytest.raises( - RuntimeError, match="There's no session available for this query." - ): - cassandra_client.get_schema("test") From c46f171f05c45c6316aacbf43ea03578364ba781 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 26 Apr 2021 17:42:35 -0300 Subject: [PATCH 34/86] Fix migration query. (#318) --- .../database_migration/cassandra_migration.py | 6 +++--- .../database_migration/database_migration.py | 11 ++++++----- .../database_migration/metastore_migration.py | 6 +++--- setup.py | 2 +- .../database_migration/test_cassandra_migration.py | 4 ++-- .../database_migration/test_metastore_migration.py | 4 ++-- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index c511479b..ff7042b6 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -75,7 +75,7 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> st return f"ALTER TABLE {table_name} ADD ({parsed_columns});" - def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates CQL statement to alter columns' types. Args: @@ -86,9 +86,9 @@ def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> Alter column type query. 
""" - parsed_columns = self._get_parsed_columns(columns) + parsed_columns = self._get_parsed_columns([column]) - return f"ALTER TABLE {table_name} ALTER ({parsed_columns});" + return f"ALTER TABLE {table_name} ALTER {parsed_columns};" @staticmethod def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str: diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 6df9ce95..de6b2f80 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -93,7 +93,7 @@ def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> s pass @abstractmethod - def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates desired statement to alter columns' types. Args: @@ -152,10 +152,11 @@ def _get_queries( ) queries.append(drop_columns_query) if alter_type_items: - alter_column_types_query = self._get_alter_column_type_query( - alter_type_items, table_name - ) - queries.append(alter_column_types_query) + for item in alter_type_items: + alter_column_types_query = self._get_alter_column_type_query( + item, table_name + ) + queries.append(alter_column_types_query) if alter_key_items: logger.info("This operation is not supported by Spark.") diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 8b7c6af0..daa0afd3 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -74,7 +74,7 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> st f"ADD IF NOT EXISTS columns ({parsed_columns});" ) - def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates SQL statement to alter columns' types. Args: @@ -85,9 +85,9 @@ def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> Alter column type query. 
""" - parsed_columns = self._get_parsed_columns(columns) + parsed_columns = self._get_parsed_columns([column]) - return f"ALTER TABLE {table_name} ALTER COLUMN ({parsed_columns});" + return f"ALTER TABLE {table_name} ALTER COLUMN {parsed_columns};" def _get_create_table_query( self, columns: List[Dict[str, Any]], table_name: str diff --git a/setup.py b/setup.py index 8d56a022..a69c079c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev13" +__version__ = "1.2.0.dev14" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py index 8f16a1d2..97f49958 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -8,7 +8,7 @@ def test_queries(self, fs_schema, db_schema): "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", "ALTER TABLE table_name ALTER " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) @@ -19,7 +19,7 @@ def test_queries_on_entity(self, fs_schema, db_schema): expected_query = [ "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name ALTER " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = cassandra_migration.create_query( fs_schema, "table_name", db_schema, True diff --git a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py index 5bac9352..d9c2de3c 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py @@ -11,7 +11,7 @@ def test_queries(self, fs_schema, db_schema): "ALTER TABLE table_name DROP IF EXISTS " "(feature1__avg_over_2_days_rolling_windows None);", "ALTER TABLE table_name ALTER COLUMN " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = metastore_migration.create_query(fs_schema, "table_name", db_schema) @@ -25,7 +25,7 @@ def test_queries_on_entity(self, fs_schema, db_schema): "ALTER TABLE test.table_name ADD IF NOT EXISTS " "columns (new_feature FloatType);", "ALTER TABLE table_name ALTER COLUMN " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = metastore_migration.create_query( From bb124f57f2c3e3fbf6d6cecd80bfdd98f8f6ba4b Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 28 Apr 2021 09:23:19 -0300 Subject: [PATCH 35/86] Fix migration query add type key. 
(#319) --- .../migrations/database_migration/cassandra_migration.py | 4 +++- setup.py | 2 +- .../migrations/database_migration/test_cassandra_migration.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index ff7042b6..5a4f755f 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -88,7 +88,9 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """ parsed_columns = self._get_parsed_columns([column]) - return f"ALTER TABLE {table_name} ALTER {parsed_columns};" + return ( + f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};" + ) @staticmethod def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str: diff --git a/setup.py b/setup.py index a69c079c..4a138c7a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev14" +__version__ = "1.2.0.dev15" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py index 97f49958..5666cc47 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -8,7 +8,7 @@ def test_queries(self, fs_schema, db_schema): "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", "ALTER TABLE table_name ALTER " - "feature1__avg_over_1_week_rolling_windows FloatType;", + "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", ] query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) @@ -19,7 +19,7 @@ def test_queries_on_entity(self, fs_schema, db_schema): expected_query = [ "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name ALTER " - "feature1__avg_over_1_week_rolling_windows FloatType;", + "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", ] query = cassandra_migration.create_query( fs_schema, "table_name", db_schema, True From 1c973169fdc0b9183e3677bf83a7c8f1c7acfd82 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 5 May 2021 17:47:08 -0300 Subject: [PATCH 36/86] Fix db-config condition (#321) * Fix db-config condition. * Apply style. --- butterfree/_cli/migrate.py | 6 ++---- setup.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 2eebe733..ebd21142 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -148,13 +148,11 @@ def run(self, generate_logs: bool = False) -> None: for pipeline in self.pipelines: for writer in pipeline.sink.writers: db = writer.db_config.database - if db != "metastore": + if db == "cassandra": migration = ALLOWED_DATABASE[db] migration.apply_migration(pipeline.feature_set, writer) else: - logger.warning( - "Butterfree not supporting Metastore Migrations yet." 
- ) + logger.warning(f"Butterfree not supporting {db} Migrations yet.") self._send_logs_to_s3(generate_logs) diff --git a/setup.py b/setup.py index 4a138c7a..6c2c2b46 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev15" +__version__ = "1.2.0.dev16" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From bb7ed77917d4553593d93e3603d3e273e7ec90a6 Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Fri, 7 May 2021 14:59:40 -0300 Subject: [PATCH 37/86] MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file --- docs/source/butterfree.clients.rst | 1 - docs/source/butterfree.configs.db.rst | 3 +- docs/source/butterfree.configs.rst | 5 +++ docs/source/butterfree.constants.rst | 11 +++++++ docs/source/butterfree.dataframe_service.rst | 13 +++++++- .../butterfree.extract.pre_processing.rst | 1 - docs/source/butterfree.extract.readers.rst | 1 - docs/source/butterfree.extract.rst | 1 - docs/source/butterfree.hooks.rst | 33 +++++++++++++++++++ .../butterfree.hooks.schema_compatibility.rst | 25 ++++++++++++++ docs/source/butterfree.load.processing.rst | 1 - docs/source/butterfree.load.rst | 1 - docs/source/butterfree.load.writers.rst | 1 - ...tterfree.migrations.database_migration.rst | 31 +++++++++++++++++ docs/source/butterfree.migrations.rst | 18 ++++++++++ docs/source/butterfree.pipelines.rst | 1 - docs/source/butterfree.reports.rst | 1 - docs/source/butterfree.rst | 2 ++ docs/source/butterfree.transform.features.rst | 1 - docs/source/butterfree.transform.rst | 1 - .../butterfree.transform.transformations.rst | 1 - ...transformations.user_defined_functions.rst | 1 - docs/source/butterfree.transform.utils.rst | 1 - docs/source/butterfree.validations.rst | 1 - docs/source/cli.md | 32 ++++++++++++++++++ docs/source/home.md | 6 ++++ docs/source/index.rst | 1 + requirements.dev.txt | 4 +++ 28 files changed, 181 insertions(+), 18 deletions(-) create mode 100644 docs/source/butterfree.hooks.rst create mode 100644 docs/source/butterfree.hooks.schema_compatibility.rst create mode 100644 docs/source/butterfree.migrations.database_migration.rst create mode 100644 docs/source/butterfree.migrations.rst create mode 100644 docs/source/cli.md diff --git a/docs/source/butterfree.clients.rst b/docs/source/butterfree.clients.rst index 3409d43a..1bfaa86d 100644 --- a/docs/source/butterfree.clients.rst +++ b/docs/source/butterfree.clients.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.configs.db.rst b/docs/source/butterfree.configs.db.rst index a9973c56..3bb9f8b8 100644 --- a/docs/source/butterfree.configs.db.rst +++ b/docs/source/butterfree.configs.db.rst @@ -23,12 +23,11 @@ Submodules :show-inheritance: -.. automodule:: butterfree.configs.db.s3_config +.. automodule:: butterfree.configs.db.metastore_config :members: :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index dc8a8c77..f3cf2aa2 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -19,6 +19,11 @@ Submodules :show-inheritance: +.. 
automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/butterfree.constants.rst b/docs/source/butterfree.constants.rst index 083d20d7..d0e72fed 100644 --- a/docs/source/butterfree.constants.rst +++ b/docs/source/butterfree.constants.rst @@ -17,12 +17,23 @@ Submodules :show-inheritance: +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + .. automodule:: butterfree.constants.spark_constants :members: :undoc-members: :show-inheritance: +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/butterfree.dataframe_service.rst b/docs/source/butterfree.dataframe_service.rst index b3c4cfc8..4343305b 100644 --- a/docs/source/butterfree.dataframe_service.rst +++ b/docs/source/butterfree.dataframe_service.rst @@ -5,12 +5,23 @@ Submodules ---------- -.. automodule:: butterfree.dataframe_service.repartition +.. automodule:: butterfree.dataframe_service.incremental_strategy + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.dataframe_service.partitioning :members: :undoc-members: :show-inheritance: +.. automodule:: butterfree.dataframe_service.repartition + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/butterfree.extract.pre_processing.rst b/docs/source/butterfree.extract.pre_processing.rst index 9420cd7e..172e6fb3 100644 --- a/docs/source/butterfree.extract.pre_processing.rst +++ b/docs/source/butterfree.extract.pre_processing.rst @@ -34,7 +34,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.extract.readers.rst b/docs/source/butterfree.extract.readers.rst index 6f7ee7b8..a67d47e9 100644 --- a/docs/source/butterfree.extract.readers.rst +++ b/docs/source/butterfree.extract.readers.rst @@ -28,7 +28,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.extract.rst b/docs/source/butterfree.extract.rst index 4454d6e9..a59d2e29 100644 --- a/docs/source/butterfree.extract.rst +++ b/docs/source/butterfree.extract.rst @@ -19,7 +19,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.hooks.rst b/docs/source/butterfree.hooks.rst new file mode 100644 index 00000000..72f13223 --- /dev/null +++ b/docs/source/butterfree.hooks.rst @@ -0,0 +1,33 @@ +butterfree.hooks package +======================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + butterfree.hooks.schema_compatibility + +Submodules +---------- + + +.. automodule:: butterfree.hooks.hook + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.hooks.hookable_component + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.hooks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.hooks.schema_compatibility.rst b/docs/source/butterfree.hooks.schema_compatibility.rst new file mode 100644 index 00000000..a39c5b93 --- /dev/null +++ b/docs/source/butterfree.hooks.schema_compatibility.rst @@ -0,0 +1,25 @@ +butterfree.hooks.schema\_compatibility package +============================================== + +Submodules +---------- + + +.. 
automodule:: butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.hooks.schema_compatibility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.load.processing.rst b/docs/source/butterfree.load.processing.rst index 79ae36b9..4c5d2a2e 100644 --- a/docs/source/butterfree.load.processing.rst +++ b/docs/source/butterfree.load.processing.rst @@ -10,7 +10,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.load.rst b/docs/source/butterfree.load.rst index 2498b6f2..e38934a5 100644 --- a/docs/source/butterfree.load.rst +++ b/docs/source/butterfree.load.rst @@ -19,7 +19,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.load.writers.rst b/docs/source/butterfree.load.writers.rst index 88aa9e64..6ff438de 100644 --- a/docs/source/butterfree.load.writers.rst +++ b/docs/source/butterfree.load.writers.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.migrations.database_migration.rst b/docs/source/butterfree.migrations.database_migration.rst new file mode 100644 index 00000000..892165df --- /dev/null +++ b/docs/source/butterfree.migrations.database_migration.rst @@ -0,0 +1,31 @@ +butterfree.migrations.database\_migration package +================================================= + +Submodules +---------- + + +.. automodule:: butterfree.migrations.database_migration.cassandra_migration + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.migrations.database_migration.database_migration + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.migrations.database_migration.metastore_migration + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.migrations.database_migration + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.migrations.rst b/docs/source/butterfree.migrations.rst new file mode 100644 index 00000000..4770fd8e --- /dev/null +++ b/docs/source/butterfree.migrations.rst @@ -0,0 +1,18 @@ +butterfree.migrations package +============================= + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + butterfree.migrations.database_migration + +Module contents +--------------- + +.. 
automodule:: butterfree.migrations + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.pipelines.rst b/docs/source/butterfree.pipelines.rst index d5c65f4d..e0c31996 100644 --- a/docs/source/butterfree.pipelines.rst +++ b/docs/source/butterfree.pipelines.rst @@ -10,7 +10,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.reports.rst b/docs/source/butterfree.reports.rst index d49a701d..850db914 100644 --- a/docs/source/butterfree.reports.rst +++ b/docs/source/butterfree.reports.rst @@ -10,7 +10,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.rst b/docs/source/butterfree.rst index 76e664b4..0828f921 100644 --- a/docs/source/butterfree.rst +++ b/docs/source/butterfree.rst @@ -12,7 +12,9 @@ Subpackages butterfree.constants butterfree.dataframe_service butterfree.extract + butterfree.hooks butterfree.load + butterfree.migrations butterfree.pipelines butterfree.reports butterfree.testing diff --git a/docs/source/butterfree.transform.features.rst b/docs/source/butterfree.transform.features.rst index e4c9a926..f6c69095 100644 --- a/docs/source/butterfree.transform.features.rst +++ b/docs/source/butterfree.transform.features.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.rst b/docs/source/butterfree.transform.rst index 26d18093..02f8d4c6 100644 --- a/docs/source/butterfree.transform.rst +++ b/docs/source/butterfree.transform.rst @@ -26,7 +26,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.transformations.rst b/docs/source/butterfree.transform.transformations.rst index 870c8468..0978edcf 100644 --- a/docs/source/butterfree.transform.transformations.rst +++ b/docs/source/butterfree.transform.transformations.rst @@ -54,7 +54,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.transformations.user_defined_functions.rst b/docs/source/butterfree.transform.transformations.user_defined_functions.rst index becc5d6e..f93c7e98 100644 --- a/docs/source/butterfree.transform.transformations.user_defined_functions.rst +++ b/docs/source/butterfree.transform.transformations.user_defined_functions.rst @@ -16,7 +16,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.utils.rst b/docs/source/butterfree.transform.utils.rst index bd8c1532..82e9038b 100644 --- a/docs/source/butterfree.transform.utils.rst +++ b/docs/source/butterfree.transform.utils.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.validations.rst b/docs/source/butterfree.validations.rst index 9fd01557..35f5d199 100644 --- a/docs/source/butterfree.validations.rst +++ b/docs/source/butterfree.validations.rst @@ -16,7 +16,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/cli.md b/docs/source/cli.md new file mode 100644 index 00000000..ba07428f --- /dev/null +++ b/docs/source/cli.md @@ -0,0 +1,32 @@ +# Command-line Interface (CLI) + +Butterfree has now a command-line interface, introduced with the new automatic migration ability. 
+ +As soon as you install butterfree, you can check what's available through butterfree's cli with: + +```shell +$~ butterfree --help +``` + +### Automated Database Schema Migration + +When developing your feature sets, you need also to prepare your database for the changes +to come into your Feature Store. Normally, when creating a new feature set, you needed +to manually create a new table in cassandra. Or, when creating a new feature in an existing +feature set, you needed to create new column in cassandra too. + +Now, you can just use `butterfree migrate apply ...`, butterfree will scan your python +files, looking for classes that inherit from `butterfree.pipelines.FeatureSetPipeline`, +then compare its schema with the database schema where the feature set would be written. +Then it will prepare migration queries and run against the databases. + +For more information, please, check `butterfree migrate apply --help` :) + +### Supported databases + +This functionality currently supports only the **Cassandra** database, which is the default +storage for an Online Feature Store built with Butterfree. Nonetheless, it was made with +the intent to be easily extended for other databases. + +Also, each database has its own rules for schema migration commands. Some changes may +still require manual interference. \ No newline at end of file diff --git a/docs/source/home.md b/docs/source/home.md index eada1739..fc297d2b 100644 --- a/docs/source/home.md +++ b/docs/source/home.md @@ -10,6 +10,7 @@ The main idea is for this repository to be a set of tools for easing [ETLs](http - [Load](#load) - [Streaming](#streaming) - [Setup Configuration](#setup-configuration) +- [Command-line Interface](#command-line-interface) ## What is going on here @@ -61,3 +62,8 @@ We also support streaming pipelines in Butterfree. More information is available ## Setup Configuration Some configurations are needed to run your ETL pipelines. Detailed information is provided at the [Configuration Section](configuration.md) + +## Command-line Interface + +Butterfree has its own command-line interface, to manage your feature sets. Detailed information +provided by the [Command-line Interface](cli.md) section. diff --git a/docs/source/index.rst b/docs/source/index.rst index 6548f9ad..12bf1609 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,3 +22,4 @@ Navigation stream configuration modules + cli diff --git a/requirements.dev.txt b/requirements.dev.txt index 8ebfa510..96ddefc1 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -5,3 +5,7 @@ jupyter==1.0.0 twine==3.1.1 mypy==0.790 pyspark-stubs==3.0.0 +sphinx==3.5.4 +sphinxemoji==0.1.8 +sphinx-rtd-theme==0.5.2 +recommonmark==0.7.1 \ No newline at end of file From 5a0a62244b4c1ac3fe6e199575141bddff5d710e Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 10 May 2021 17:43:08 -0300 Subject: [PATCH 38/86] [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. 
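Before the diff below: the `butterfree migrate apply` command documented in #320 above, together with the debug-mode flag introduced in this patch, can also be driven programmatically, which is how the unit tests further below exercise it. A minimal sketch, assuming only the `migrate` function shown in these diffs (the path is a placeholder):

```python
# Illustrative only: programmatic equivalent of `butterfree migrate apply`.
from butterfree._cli import migrate

# Scans the given path for FeatureSetPipeline objects and runs their migrations.
# debug_mode=True (added in this patch) prints the generated queries instead of
# executing them; generate_logs=True keeps the log file locally ('logging.json')
# instead of uploading it to S3.
pipelines = migrate.migrate(
    path="path/to/your/feature_set_pipelines/",  # placeholder path
    generate_logs=False,
    debug_mode=True,
)
print(f"Scanned {len(pipelines)} feature set pipeline(s).")
```

The `butterfree migrate apply` CLI command is a thin wrapper around exactly this call.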
--- butterfree/_cli/migrate.py | 35 +++++++++++++------ .../database_migration/database_migration.py | 21 ++++++++--- setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 6 ++-- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index ebd21142..bfa18b46 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -101,6 +101,11 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: False, help="To generate the logs in local file 'logging.json'." ) +DEBUG_MODE = typer.Option( + False, + help="To view the queries resulting from the migration, DON'T apply the migration.", +) + class Migrate: """Execute migration operations in a Database based on pipeline Writer. @@ -112,7 +117,7 @@ class Migrate: def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: self.pipelines = pipelines - def _send_logs_to_s3(self, file_local: bool) -> None: + def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None: """Send all migration logs to S3.""" file_name = "../logging.json" @@ -120,11 +125,19 @@ def _send_logs_to_s3(self, file_local: bool) -> None: s3_client = boto3.client("s3") timestamp = datetime.datetime.now() - object_name = ( - f"logs/migrate/" - f"{timestamp.strftime('%Y-%m-%d')}" - f"/logging-{timestamp.strftime('%H:%M:%S')}.json" - ) + + if debug_mode: + object_name = ( + f"logs/migrate-debug-mode/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" + ) + else: + object_name = ( + f"logs/migrate/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" + ) bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") try: @@ -143,23 +156,23 @@ def _send_logs_to_s3(self, file_local: bool) -> None: json_data = json.load(json_f) print(json_data) - def run(self, generate_logs: bool = False) -> None: + def run(self, generate_logs: bool = False, debug_mode: bool = False) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: db = writer.db_config.database if db == "cassandra": migration = ALLOWED_DATABASE[db] - migration.apply_migration(pipeline.feature_set, writer) + migration.apply_migration(pipeline.feature_set, writer, debug_mode) else: logger.warning(f"Butterfree not supporting {db} Migrations yet.") - self._send_logs_to_s3(generate_logs) + self._send_logs_to_s3(generate_logs, debug_mode) @app.command("apply") def migrate( - path: str = PATH, generate_logs: bool = GENERATE_LOGS, + path: str = PATH, generate_logs: bool = GENERATE_LOGS, debug_mode: bool = DEBUG_MODE ) -> Set[FeatureSetPipeline]: """Scan and run database migrations for feature set pipelines defined under PATH. @@ -172,5 +185,5 @@ def migrate( import and instantiate them. 
""" pipe_set = __fs_objects(path) - Migrate(pipe_set).run(generate_logs) + Migrate(pipe_set).run(generate_logs, debug_mode) return pipe_set diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index de6b2f80..40192ff7 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -260,12 +260,15 @@ def _get_schema( db_schema = [] return db_schema - def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: + def apply_migration( + self, feature_set: FeatureSet, writer: Writer, debug_mode: bool + ) -> None: """Apply the migration in the respective database. Args: feature_set: the feature set. writer: the writer being used to load the feature set. + debug_mode: if active, it brings up the queries generated. """ logger.info(f"Migrating feature set: {feature_set.name}") @@ -280,8 +283,16 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: fs_schema, table_name, db_schema, writer.write_to_entity ) - for q in queries: - logger.info(f"Applying this query: {q} ...") - self._client.sql(q) + if debug_mode: + print( + "#### DEBUG MODE ###\n" + f"Feature set: {feature_set.name}\n" + "Queries:\n" + f"{queries}" + ) + else: + for q in queries: + logger.info(f"Applying this query: {q} ...") + self._client.sql(q) - logger.info(f"Feature Set migration finished successfully.") + logger.info(f"Feature Set migration finished successfully.") diff --git a/setup.py b/setup.py index 6c2c2b46..56cf8842 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev16" +__version__ = "1.2.0.dev17" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 475db15f..c0751c88 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -17,16 +17,16 @@ def test_migrate_success(self, mocker): assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] - def test_migrate_all_pairs(self, mocker): + def test_migrate_run_methods(self, mocker): mocker.patch.object(CassandraMigration, "apply_migration") mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") - all_fs = migrate.migrate("tests/mocks/entities/") + all_fs = migrate.migrate("tests/mocks/entities/", False, False) assert CassandraMigration.apply_migration.call_count == 2 cassandra_pairs = [ - call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs + call(pipe.feature_set, pipe.sink.writers[1], False) for pipe in all_fs ] CassandraMigration.apply_migration.assert_has_calls( cassandra_pairs, any_order=True From b1371f1201235973ba8b98048f676c3fe7071499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Brand=C3=A3o?= <37742275+GaBrandao@users.noreply.github.com> Date: Wed, 2 Jun 2021 15:02:23 -0300 Subject: [PATCH 39/86] [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import --- butterfree/_cli/migrate.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index bfa18b46..277ecf3c 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,7 
+1,6 @@ import datetime import importlib import inspect -import json import os import pkgutil import sys @@ -151,10 +150,10 @@ def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None: raise os.remove(file_name) + elif os.path.exists(file_name): + print("Logs written to ../logging.json") else: - with open(file_name, "r") as json_f: - json_data = json.load(json_f) - print(json_data) + print("No logs were generated.") def run(self, generate_logs: bool = False, debug_mode: bool = False) -> None: """Construct and apply the migrations.""" From acf7022bfccc0ddc38cf0c51fff6e8b8086cc47a Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 2 Jun 2021 16:38:21 -0300 Subject: [PATCH 40/86] [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. --- .../database_migration/database_migration.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 40192ff7..aeec4a6e 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,5 +1,4 @@ """Migration entity.""" -import logging from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto @@ -141,12 +140,7 @@ def _get_queries( ) queries.append(alter_table_add_query) if drop_items: - if write_on_entity: - logging.info( - "Features will not be dropped automatically " - "when data is loaded to an entity table" - ) - else: + if not write_on_entity: drop_columns_query = self._get_alter_table_drop_query( drop_items, table_name ) @@ -158,7 +152,9 @@ def _get_queries( ) queries.append(alter_column_types_query) if alter_key_items: - logger.info("This operation is not supported by Spark.") + logger.warning( + "The 'change the primary key column' action is not supported by Spark." + ) return queries @@ -217,6 +213,11 @@ def _get_diff( for db_item in db_schema: if fs_item.get("column_name") == db_item.get("column_name"): if fs_item.get("type") != db_item.get("type"): + if fs_item.get("primary_key") is True: + logger.warning( + "Type changes are not applied to " + "columns that are the primary key." + ) alter_type_columns.update( {fs_item.get("column_name"): fs_item.get("type")} ) @@ -296,3 +297,6 @@ def apply_migration( self._client.sql(q) logger.info(f"Feature Set migration finished successfully.") + + # inform in drone console which feature set was migrated + print(f"The {feature_set.name} feature set was migrated.") From d0bf61adb1b48db6eb6cd2e4e71c019b3c43d589 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Fri, 4 Jun 2021 10:15:39 -0300 Subject: [PATCH 41/86] Fix method to generate agg feature name. 
(#326) --- butterfree/transform/transformations/aggregated_transform.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/butterfree/transform/transformations/aggregated_transform.py b/butterfree/transform/transformations/aggregated_transform.py index 2c7a8ced..7304f34b 100644 --- a/butterfree/transform/transformations/aggregated_transform.py +++ b/butterfree/transform/transformations/aggregated_transform.py @@ -88,7 +88,7 @@ def _get_output_name(self, function: object) -> str: """ ) - base_name = "__".join([self._parent.name, function.__name__]) + base_name = "__".join([self._parent.name, str(function.__name__).lower()]) return base_name @property diff --git a/setup.py b/setup.py index 56cf8842..daaa264b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev17" +__version__ = "1.2.0.dev18" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 1cf0dbde30c26345926c4ca8b533c87da4579cd7 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Thu, 10 Jun 2021 10:59:48 -0300 Subject: [PATCH 42/86] [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. --- .../historical_feature_store_writer.py | 46 ++++--------------- setup.py | 2 +- .../test_historical_feature_store_writer.py | 21 ++++----- 3 files changed, 21 insertions(+), 48 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index c4344041..489f22be 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -144,15 +144,17 @@ def write( dataframe = self._apply_transformations(dataframe) if self.interval_mode: - if self.debug_mode: - spark_client.create_temporary_view( - dataframe=dataframe, - name=f"historical_feature_store__{feature_set.name}", + partition_overwrite_mode = spark_client.conn.conf.get( + "spark.sql.sources.partitionOverwriteMode" + ).lower() + + if partition_overwrite_mode != "dynamic": + raise RuntimeError( + "m=load_incremental_table, " + "spark.sql.sources.partitionOverwriteMode={}, " + "msg=partitionOverwriteMode have to " + "be configured to 'dynamic'".format(partition_overwrite_mode) ) - return - - self._incremental_mode(feature_set, dataframe, spark_client) - return if self.debug_mode: spark_client.create_temporary_view( @@ -171,34 +173,6 @@ def write( **self.db_config.get_options(s3_key), ) - def _incremental_mode( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient - ) -> None: - - partition_overwrite_mode = spark_client.conn.conf.get( - "spark.sql.sources.partitionOverwriteMode" - ).lower() - - if partition_overwrite_mode != "dynamic": - raise RuntimeError( - "m=load_incremental_table, " - "spark.sql.sources.partitionOverwriteMode={}, " - "msg=partitionOverwriteMode have to be configured to 'dynamic'".format( - partition_overwrite_mode - ) - ) - - s3_key = os.path.join("historical", feature_set.entity, feature_set.name) - options = {"path": self.db_config.get_options(s3_key).get("path")} - - spark_client.write_dataframe( - dataframe=dataframe, - format_=self.db_config.format_, - mode=self.db_config.mode, - **options, - partitionBy=self.PARTITION_BY, - ) - def _assert_validation_count( self, 
table_name: str, written_count: int, dataframe_count: int ) -> None: diff --git a/setup.py b/setup.py index daaa264b..348bdbfe 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev18" +__version__ = "1.2.0.dev19" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index 8bab23ba..9e84aacd 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -51,7 +51,7 @@ def test_write_interval_mode( ): # given spark_client = SparkClient() - spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.write_table = mocker.stub("write_table") spark_client.conn.conf.set( "spark.sql.sources.partitionOverwriteMode", "dynamic" ) @@ -63,21 +63,15 @@ def test_write_interval_mode( dataframe=feature_set_dataframe, spark_client=spark_client, ) - result_df = spark_client.write_dataframe.call_args[1]["dataframe"] + result_df = spark_client.write_table.call_args[1]["dataframe"] # then assert_dataframe_equality(historical_feature_set_dataframe, result_df) + assert writer.database == spark_client.write_table.call_args[1]["database"] + assert feature_set.name == spark_client.write_table.call_args[1]["table_name"] assert ( - writer.db_config.format_ - == spark_client.write_dataframe.call_args[1]["format_"] - ) - assert ( - writer.db_config.mode == spark_client.write_dataframe.call_args[1]["mode"] - ) - assert ( - writer.PARTITION_BY - == spark_client.write_dataframe.call_args[1]["partitionBy"] + writer.PARTITION_BY == spark_client.write_table.call_args[1]["partition_by"] ) def test_write_interval_mode_invalid_partition_mode( @@ -130,9 +124,14 @@ def test_write_in_debug_mode_with_interval_mode( historical_feature_set_dataframe, feature_set, spark_session, + mocker, ): # given spark_client = SparkClient() + spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.conn.conf.set( + "spark.sql.sources.partitionOverwriteMode", "dynamic" + ) writer = HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True) # when From 9f42f53fc8f53b3d0c50d70b0e588c6b4c4a1611 Mon Sep 17 00:00:00 2001 From: Jay Vala <24193355+jdvala@users.noreply.github.com> Date: Wed, 16 Jun 2021 16:43:52 +0200 Subject: [PATCH 43/86] Add the missing link for H3 geohash (#330) * Add the missing link for H3 geohash * Update the H3 geohash link. 
* Update the same link Update the same link in in spark_function_and_window.ipynb example --- examples/simple_feature_set/simple_feature_set.ipynb | 2 +- .../spark_function_and_window/spark_function_and_window.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/simple_feature_set/simple_feature_set.ipynb b/examples/simple_feature_set/simple_feature_set.ipynb index b217fcdf..c5ed9ae5 100644 --- a/examples/simple_feature_set/simple_feature_set.ipynb +++ b/examples/simple_feature_set/simple_feature_set.ipynb @@ -89,7 +89,7 @@ "| - | - | - | - | - | - | - | - | - | - | - | - | - | - |\n", "| int | timestamp | float | float | int | int | float | float | float | double | double | string | string | string |\n", "\n", - "For more information about H3 geohash click [here]()\n", + "For more information about H3 geohash click [here](https://h3geo.org/docs/)\n", "\n", "The following code blocks will show how to generate this feature set using Butterfree library:\n", "\n" diff --git a/examples/spark_function_and_window/spark_function_and_window.ipynb b/examples/spark_function_and_window/spark_function_and_window.ipynb index a4472e24..dcf71552 100644 --- a/examples/spark_function_and_window/spark_function_and_window.ipynb +++ b/examples/spark_function_and_window/spark_function_and_window.ipynb @@ -50,7 +50,7 @@ "\n", "Note that we're going to compute two aggregated features, rent average and standard deviation, considering the two last occurrences (or events). It'd also be possible to define time windows, instead of windows based on events.\n", "\n", - "For more information about H3 geohash click [here]().\n", + "For more information about H3 geohash click [here](https://h3geo.org/docs/).\n", "\n", "The following code blocks will show how to generate this feature set using Butterfree library:\n", "\n" From 78927e317ae788334a8293df08a1596ff350a83f Mon Sep 17 00:00:00 2001 From: Rodrigo Martins de Oliveira Date: Fri, 30 Jul 2021 15:56:45 -0300 Subject: [PATCH 44/86] Update README.md (#331) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 728f7b02..7b93f000 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ To learn how to use Butterfree in practice, see [Butterfree's notebook examples] ## Requirements and Installation Butterfree depends on **Python 3.7+** and it is **Spark 3.0 ready** :heavy_check_mark: -[Python Package Index](https://quintoandar.github.io/python-package-server/) hosts reference to a pip-installable module of this library, using it is as straightforward as including it on your project's requirements. +[PyPI hosts reference to a pip-installable module of this library](https://pypi.org/project/butterfree/), using it is as straightforward as including it on your project's requirements. 
```bash pip install butterfree From 43bb3a336a9ec900fcdf86971f48b56f2f2389d0 Mon Sep 17 00:00:00 2001 From: Lucas Fonseca Date: Mon, 22 Aug 2022 13:57:54 -0300 Subject: [PATCH 45/86] Update Github Actions Workflow runner (#332) * Update Workflow runner version * bump flake8-bandit * chore: bypass false positive for S105 Co-authored-by: Lucas Cardozo --- .github/workflows/publish.yml | 2 +- .github/workflows/staging.yml | 2 +- .github/workflows/test.yml | 2 +- requirements.lint.txt | 3 ++- tests/unit/butterfree/configs/db/test_cassandra_config.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3620cdbb..f981921e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -9,7 +9,7 @@ jobs: Pipeline: if: github.ref == 'refs/heads/master' - runs-on: ubuntu-16.04 + runs-on: ubuntu-22.04 container: quintoandar/python-3-7-java steps: diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 8b39e5ac..1f94fc5d 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -9,7 +9,7 @@ jobs: Pipeline: if: github.ref == 'refs/heads/staging' - runs-on: ubuntu-16.04 + runs-on: ubuntu-22.04 container: quintoandar/python-3-7-java steps: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b39246fd..d7c1c3ac 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,7 +9,7 @@ on: jobs: Pipeline: - runs-on: ubuntu-16.04 + runs-on: ubuntu-22.04 container: quintoandar/python-3-7-java steps: diff --git a/requirements.lint.txt b/requirements.lint.txt index 161f7911..7c51f4b3 100644 --- a/requirements.lint.txt +++ b/requirements.lint.txt @@ -4,4 +4,5 @@ flake8-isort==2.8.0 isort<5 # temporary fix flake8-docstrings==1.5.0 flake8-bugbear==20.1.0 -flake8-bandit==2.1.2 +flake8-bandit==3.0.0 + diff --git a/tests/unit/butterfree/configs/db/test_cassandra_config.py b/tests/unit/butterfree/configs/db/test_cassandra_config.py index d34c8e9f..fa907a07 100644 --- a/tests/unit/butterfree/configs/db/test_cassandra_config.py +++ b/tests/unit/butterfree/configs/db/test_cassandra_config.py @@ -230,6 +230,6 @@ def test_set_credentials_on_instantiation(self): username="username", password="password", host="host", keyspace="keyspace" ) assert cassandra_config.username == "username" - assert cassandra_config.password == "password" + assert cassandra_config.password == "password" # noqa: S105 assert cassandra_config.host == "host" assert cassandra_config.keyspace == "keyspace" From 2593839d7fdd092a865795faa9d9e0a49cfc6f2c Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 20 Dec 2022 13:45:52 -0300 Subject: [PATCH 46/86] Delete sphinx version. 
(#334) --- docs/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 501e17cd..a20ab18f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ recommonmark==0.6.0 -Sphinx==3.1.1 sphinx-rtd-theme==0.4.3 sphinxemoji==0.1.6 typing-extensions==3.7.4.2 From 35bcd30af981a960bc7c79c47e6a25dbed729f6c Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 21 Dec 2022 11:36:24 -0300 Subject: [PATCH 47/86] Update files to staging (#336) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. * [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. 
* Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. (#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: hmeretti Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 679e9834..e7f7004b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,34 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
## [Unreleased] + + +## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) +* [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) +* Allow slide selection ([#293](https://github.com/quintoandar/butterfree/pull/293)) +* [MLOP-637] Implement diff method ([#292](https://github.com/quintoandar/butterfree/pull/292)) +* [MLOP-640] Create CLI with migrate command ([#298](https://github.com/quintoandar/butterfree/pull/298)) +* [MLOP-645] Implement query method, cassandra ([#291](https://github.com/quintoandar/butterfree/pull/291)) +* [MLOP-671] Implement get_schema on Spark client ([#301](https://github.com/quintoandar/butterfree/pull/301)) +* [MLOP-648] Implement query method, metastore ([#294](https://github.com/quintoandar/butterfree/pull/294)) +* [MLOP-647] / [MLOP-646] Apply migrations ([#300](https://github.com/quintoandar/butterfree/pull/300)) +* [MLOP-639] Track logs in S3 ([#306](https://github.com/quintoandar/butterfree/pull/306)) +* [MLOP-702] Debug mode for Automate Migration ([#322](https://github.com/quintoandar/butterfree/pull/322)) + +### Changed +* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* Read and write consistency level options ([#309](https://github.com/quintoandar/butterfree/pull/309)) +* [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree ([#327](https://github.com/quintoandar/butterfree/pull/327)) + +### Fixed +* [BUG] Apply create_partitions to historical validate ([#303](https://github.com/quintoandar/butterfree/pull/303)) +* [BUG] Fix key path for validate read ([#304](https://github.com/quintoandar/butterfree/pull/304)) +* [FIX] Add Partition types for Metastore ([#305](https://github.com/quintoandar/butterfree/pull/305)) +* Change solution for tracking logs ([#308](https://github.com/quintoandar/butterfree/pull/308)) +* [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) +* Fix method to generate agg feature name. ([#326](https://github.com/quintoandar/butterfree/pull/326)) ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) ### Added diff --git a/setup.py b/setup.py index 348bdbfe..b120a1ca 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev19" +__version__ = "1.2.0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 3a73ed83608772083d79e70f74c9636823645f8d Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 2 Jan 2023 13:49:06 -0300 Subject: [PATCH 48/86] Revert "Update files to staging (#336)" (#337) This reverts commit 35bcd30af981a960bc7c79c47e6a25dbed729f6c. --- CHANGELOG.md | 26 -------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7f7004b..679e9834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,34 +4,8 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
## [Unreleased] - - -## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) -* [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) -* Allow slide selection ([#293](https://github.com/quintoandar/butterfree/pull/293)) -* [MLOP-637] Implement diff method ([#292](https://github.com/quintoandar/butterfree/pull/292)) -* [MLOP-640] Create CLI with migrate command ([#298](https://github.com/quintoandar/butterfree/pull/298)) -* [MLOP-645] Implement query method, cassandra ([#291](https://github.com/quintoandar/butterfree/pull/291)) -* [MLOP-671] Implement get_schema on Spark client ([#301](https://github.com/quintoandar/butterfree/pull/301)) -* [MLOP-648] Implement query method, metastore ([#294](https://github.com/quintoandar/butterfree/pull/294)) -* [MLOP-647] / [MLOP-646] Apply migrations ([#300](https://github.com/quintoandar/butterfree/pull/300)) -* [MLOP-639] Track logs in S3 ([#306](https://github.com/quintoandar/butterfree/pull/306)) -* [MLOP-702] Debug mode for Automate Migration ([#322](https://github.com/quintoandar/butterfree/pull/322)) - -### Changed -* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) -* Read and write consistency level options ([#309](https://github.com/quintoandar/butterfree/pull/309)) -* [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree ([#327](https://github.com/quintoandar/butterfree/pull/327)) - -### Fixed -* [BUG] Apply create_partitions to historical validate ([#303](https://github.com/quintoandar/butterfree/pull/303)) -* [BUG] Fix key path for validate read ([#304](https://github.com/quintoandar/butterfree/pull/304)) -* [FIX] Add Partition types for Metastore ([#305](https://github.com/quintoandar/butterfree/pull/305)) -* Change solution for tracking logs ([#308](https://github.com/quintoandar/butterfree/pull/308)) -* [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) -* Fix method to generate agg feature name. 
([#326](https://github.com/quintoandar/butterfree/pull/326)) ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) ### Added diff --git a/setup.py b/setup.py index b120a1ca..348bdbfe 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0" +__version__ = "1.2.0.dev19" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 6b78a505cb29148e1494a1018b8b56af8d6062fe Mon Sep 17 00:00:00 2001 From: Lucas Cardozo Date: Wed, 16 Aug 2023 21:54:42 +0100 Subject: [PATCH 49/86] Less strict requirements (#333) * bump a few requirements; increase lower bound for h3 version range; adding pyarrow dev dependency * fix type repr for spark types; fix: broken tests (pyspark 3.4) --------- Co-authored-by: Ralph Rassweiler --- Makefile | 6 ++--- butterfree/configs/db/cassandra_config.py | 2 +- butterfree/reports/metadata.py | 4 +-- requirements.dev.txt | 8 +++--- requirements.txt | 7 +++--- setup.cfg | 1 + setup.py | 2 +- .../pipelines/test_feature_set_pipeline.py | 25 +++++++++++-------- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 95cc6e3a..4109504f 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,8 @@ VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d .PHONY: environment ## create virtual environment for butterfree environment: - @pyenv install -s 3.7.6 - @pyenv virtualenv 3.7.6 butterfree + @pyenv install -s 3.7.13 + @pyenv virtualenv 3.7.13 butterfree @pyenv local butterfree @PYTHONPATH=. python -m pip install --upgrade pip @@ -221,4 +221,4 @@ help: } \ printf "\n"; \ }' \ - | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') \ No newline at end of file + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index 3d94e756..a038cb17 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -246,7 +246,7 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: cassandra_schema.append( { "column_name": features["column_name"], - "type": cassandra_mapping[str(features["type"])], + "type": cassandra_mapping[str(features["type"]).replace("()", "")], "primary_key": features["primary_key"], } ) diff --git a/butterfree/reports/metadata.py b/butterfree/reports/metadata.py index d54bbba9..dc1f7cbb 100644 --- a/butterfree/reports/metadata.py +++ b/butterfree/reports/metadata.py @@ -162,7 +162,7 @@ def to_json(self) -> Any: "features": [ { "column_name": c["column_name"], - "data_type": str(c["type"]), + "data_type": str(c["type"]).replace("()", ""), "description": desc, } for c, desc in params._features @@ -208,7 +208,7 @@ def to_markdown(self) -> Any: features = ["Column name", "Data type", "Description"] for c, desc in params._features: - features.extend([c["column_name"], str(c["type"]), desc]) + features.extend([c["column_name"], str(c["type"]).replace("()", ""), desc]) count_rows = len(features) // 3 diff --git a/requirements.dev.txt b/requirements.dev.txt index 96ddefc1..3d70d4c0 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,11 +1,9 @@ -cmake==3.18.4 -h3==3.7.0 -pyarrow==0.15.1 +h3==3.7.4 jupyter==1.0.0 twine==3.1.1 mypy==0.790 -pyspark-stubs==3.0.0 sphinx==3.5.4 sphinxemoji==0.1.8 sphinx-rtd-theme==0.5.2 -recommonmark==0.7.1 \ 
No newline at end of file +recommonmark==0.7.1 +pyarrow>=1.0.0 diff --git a/requirements.txt b/requirements.txt index 9548edb3..d61d125b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ cassandra-driver>=3.22.0,<4.0 mdutils>=1.2.2,<2.0 -pandas>=0.24,<1.1 +pandas>=0.24,<2.0 parameters-validation>=1.1.5,<2.0 pyspark==3.* typer>=0.3,<0.4 -setuptools>=41,<42 -typing-extensions==3.7.4.3 -boto3==1.17.* \ No newline at end of file +typing-extensions>3.7.4,<5 +boto3==1.17.* diff --git a/setup.cfg b/setup.cfg index 255fff84..cff00122 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,7 @@ spark_options = spark.sql.session.timeZone: UTC spark.driver.bindAddress: 127.0.0.1 spark.sql.legacy.timeParserPolicy: LEGACY + spark.sql.legacy.createHiveTableByDefault: false [mypy] # suppress errors about unsatisfied imports diff --git a/setup.py b/setup.py index 348bdbfe..0029a78b 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ license="Copyright", author="QuintoAndar", install_requires=requirements, - extras_require={"h3": ["cmake==3.16.3", "h3==3.4.2"]}, + extras_require={"h3": ["h3>=3.7.4,<4"]}, python_requires=">=3.7, <4", entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, include_package_data=True, diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index 753dfe7c..d67e0a38 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -77,9 +77,11 @@ def test_feature_set_pipeline( self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe, ): # arrange + table_reader_id = "a_source" table_reader_table = "table" table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE") + create_temp_view(dataframe=mocked_df, name=table_reader_id) create_db_and_table( spark=spark_session, @@ -88,14 +90,16 @@ def test_feature_set_pipeline( table_reader_table=table_reader_table, ) - dbconfig = Mock() - dbconfig.mode = "overwrite" - dbconfig.format_ = "parquet" + path = "test_folder/historical/entity/feature_set" + + dbconfig = MetastoreConfig() dbconfig.get_options = Mock( - return_value={"path": "test_folder/historical/entity/feature_set"} + return_value={"mode": "overwrite", "format_": "parquet", "path": path} ) - historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig) + historical_writer = HistoricalFeatureStoreWriter( + db_config=dbconfig, debug_mode=True + ) # act test_pipeline = FeatureSetPipeline( @@ -151,9 +155,13 @@ def test_feature_set_pipeline( ) test_pipeline.run() + # act and assert + dbconfig.get_path_with_partitions = Mock( + return_value=["historical/entity/feature_set"] + ) + # assert - path = dbconfig.get_options("historical/entity/feature_set").get("path") - df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN) + df = spark_session.sql("select * from historical_feature_store__feature_set") target_df = fixed_windows_output_feature_set_dataframe.orderBy( test_pipeline.feature_set.timestamp_column @@ -162,9 +170,6 @@ def test_feature_set_pipeline( # assert assert_dataframe_equality(df, target_df) - # tear down - shutil.rmtree("test_folder") - def test_feature_set_pipeline_with_dates( self, mocked_date_df, From 2a1900976634b79209e7b7116007df13c00e38d5 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 18 Aug 2023 14:11:52 -0300 Subject: [PATCH 50/86] feat: optional row count validation (#340) --- 
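Note (placed above the diffstat, where git am ignores it): a minimal, hedged sketch of how the flag introduced by this commit could be used. The diff below makes Sink.validate() skip any writer whose row_count_validation attribute is False, so per-writer row count checks become opt-out. The sketch assumes the flag is also exposed on HistoricalFeatureStoreWriter, as done in the follow-up "fix: parameter, libs" patch in this series; it is illustrative only and not part of the change itself.

    from butterfree.load import Sink
    from butterfree.load.writers import HistoricalFeatureStoreWriter

    # Writer created without row count validation: Sink.validate() will no
    # longer compare written vs. source row counts for this writer.
    # (Assumption: HistoricalFeatureStoreWriter forwards row_count_validation
    # to the base Writer, per the follow-up patch in this series.)
    historical_writer = HistoricalFeatureStoreWriter(row_count_validation=False)

    # Writers that keep the default (row_count_validation=True) are still
    # validated; any AssertionError they raise is collected and re-raised
    # by the Sink as a RuntimeError.
    sink = Sink(writers=[historical_writer])
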
butterfree/load/sink.py | 17 +++++++++-------- butterfree/load/writers/writer.py | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/butterfree/load/sink.py b/butterfree/load/sink.py index 0b0c10c9..7c0328d6 100644 --- a/butterfree/load/sink.py +++ b/butterfree/load/sink.py @@ -69,14 +69,15 @@ def validate( """ failures = [] for writer in self.writers: - try: - writer.validate( - feature_set=feature_set, - dataframe=dataframe, - spark_client=spark_client, - ) - except AssertionError as e: - failures.append(e) + if writer.row_count_validation: + try: + writer.validate( + feature_set=feature_set, + dataframe=dataframe, + spark_client=spark_client, + ) + except AssertionError as e: + failures.append(e) if failures: raise RuntimeError( diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index e12a4317..5073f472 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -26,6 +26,7 @@ def __init__( debug_mode: bool = False, interval_mode: bool = False, write_to_entity: bool = False, + row_count_validation: bool = True, ) -> None: super().__init__() self.db_config = db_config @@ -33,6 +34,7 @@ def __init__( self.debug_mode = debug_mode self.interval_mode = interval_mode self.write_to_entity = write_to_entity + self.row_count_validation = row_count_validation def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any From ca1a16d5cb7f2a1800084160ecd472c5303918e7 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 18 Aug 2023 17:04:51 -0300 Subject: [PATCH 51/86] fix: parameter, libs (#341) --- butterfree/load/writers/historical_feature_store_writer.py | 7 ++++++- requirements.dev.txt | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 489f22be..0ea9b50c 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -113,9 +113,14 @@ def __init__( debug_mode: bool = False, interval_mode: bool = False, check_schema_hook: Hook = None, + row_count_validation: bool = True, ): super(HistoricalFeatureStoreWriter, self).__init__( - db_config or MetastoreConfig(), debug_mode, interval_mode + db_config or MetastoreConfig(), + debug_mode, + interval_mode, + False, + row_count_validation, ) self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" diff --git a/requirements.dev.txt b/requirements.dev.txt index 3d70d4c0..abc64e3f 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -7,3 +7,5 @@ sphinxemoji==0.1.8 sphinx-rtd-theme==0.5.2 recommonmark==0.7.1 pyarrow>=1.0.0 +setuptools +wheel \ No newline at end of file From 60c7ee4df17e574af95ea3f737a48979267e3c49 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 21 Aug 2023 11:12:54 -0300 Subject: [PATCH 52/86] pre-release 1.2.2.dev0 (#342) --- CHANGELOG.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 679e9834..1324f1c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,52 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
## [Unreleased] + +## [1.2.2](https://github.com/quintoandar/butterfree/releases/tag/1.2.2) + +### Changed +* Optional row count validation ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* Bump several libs versions ([#333](https://github.com/quintoandar/butterfree/pull/333)) + +## [1.2.1](https://github.com/quintoandar/butterfree/releases/tag/1.2.1) + +### Changed +* Update README.md ([#331](https://github.com/quintoandar/butterfree/pull/331)) +* Update Github Actions Workflow runner ([#332](https://github.com/quintoandar/butterfree/pull/332)) +* Delete sphinx version. ([#334](https://github.com/quintoandar/butterfree/pull/334)) + +### Fixed +* Add the missing link for H3 geohash ([#330](https://github.com/quintoandar/butterfree/pull/330)) + +## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) + + +### Added +* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) +* [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) +* Allow slide selection ([#293](https://github.com/quintoandar/butterfree/pull/293)) +* [MLOP-637] Implement diff method ([#292](https://github.com/quintoandar/butterfree/pull/292)) +* [MLOP-640] Create CLI with migrate command ([#298](https://github.com/quintoandar/butterfree/pull/298)) +* [MLOP-645] Implement query method, cassandra ([#291](https://github.com/quintoandar/butterfree/pull/291)) +* [MLOP-671] Implement get_schema on Spark client ([#301](https://github.com/quintoandar/butterfree/pull/301)) +* [MLOP-648] Implement query method, metastore ([#294](https://github.com/quintoandar/butterfree/pull/294)) +* [MLOP-647] / [MLOP-646] Apply migrations ([#300](https://github.com/quintoandar/butterfree/pull/300)) +* [MLOP-639] Track logs in S3 ([#306](https://github.com/quintoandar/butterfree/pull/306)) +* [MLOP-702] Debug mode for Automate Migration ([#322](https://github.com/quintoandar/butterfree/pull/322)) + +### Changed +* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* Read and write consistency level options ([#309](https://github.com/quintoandar/butterfree/pull/309)) +* [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree ([#327](https://github.com/quintoandar/butterfree/pull/327)) + +### Fixed +* [BUG] Apply create_partitions to historical validate ([#303](https://github.com/quintoandar/butterfree/pull/303)) +* [BUG] Fix key path for validate read ([#304](https://github.com/quintoandar/butterfree/pull/304)) +* [FIX] Add Partition types for Metastore ([#305](https://github.com/quintoandar/butterfree/pull/305)) +* Change solution for tracking logs ([#308](https://github.com/quintoandar/butterfree/pull/308)) +* [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) +* Fix method to generate agg feature name. 
([#326](https://github.com/quintoandar/butterfree/pull/326)) + ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) diff --git a/setup.py b/setup.py index 0029a78b..b3a2297f 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev19" +__version__ = "1.2.2.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From f35d66565226f6dd75a4f38706a22a4aa496c1d7 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 21 Aug 2023 13:48:52 -0300 Subject: [PATCH 53/86] Rebase staging (#343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. * [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. 
* [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. (#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> * Release 1.2.1 (#338) * Add the missing link for H3 geohash (#330) * Add the missing link for H3 geohash * Update the H3 geohash link. * Update the same link Update the same link in in spark_function_and_window.ipynb example * Update README.md (#331) * Update Github Actions Workflow runner (#332) * Update Workflow runner version * bump flake8-bandit * chore: bypass false positive for S105 Co-authored-by: Lucas Cardozo * Delete sphinx version. (#334) * Update files to staging (#336) Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * release 1.2.1 Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. 
* [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Update files to staging (#336) * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. * [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. 
* Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. (#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. 
* Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: hmeretti Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> * Revert "Update files to staging (#336)" (#337) This reverts commit 35bcd30af981a960bc7c79c47e6a25dbed729f6c. * Less strict requirements (#333) * bump a few requirements; increase lower bound for h3 version range; adding pyarrow dev dependency * fix type repr for spark types; fix: broken tests (pyspark 3.4) --------- Co-authored-by: Ralph Rassweiler * feat: optional row count validation (#340) * fix: parameter, libs (#341) --------- Co-authored-by: hmeretti Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo Co-authored-by: Mayara Moromisato From 97e44fa896dd9fce4e46645d2437f9684c4cee75 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 21 Aug 2023 15:07:50 -0300 Subject: [PATCH 54/86] Rebase staging from master (#345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. 
* [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. 
(#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> * Release 1.2.1 (#338) * Add the missing link for H3 geohash (#330) * Add the missing link for H3 geohash * Update the H3 geohash link. * Update the same link Update the same link in in spark_function_and_window.ipynb example * Update README.md (#331) * Update Github Actions Workflow runner (#332) * Update Workflow runner version * bump flake8-bandit * chore: bypass false positive for S105 Co-authored-by: Lucas Cardozo * Delete sphinx version. (#334) * Update files to staging (#336) Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * release 1.2.1 Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> * fix: methods * fix: duplicate --------- Co-authored-by: hmeretti Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo --- CHANGELOG.md | 4 ---- requirements.dev.txt | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1324f1c1..27b680bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,6 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) - ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) @@ -50,9 +49,6 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) * Fix method to generate agg feature name. 
([#326](https://github.com/quintoandar/butterfree/pull/326)) -### Added -* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) - ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) ### Added * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273)) diff --git a/requirements.dev.txt b/requirements.dev.txt index abc64e3f..4e164c83 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -8,4 +8,4 @@ sphinx-rtd-theme==0.5.2 recommonmark==0.7.1 pyarrow>=1.0.0 setuptools -wheel \ No newline at end of file +wheel From 9bcca0e20ba63f1643775e4553a68879a021c874 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 13 Nov 2023 11:27:38 -0300 Subject: [PATCH 55/86] feat(MLOP-1985): optional params (#347) * feat: optional params --- butterfree/extract/source.py | 13 +++++++++++-- butterfree/transform/aggregated_feature_set.py | 17 ++++++++++++++--- butterfree/transform/feature_set.py | 16 ++++++++++++++-- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 6d905c6b..1209e916 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -49,13 +49,22 @@ class Source(HookableComponent): temporary views regarding each reader and, after, will run the desired query and return a dataframe. + The `eager_evaluation` param forces Spark to apply the currently + mapped changes to the DataFrame. When this parameter is set to + False, Spark follows its standard behaviour of lazy evaluation. + Lazy evaluation can improve Spark's performance as it allows + Spark to build the best version of the execution plan. + """ - def __init__(self, readers: List[Reader], query: str) -> None: + def __init__( + self, readers: List[Reader], query: str, eager_evaluation: bool = True, + ) -> None: super().__init__() self.enable_pre_hooks = False self.readers = readers self.query = query + self.eager_evaluation = eager_evaluation def construct( self, client: SparkClient, start_date: str = None, end_date: str = None @@ -87,7 +96,7 @@ def construct( dataframe = client.sql(self.query) - if not dataframe.isStreaming: + if not dataframe.isStreaming and self.eager_evaluation: dataframe.cache().count() post_hook_df = self.run_post_hooks(dataframe) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 133195d7..0bff33c6 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -197,6 +197,8 @@ def __init__( keys: List[KeyFeature], timestamp: TimestampFeature, features: List[Feature], + deduplicate_rows: bool = True, + eager_evaluation: bool = True, ): self._windows: List[Any] = [] self._pivot_column: Optional[str] = None @@ -204,7 +206,14 @@ def __init__( self._distinct_subset: List[Any] = [] self._distinct_keep: Optional[str] = None super(AggregatedFeatureSet, self).__init__( - name, entity, description, keys, timestamp, features, + name, + entity, + description, + keys, + timestamp, + features, + deduplicate_rows, + eager_evaluation, ) @property @@ -626,8 +635,10 @@ def construct( float("nan"), None ) if not output_df.isStreaming: - output_df = self._filter_duplicated_rows(output_df) - output_df.cache().count() + if self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) + if self.eager_evaluation: + output_df.cache().count() post_hook_df = self.run_post_hooks(output_df) diff --git 
a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index c2e40a49..469a353a 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -97,6 +97,12 @@ class FeatureSet(HookableComponent): values over key columns and timestamp column, we do this in order to reduce our dataframe (regarding the number of rows). A detailed explation of this method can be found at filter_duplicated_rows docstring. + + The `eager_evaluation` param forces Spark to apply the currently + mapped changes to the DataFrame. When this parameter is set to + False, Spark follows its standard behaviour of lazy evaluation. + Lazy evaluation can improve Spark's performance as it allows + Spark to build the best version of the execution plan. """ def __init__( @@ -107,6 +113,8 @@ def __init__( keys: List[KeyFeature], timestamp: TimestampFeature, features: List[Feature], + deduplicate_rows: bool = True, + eager_evaluation: bool = True, ) -> None: super().__init__() self.name = name @@ -116,6 +124,8 @@ def __init__( self.timestamp = timestamp self.features = features self.incremental_strategy = IncrementalStrategy(column=TIMESTAMP_COLUMN) + self.deduplicate_rows = deduplicate_rows + self.eager_evaluation = eager_evaluation @property def name(self) -> str: @@ -426,8 +436,10 @@ def construct( ).select(*self.columns) if not output_df.isStreaming: - output_df = self._filter_duplicated_rows(output_df) - output_df.cache().count() + if self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) + if self.eager_evaluation: + output_df.cache().count() output_df = self.incremental_strategy.filter_with_incremental_strategy( dataframe=output_df, start_date=start_date, end_date=end_date From 512a0fe9642be776dca1dd97ff683447880079e9 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 13 Nov 2023 12:02:13 -0300 Subject: [PATCH 56/86] pre-release 1.2.3 (#349) --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27b680bf..b252803c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.2.3](https://github.com/quintoandar/butterfree/releases/tag/1.2.3) +* Optional params ([#347](https://github.com/quintoandar/butterfree/pull/347)) + +### Changed +* Optional row count validation ([#340](https://github.com/quintoandar/butterfree/pull/340)) + ## [1.2.2](https://github.com/quintoandar/butterfree/releases/tag/1.2.2) ### Changed diff --git a/setup.py b/setup.py index b3a2297f..2fc26b46 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.2.dev0" +__version__ = "1.2.3.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 688a5b3c0ab514034821de2af79ffa2ccdb9dd49 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 11 Apr 2024 13:50:18 -0300 Subject: [PATCH 57/86] feat(MLOP-2145): add feature set creation script (#351) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add feature set creation script * feat(mlop-2145): updating auto fs creation (#352) * feat(updating-auto-fs-creation): adding methods to the class as private and add Table dataclass * feat(updating-auto-fs-creation): using dataclass and adding typing * feat(updating-auto-fs-creation): finish using all type hints 
and apply format * feat(updating-auto-fs-creation): add docstring and auto-infer by df * fix(updating-auto-fs-creation): remove unused format * feat(updating-auto-fs-creation): creating flake8 ignore list * feat(updating-auto-fs-creation): apply fmt * feat(updating-auto-fs-creation): init file * feat(updating-auto-fs-creation): making more readable * feat(updating-auto-fs-creation): remove wrong file * feat(updating-auto-fs-creation): apply fmt * feat(updating-auto-fs-creation): ignoring mypy * feat(updating-auto-fs-creation): add unit test * feat(updating-auto-fs-creation): using Dataframe from pyspark --------- Co-authored-by: João Albuquerque --- butterfree/automated/__init__.py | 0 butterfree/automated/feature_set_creation.py | 199 ++++++++++++++++++ setup.cfg | 2 +- tests/unit/butterfree/automated/__init__.py | 0 .../automated/test_feature_set_creation.py | 28 +++ 5 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 butterfree/automated/__init__.py create mode 100644 butterfree/automated/feature_set_creation.py create mode 100644 tests/unit/butterfree/automated/__init__.py create mode 100644 tests/unit/butterfree/automated/test_feature_set_creation.py diff --git a/butterfree/automated/__init__.py b/butterfree/automated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/butterfree/automated/feature_set_creation.py b/butterfree/automated/feature_set_creation.py new file mode 100644 index 00000000..4a078135 --- /dev/null +++ b/butterfree/automated/feature_set_creation.py @@ -0,0 +1,199 @@ +import re +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from pyspark.sql import DataFrame + +from butterfree.constants.data_type import DataType + +BUTTERFREE_DTYPES = { + "string": DataType.STRING.spark_sql, + "long": DataType.BIGINT.spark_sql, + "double": DataType.DOUBLE.spark_sql, + "boolean": DataType.BOOLEAN.spark_sql, + "integer": DataType.INTEGER.spark_sql, + "date": DataType.DATE.spark_sql, + "timestamp": DataType.TIMESTAMP.spark_sql, + "array": { + "long": DataType.ARRAY_BIGINT.spark_sql, + "float": DataType.ARRAY_FLOAT.spark_sql, + "string": DataType.ARRAY_STRING.spark_sql, + }, +} + + +@dataclass(frozen=True) +class Table: # noqa: D101 + id: str + database: str + name: str + + +class FeatureSetCreation: + """Class to auto-generate readers and features.""" + + def _get_features_with_regex(self, sql_query: str) -> List[str]: + features = [] + sql_query = " ".join(sql_query.split()) + first_pattern = re.compile("[(]?([\w.*]+)[)]?,", re.IGNORECASE) + second_pattern = re.compile("(\w+)\s(from)", re.IGNORECASE) + + for pattern in [first_pattern, second_pattern]: + matches = pattern.finditer(sql_query) + for match in matches: + feature = match.group(1) + + if "." in feature: + feature = feature.split(".")[1] + + features.append(feature) + + return features + + def _get_data_type(self, field_name: str, df: DataFrame) -> str: + for field in df.schema.jsonValue()["fields"]: + if field["name"] == field_name: + + field_type = field["type"] + + if isinstance(field_type, dict): + + field_type_keys = field_type.keys() + + if "type" in field_type_keys and "elementType" in field_type_keys: + return ( + "." + + BUTTERFREE_DTYPES[field_type["type"]][ # type: ignore + field_type["elementType"] + ] + ) + + return "." 
+ BUTTERFREE_DTYPES[field["type"]] + + return "" + + def _get_tables_with_regex(self, sql_query: str) -> Tuple[List[Table], str]: + + modified_sql_query = sql_query + tables = [] + stop_words = [ + "left", + "right", + "full outer", + "inner", + "where", + "join", + "on", + "as", + ] + keywords = ["from", "join"] + + for keyword in keywords: + pattern = re.compile( + rf"\b{keyword}\s+(\w+\.\w+|\w+)\s+(\w+)", re.IGNORECASE + ) + matches = pattern.finditer(sql_query) + + for match in matches: + full_table_name = match.group(1) + id = match.group(2).strip() + + if id in stop_words: + id = full_table_name + + if "." in full_table_name: + database, table = full_table_name.split(".") + + modified_sql_query = re.sub( + rf"\b{database}\.{table}\b", table, modified_sql_query + ) + + tables.append(Table(id=id, database=database, name=table)) + else: + modified_sql_query = re.sub( + rf"\b{full_table_name}\b", full_table_name, modified_sql_query + ) + tables.append(Table(id=id, database="TBD", name=full_table_name)) + + return tables, modified_sql_query + + def get_readers(self, sql_query: str) -> str: + """ + Extracts table readers from a SQL query and formats them as a string. + + Args: + sql_query (str): The SQL query from which to extract table readers. + + Returns: + str: A formatted string containing the table readers. + """ + tables, modified_sql_query = self._get_tables_with_regex(sql_query.lower()) + readers = [] + for table in tables: + table_reader_string = f""" + TableReader( + id="{table.id}", + database="{table.database}", + table="{table.name}" + ), + """ + readers.append(table_reader_string) + + final_string = """ + source=Source( + readers=[ + {} + ], + query=( + \"\"\" + {} + \"\"\" + ), + ), + """.format( + "".join(readers), modified_sql_query.replace("\n", "\n\t\t") + ) + + return final_string + + def get_features(self, sql_query: str, df: Optional[DataFrame] = None) -> str: + """ + Extract features from a SQL query and return them formatted as a string. + + Args: + sql_query (str): The SQL query used to extract features. + df (Optional[DataFrame], optional): Optional DataFrame used to infer data types. Defaults to None. + + Returns: + str: A formatted string containing the extracted features. + + This sould be used on Databricks. + + Especially if you want automatic type inference without passing a reference dataframe. + The utility will only work in an environment where a spark session is available in the environment + """ # noqa: E501 + + features = self._get_features_with_regex(sql_query) + features_formatted = [] + for feature in features: + description = feature.replace("__", " ").replace("_", " ").capitalize() + + data_type = "." 
+ + if df is None: + df = spark.sql(sql_query) # type: ignore # noqa: F821 + + data_type = self._get_data_type(feature, df) + + feature_string = f""" + Feature( + name="{feature}", + description="{description}", + dtype=DataType{data_type}, + ), + """ + features_formatted.append(feature_string) + + final_string = ("features=[\t{}],\n),").format("".join(features_formatted)) + + return final_string diff --git a/setup.cfg b/setup.cfg index cff00122..c58c2df3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ docstring-convention = google max-line-length = 88 max-complexity = 12 -ignore = W503, E203, D203, D401, D107, S101, D105 +ignore = W503, E203, D203, D401, D107, S101, D105, D100, W605, D202, D212, D104, E261 exclude = dist/*,build/*,.pytest_cache/*,.git/*,pip/* per-file-ignores = # We will not check for docstrings or the use of asserts in tests diff --git a/tests/unit/butterfree/automated/__init__.py b/tests/unit/butterfree/automated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/automated/test_feature_set_creation.py b/tests/unit/butterfree/automated/test_feature_set_creation.py new file mode 100644 index 00000000..cfb5101e --- /dev/null +++ b/tests/unit/butterfree/automated/test_feature_set_creation.py @@ -0,0 +1,28 @@ +import unittest +from unittest.mock import MagicMock + +from butterfree.automated.feature_set_creation import FeatureSetCreation + + +class TestFeatureSetCreation(unittest.TestCase): + def setUp(self): + self.feature_set_creation = FeatureSetCreation() + + def test_get_features_with_regex(self): + sql_query = "SELECT column1, column2 FROM table1" + expected_features = ["column1", "column2"] + + features = self.feature_set_creation._get_features_with_regex(sql_query) + + self.assertEqual(features, expected_features) + + def test_get_data_type(self): + field_name = "column1" + df_mock = MagicMock() + df_mock.schema.jsonValue.return_value = { + "fields": [{"name": "column1", "type": "string"}] + } + + data_type = self.feature_set_creation._get_data_type(field_name, df_mock) + + self.assertEqual(data_type, ".STRING") From da91b49cbd69b6fa0881a4327df86ac004f22c05 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 25 Apr 2024 10:18:26 -0300 Subject: [PATCH 58/86] Rebase staging from master (#354) * Rebasing and add skip lint on github actions --- .github/workflows/skip_lint.yml | 17 ++++++++ .github/workflows/staging.yml | 63 ++++++++++++++-------------- CHANGELOG.md | 15 ++++--- docs/source/butterfree.automated.rst | 19 +++++++++ docs/source/butterfree.rst | 1 + setup.py | 2 +- 6 files changed, 78 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/skip_lint.yml create mode 100644 docs/source/butterfree.automated.rst diff --git a/.github/workflows/skip_lint.yml b/.github/workflows/skip_lint.yml new file mode 100644 index 00000000..1c768a23 --- /dev/null +++ b/.github/workflows/skip_lint.yml @@ -0,0 +1,17 @@ +# This step is used only because we want to mark the runner-linter check as required +# for PRs to develop, but not for the merge queue to merge into develop, +# github does not have this functionality yet + +name: 'Skip github-actions/runner-linter check at merge queue' + +on: + merge_group: + +jobs: + empty_job: + name: 'github-actions/runner-linter' + runs-on: github-actions-developers-runner + steps: + - name: Skip github-actions/runner-linter check at merge queue + run: | + echo "Done" diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 1f94fc5d..77127820 100644 
--- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -2,8 +2,7 @@ name: "Publish Dev Package" on: push: paths: - - 'setup.py' - + - "setup.py" jobs: Pipeline: @@ -13,33 +12,33 @@ jobs: container: quintoandar/python-3-7-java steps: - - uses: actions/checkout@v2 - - - name: Install dependencies - run: make ci-install - - - name: Get version - run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 )" >> $GITHUB_ENV - - - name: Build package - run: make package - - - name: Create release - uses: actions/create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ env.version }} - release_name: Release ${{ env.version }} - prerelease: true - - - name: Release already exist - if: ${{ failure() }} - run: echo Release already exist - - - name: Publish release to pypi.org - if: ${{ success() }} - env: - PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} - PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* + - uses: actions/checkout@v2 + + - name: Install dependencies + run: make ci-install + + - name: Get version + run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 )" >> $GITHUB_ENV + + - name: Build package + run: make package + + - name: Create release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.version }} + release_name: Release ${{ env.version }} + prerelease: true + + - name: Release already exist + if: ${{ failure() }} + run: echo Release already exist + + - name: Publish release to pypi.org + if: ${{ success() }} + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* diff --git a/CHANGELOG.md b/CHANGELOG.md index b252803c..ad9f4863 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.2.4](https://github.com/quintoandar/butterfree/releases/tag/1.2.4) +* Auto create feature sets ([#351](https://github.com/quintoandar/butterfree/pull/351)) + ## [1.2.3](https://github.com/quintoandar/butterfree/releases/tag/1.2.3) * Optional params ([#347](https://github.com/quintoandar/butterfree/pull/347)) @@ -66,7 +69,7 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [MLOP-632] Butterfree dev workflow, automate release description ([#279](https://github.com/quintoandar/butterfree/commit/245eaa594846166972241b03fddc61ee5117b1f7)) ### Fixed -* Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) +* Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) ## [1.1.2](https://github.com/quintoandar/butterfree/releases/tag/1.1.2) ### Fixed @@ -89,11 +92,11 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * Update README ([#257](https://github.com/quintoandar/butterfree/pull/257)) ### Fixed -* Fix Butterfree's workflow ([#262](https://github.com/quintoandar/butterfree/pull/262)) +* Fix Butterfree's workflow ([#262](https://github.com/quintoandar/butterfree/pull/262)) * [FIX] Downgrade Python Version in Pyenv ([#227](https://github.com/quintoandar/butterfree/pull/227)) -* [FIX] Fix docs 
([#229](https://github.com/quintoandar/butterfree/pull/229)) +* [FIX] Fix docs ([#229](https://github.com/quintoandar/butterfree/pull/229)) * [FIX] Fix Docs - Add more dependencies ([#230](https://github.com/quintoandar/butterfree/pull/230)) -* Fix broken notebook URL ([#236](https://github.com/quintoandar/butterfree/pull/236)) +* Fix broken notebook URL ([#236](https://github.com/quintoandar/butterfree/pull/236)) * Issue #77 Fix ([#245](https://github.com/quintoandar/butterfree/pull/245)) ## [1.0.2](https://github.com/quintoandar/butterfree/releases/tag/1.0.2) @@ -104,7 +107,7 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [MLOP-426] Change branching strategy on butterfree to use only master branch ([#216](https://github.com/quintoandar/butterfree/pull/216)) ### Fixed -* [MLOP-440] Python 3.7 bump and Fixing Dependencies ([#220](https://github.com/quintoandar/butterfree/pull/220)) +* [MLOP-440] Python 3.7 bump and Fixing Dependencies ([#220](https://github.com/quintoandar/butterfree/pull/220)) ## [1.0.1](https://github.com/quintoandar/butterfree/releases/tag/1.0.1) ### Added @@ -303,4 +306,4 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [MLOP-143] Fix Bugs for HouseMain FeatureSet ([#62](https://github.com/quintoandar/butterfree/pull/62)) ## [0.1.0](https://github.com/quintoandar/butterfree/releases/tag/0.1.0) -* First modules and entities of butterfree package. \ No newline at end of file +* First modules and entities of butterfree package. diff --git a/docs/source/butterfree.automated.rst b/docs/source/butterfree.automated.rst new file mode 100644 index 00000000..de290d9c --- /dev/null +++ b/docs/source/butterfree.automated.rst @@ -0,0 +1,19 @@ +butterfree.automated package +============================ + +Submodules +---------- + + +.. automodule:: butterfree.automated.feature_set_creation + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.automated + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.rst b/docs/source/butterfree.rst index 0828f921..e108be6e 100644 --- a/docs/source/butterfree.rst +++ b/docs/source/butterfree.rst @@ -7,6 +7,7 @@ Subpackages .. 
toctree:: :maxdepth: 4 + butterfree.automated butterfree.clients butterfree.configs butterfree.constants diff --git a/setup.py b/setup.py index 2fc26b46..6fa35751 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.3.dev0" +__version__ = "1.2.4" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 887fbb2f262951f46d62a632755d3b71be8ee3de Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Wed, 29 May 2024 16:32:14 -0300 Subject: [PATCH 59/86] feat(mlop-2269): bump versions (#355) * fix: bump versions adjust tests * add checklist * chore: bump python * bump pyspark * chore: java version all steps modified --- .checklist.yaml | 30 ++++ .github/workflows/publish.yml | 13 ++ .github/workflows/staging.yml | 16 +- .github/workflows/test.yml | 16 +- .gitignore | 1 + Makefile | 6 +- butterfree/_cli/migrate.py | 12 +- butterfree/clients/cassandra_client.py | 4 +- butterfree/clients/spark_client.py | 6 +- butterfree/extract/source.py | 5 +- .../historical_feature_store_writer.py | 5 +- .../writers/online_feature_store_writer.py | 10 +- butterfree/load/writers/writer.py | 5 +- .../database_migration/database_migration.py | 5 +- .../database_migration/metastore_migration.py | 5 +- .../transform/aggregated_feature_set.py | 4 +- .../transformations/aggregated_transform.py | 6 +- .../transformations/custom_transform.py | 4 +- .../transform/transformations/h3_transform.py | 5 +- .../sql_expression_transform.py | 3 +- docs/requirements.txt | 3 +- examples/test_examples.py | 4 +- mypy.ini | 41 +++++- requirements.dev.txt | 10 +- requirements.lint.txt | 11 +- requirements.test.txt | 2 +- requirements.txt | 6 +- setup.cfg | 2 +- setup.py | 2 +- .../butterfree/extract/test_source.py | 13 +- tests/integration/butterfree/load/conftest.py | 2 +- .../integration/butterfree/load/test_sink.py | 7 +- .../butterfree/pipelines/conftest.py | 3 +- .../pipelines/test_feature_set_pipeline.py | 72 ++++++--- .../transform/test_aggregated_feature_set.py | 16 +- .../butterfree/transform/test_feature_set.py | 10 +- tests/mocks/entities/first/first_pipeline.py | 18 ++- .../entities/second/deeper/second_pipeline.py | 16 +- .../butterfree/clients/test_spark_client.py | 14 +- .../pre_processing/test_filter_transform.py | 3 +- .../pre_processing/test_pivot_transform.py | 36 ++++- .../extract/readers/test_file_reader.py | 10 +- .../butterfree/extract/readers/test_reader.py | 3 +- .../extract/readers/test_table_reader.py | 9 +- tests/unit/butterfree/extract/test_source.py | 6 +- tests/unit/butterfree/load/conftest.py | 6 +- .../load/processing/test_json_transform.py | 4 +- .../migrations/database_migration/conftest.py | 12 +- tests/unit/butterfree/pipelines/conftest.py | 13 +- .../pipelines/test_feature_set_pipeline.py | 38 ++++- .../unit/butterfree/reports/test_metadata.py | 139 +++++++----------- tests/unit/butterfree/transform/conftest.py | 8 +- .../transform/features/test_feature.py | 4 +- .../transform/test_aggregated_feature_set.py | 14 +- .../butterfree/transform/test_feature_set.py | 21 ++- .../transform/transformations/conftest.py | 2 +- .../test_aggregated_transform.py | 5 +- .../transformations/test_custom_transform.py | 12 +- .../transformations/test_h3_transform.py | 6 +- .../test_spark_function_transform.py | 4 +- .../test_sql_expression_transform.py | 10 +- 61 files changed, 547 insertions(+), 231 deletions(-) create mode 100644 .checklist.yaml diff --git 
a/.checklist.yaml b/.checklist.yaml new file mode 100644 index 00000000..f0c21171 --- /dev/null +++ b/.checklist.yaml @@ -0,0 +1,30 @@ +apiVersion: quintoandar.com.br/checklist/v2 +kind: ServiceChecklist +metadata: + name: butterfree +spec: + description: >- + A solution for Feature Stores. + + costCenter: C055 + department: engineering + lifecycle: production + docs: true + + ownership: + team: data_products_mlops + line: tech_platform + owner: otavio.cals@quintoandar.com.br + + libraries: + - name: butterfree + type: common-usage + path: https://quintoandar.github.io/python-package-server/ + description: A lib to build Feature Stores. + registries: + - github-packages + tier: T0 + + channels: + squad: 'mlops' + alerts: 'data-products-reports' diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f981921e..0957a958 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,6 +14,19 @@ jobs: steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: microsoft + + - uses: vemonet/setup-spark@v1 + with: + spark-version: '3.5.1' + hadoop-version: '3' - name: Install dependencies run: make ci-install diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 77127820..573049ca 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -8,11 +8,23 @@ jobs: Pipeline: if: github.ref == 'refs/heads/staging' - runs-on: ubuntu-22.04 - container: quintoandar/python-3-7-java + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: microsoft + + - uses: vemonet/setup-spark@v1 + with: + spark-version: '3.5.1' + hadoop-version: '3' - name: Install dependencies run: make ci-install diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d7c1c3ac..d588c853 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,11 +9,23 @@ on: jobs: Pipeline: - runs-on: ubuntu-22.04 - container: quintoandar/python-3-7-java + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: microsoft + + - uses: vemonet/setup-spark@v1 + with: + spark-version: '3.5.1' + hadoop-version: '3' - name: Install dependencies run: make ci-install diff --git a/.gitignore b/.gitignore index 62434612..0c59b49a 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ instance/ # PyBuilder target/ +pip/ # Jupyter Notebook .ipynb_checkpoints diff --git a/Makefile b/Makefile index 4109504f..ba0d0ead 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ style-check: @echo "Code Style" @echo "==========" @echo "" - @python -m black --check -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . && echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1) + @python -m black --check -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . 
&& echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1) .PHONY: quality-check ## run code quality checks with flake8 @@ -104,8 +104,8 @@ checks: style-check quality-check type-check .PHONY: apply-style ## fix stylistic errors with black apply-style: - @python -m black -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . - @python -m isort -rc --atomic butterfree/ tests/ + @python -m black -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . + @python -m isort --atomic butterfree/ tests/ .PHONY: clean ## clean unused artifacts diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 277ecf3c..ed62f1a2 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -46,13 +46,13 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: logger.error(f"Path: {path} not found!") return set() - logger.info(f"Importing modules...") + logger.info("Importing modules...") package = ".".join(path.strip("/").split("/")) imported = set( importlib.import_module(f".{name}", package=package) for name in modules ) - logger.info(f"Scanning modules...") + logger.info("Scanning modules...") content = { module: set( filter( @@ -93,7 +93,8 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: PATH = typer.Argument( - ..., help="Full or relative path to where feature set pipelines are being defined.", + ..., + help="Full or relative path to where feature set pipelines are being defined.", ) GENERATE_LOGS = typer.Option( @@ -113,7 +114,10 @@ class Migrate: pipelines: list of Feature Set Pipelines to use to migration. """ - def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: + def __init__( + self, + pipelines: Set[FeatureSetPipeline], + ) -> None: self.pipelines = pipelines def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None: diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 4c6f96fe..5a723155 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -129,7 +129,9 @@ def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: return response def _get_create_table_query( - self, columns: List[CassandraColumn], table: str, + self, + columns: List[CassandraColumn], + table: str, ) -> str: """Creates CQL statement to create a table.""" parsed_columns = [] diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index bfa31d2a..e2b868ca 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -61,9 +61,9 @@ def read( if path and not isinstance(path, (str, list)): raise ValueError("path needs to be a string or a list of string") - df_reader: Union[ - DataStreamReader, DataFrameReader - ] = self.conn.readStream if stream else self.conn.read + df_reader: Union[DataStreamReader, DataFrameReader] = ( + self.conn.readStream if stream else self.conn.read + ) df_reader = df_reader.schema(schema) if schema else df_reader diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 1209e916..281ed15a 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -58,7 +58,10 @@ class Source(HookableComponent): """ def __init__( - self, readers: List[Reader], query: str, eager_evaluation: bool = True, + self, + readers: List[Reader], + query: str, + eager_evaluation: bool = 
True, ) -> None: super().__init__() self.enable_pre_hooks = False diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 0ea9b50c..1a64afdf 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -130,7 +130,10 @@ def __init__( self.check_schema_hook = check_schema_hook def write( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, + self, + feature_set: FeatureSet, + dataframe: DataFrame, + spark_client: SparkClient, ) -> None: """Loads the data from a feature set into the Historical Feature Store. diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index 17dc8af4..d0bcde94 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -116,7 +116,10 @@ def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame: window = Window.partitionBy(*id_columns).orderBy(col(TIMESTAMP_COLUMN).desc()) return ( - dataframe.select(col("*"), row_number().over(window).alias("rn"),) + dataframe.select( + col("*"), + row_number().over(window).alias("rn"), + ) .filter(col("rn") == 1) .drop("rn") ) @@ -162,7 +165,10 @@ def _write_in_debug_mode( ) def write( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, + self, + feature_set: FeatureSet, + dataframe: DataFrame, + spark_client: SparkClient, ) -> Union[StreamingQuery, None]: """Loads the latest data from a feature set into the Feature Store. diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 5073f472..1dae795c 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -72,7 +72,10 @@ def _apply_transformations(self, df: DataFrame) -> DataFrame: @abstractmethod def write( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, + self, + feature_set: FeatureSet, + dataframe: DataFrame, + spark_client: SparkClient, ) -> Any: """Loads the data from a feature set into the Feature Store. diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index aeec4a6e..468c028e 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -180,7 +180,8 @@ def create_query( @staticmethod def _get_diff( - fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]], + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], ) -> Set[Diff]: """Gets schema difference between feature set and the table of a given db. 
@@ -296,7 +297,7 @@ def apply_migration( logger.info(f"Applying this query: {q} ...") self._client.sql(q) - logger.info(f"Feature Set migration finished successfully.") + logger.info("Feature Set migration finished successfully.") # inform in drone console which feature set was migrated print(f"The {feature_set.name} feature set was migrated.") diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index daa0afd3..8c6c211a 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -30,7 +30,10 @@ class MetastoreMigration(DatabaseMigration): data is being loaded into an entity table, then users can drop columns manually. """ - def __init__(self, database: str = None,) -> None: + def __init__( + self, + database: str = None, + ) -> None: self._db_config = MetastoreConfig() self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 0bff33c6..c86a95c3 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -412,7 +412,9 @@ def _aggregate( # repartition to have all rows for each group at the same partition # by doing that, we won't have to shuffle data on grouping by id dataframe = repartition_df( - dataframe, partition_by=groupby, num_processors=num_processors, + dataframe, + partition_by=groupby, + num_processors=num_processors, ) grouped_data = dataframe.groupby(*groupby) diff --git a/butterfree/transform/transformations/aggregated_transform.py b/butterfree/transform/transformations/aggregated_transform.py index 7304f34b..a9581ef0 100644 --- a/butterfree/transform/transformations/aggregated_transform.py +++ b/butterfree/transform/transformations/aggregated_transform.py @@ -76,7 +76,11 @@ def aggregations(self) -> List[Tuple]: Function = namedtuple("Function", ["function", "data_type"]) return [ - Function(f.func(expression), f.data_type.spark,) for f in self.functions + Function( + f.func(expression), + f.data_type.spark, + ) + for f in self.functions ] def _get_output_name(self, function: object) -> str: diff --git a/butterfree/transform/transformations/custom_transform.py b/butterfree/transform/transformations/custom_transform.py index 9b5ae23b..7860fdc2 100644 --- a/butterfree/transform/transformations/custom_transform.py +++ b/butterfree/transform/transformations/custom_transform.py @@ -89,6 +89,8 @@ def transform(self, dataframe: DataFrame) -> DataFrame: """ dataframe = self.transformer( - dataframe, self.parent, **self.transformer__kwargs, + dataframe, + self.parent, + **self.transformer__kwargs, ) return dataframe diff --git a/butterfree/transform/transformations/h3_transform.py b/butterfree/transform/transformations/h3_transform.py index 8ccd3bb3..7a98294e 100644 --- a/butterfree/transform/transformations/h3_transform.py +++ b/butterfree/transform/transformations/h3_transform.py @@ -84,7 +84,10 @@ class H3HashTransform(TransformComponent): """ def __init__( - self, h3_resolutions: List[int], lat_column: str, lng_column: str, + self, + h3_resolutions: List[int], + lat_column: str, + lng_column: str, ): super().__init__() self.h3_resolutions = h3_resolutions diff --git a/butterfree/transform/transformations/sql_expression_transform.py b/butterfree/transform/transformations/sql_expression_transform.py index 
0199c23a..80cd41ea 100644 --- a/butterfree/transform/transformations/sql_expression_transform.py +++ b/butterfree/transform/transformations/sql_expression_transform.py @@ -54,7 +54,8 @@ class SQLExpressionTransform(TransformComponent): """ def __init__( - self, expression: str, + self, + expression: str, ): super().__init__() self.expression = expression diff --git a/docs/requirements.txt b/docs/requirements.txt index a20ab18f..7eaabf11 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,5 +4,4 @@ sphinxemoji==0.1.6 typing-extensions==3.7.4.2 cmake==3.18.4 h3==3.7.0 -pyarrow==0.15.1 - +pyarrow==16.1.0 diff --git a/examples/test_examples.py b/examples/test_examples.py index b40b6e1a..7180e080 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -36,9 +36,9 @@ _, error = p.communicate() if p.returncode != 0: errors.append({"notebook": path, "error": error}) - print(f" >>> Error in execution!\n") + print(" >>> Error in execution!\n") else: - print(f" >>> Successful execution\n") + print(" >>> Successful execution\n") if errors: print(">>> Errors in the following notebooks:") diff --git a/mypy.ini b/mypy.ini index c67bd3a8..fc293149 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,5 @@ [mypy] -python_version = 3.7 +python_version = 3.9 ignore_missing_imports = True disallow_untyped_calls = False disallow_untyped_defs = True @@ -9,3 +9,42 @@ show_error_codes = True show_error_context = True disable_error_code = attr-defined, list-item, operator pretty = True + +[mypy-butterfree.pipelines.*] +ignore_errors = True + +[mypy-butterfree.load.*] +ignore_errors = True + +[mypy-butterfree.transform.*] +ignore_errors = True + +[mypy-butterfree.extract.*] +ignore_errors = True + +[mypy-butterfree.config.*] +ignore_errors = True + +[mypy-butterfree.clients.*] +ignore_errors = True + +[mypy-butterfree.configs.*] +ignore_errors = True + +[mypy-butterfree.dataframe_service.*] +ignore_errors = True + +[mypy-butterfree.validations.*] +ignore_errors = True + +[mypy-butterfree.migrations.*] +ignore_errors = True + +[mypy-butterfree.testing.*] +ignore_errors = True + +[mypy-butterfree.hooks.*] +ignore_errors = True + +[mypy-butterfree._cli.*] +ignore_errors = True diff --git a/requirements.dev.txt b/requirements.dev.txt index 4e164c83..89025669 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,11 +1,11 @@ -h3==3.7.4 +h3==3.7.7 jupyter==1.0.0 twine==3.1.1 -mypy==0.790 +mypy==1.10.0 sphinx==3.5.4 sphinxemoji==0.1.8 sphinx-rtd-theme==0.5.2 recommonmark==0.7.1 -pyarrow>=1.0.0 -setuptools -wheel +pyarrow==16.1.0 +setuptools==70.0.0 +wheel==0.43.0 diff --git a/requirements.lint.txt b/requirements.lint.txt index 7c51f4b3..66641a95 100644 --- a/requirements.lint.txt +++ b/requirements.lint.txt @@ -1,8 +1,7 @@ -black==19.10b0 -flake8==3.7.9 -flake8-isort==2.8.0 -isort<5 # temporary fix +black==21.12b0 +flake8==4.0.1 +flake8-isort==4.1.1 flake8-docstrings==1.5.0 flake8-bugbear==20.1.0 -flake8-bandit==3.0.0 - +flake8-bandit==2.1.2 +bandit==1.7.2 diff --git a/requirements.test.txt b/requirements.test.txt index b0c4032a..651700b8 100644 --- a/requirements.test.txt +++ b/requirements.test.txt @@ -2,4 +2,4 @@ pytest==5.3.2 pytest-cov==2.8.1 pytest-xdist==1.31.0 pytest-mock==2.0.0 -pytest-spark==0.5.2 +pytest-spark==0.6.0 diff --git a/requirements.txt b/requirements.txt index d61d125b..f3af4254 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -cassandra-driver>=3.22.0,<4.0 +cassandra-driver==3.24.0 mdutils>=1.2.2,<2.0 pandas>=0.24,<2.0 
parameters-validation>=1.1.5,<2.0 -pyspark==3.* -typer>=0.3,<0.4 +pyspark==3.5.1 +typer==0.3.2 typing-extensions>3.7.4,<5 boto3==1.17.* diff --git a/setup.cfg b/setup.cfg index c58c2df3..849d35cf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,13 +10,13 @@ per-file-ignores = setup.py:D,S101 [isort] +profile = black line_length = 88 known_first_party = butterfree default_section = THIRDPARTY multi_line_output = 3 indent = ' ' skip_glob = pip -use_parantheses = True include_trailing_comma = True [tool:pytest] diff --git a/setup.py b/setup.py index 6fa35751..42ef57c8 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ author="QuintoAndar", install_requires=requirements, extras_require={"h3": ["h3>=3.7.4,<4"]}, - python_requires=">=3.7, <4", + python_requires=">=3.9, <4", entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, include_package_data=True, ) diff --git a/tests/integration/butterfree/extract/test_source.py b/tests/integration/butterfree/extract/test_source.py index c465ebd0..3ab991ab 100644 --- a/tests/integration/butterfree/extract/test_source.py +++ b/tests/integration/butterfree/extract/test_source.py @@ -1,11 +1,11 @@ from typing import List from pyspark.sql import DataFrame -from tests.integration import INPUT_PATH from butterfree.clients import SparkClient from butterfree.extract import Source from butterfree.extract.readers import FileReader, TableReader +from tests.integration import INPUT_PATH def create_temp_view(dataframe: DataFrame, name): @@ -13,10 +13,11 @@ def create_temp_view(dataframe: DataFrame, name): def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): - spark.sql(f"create database if not exists {table_reader_db}") + spark.sql(f"drop schema if exists {table_reader_db} cascade") + spark.sql(f"create database {table_reader_db}") spark.sql(f"use {table_reader_db}") spark.sql( - f"create table if not exists {table_reader_db}.{table_reader_table} " # noqa + f"create table {table_reader_db}.{table_reader_table} " # noqa f"as select * from {table_reader_id}" # noqa ) @@ -33,7 +34,10 @@ def compare_dataframes( class TestSource: def test_source( - self, target_df_source, target_df_table_reader, spark_session, + self, + target_df_source, + target_df_table_reader, + spark_session, ): # given spark_client = SparkClient() @@ -66,6 +70,7 @@ def test_source( query=f"select a.*, b.feature2 " # noqa f"from {table_reader_id} a " # noqa f"inner join {file_reader_id} b on a.id = b.id ", # noqa + eager_evaluation=False, ) result_df = source.construct(client=spark_client) diff --git a/tests/integration/butterfree/load/conftest.py b/tests/integration/butterfree/load/conftest.py index 418b6d2a..60101f1a 100644 --- a/tests/integration/butterfree/load/conftest.py +++ b/tests/integration/butterfree/load/conftest.py @@ -51,7 +51,7 @@ def feature_set(): ] ts_feature = TimestampFeature(from_column="timestamp") features = [ - Feature(name="feature", description="Description", dtype=DataType.FLOAT), + Feature(name="feature", description="Description", dtype=DataType.INTEGER), ] return FeatureSet( "test_sink_feature_set", diff --git a/tests/integration/butterfree/load/test_sink.py b/tests/integration/butterfree/load/test_sink.py index b5f97879..f73f5f7c 100644 --- a/tests/integration/butterfree/load/test_sink.py +++ b/tests/integration/butterfree/load/test_sink.py @@ -24,10 +24,13 @@ def test_sink(input_dataframe, feature_set): s3config.mode = "overwrite" s3config.format_ = "parquet" s3config.get_options = Mock( - return_value={"path": 
"test_folder/historical/entity/feature_set"} + return_value={ + "path": "test_folder/historical/entity/feature_set", + "mode": "overwrite", + } ) s3config.get_path_with_partitions = Mock( - return_value="test_folder/historical/entity/feature_set" + return_value="spark-warehouse/test.db/test_folder/historical/entity/feature_set" ) historical_writer = HistoricalFeatureStoreWriter( diff --git a/tests/integration/butterfree/pipelines/conftest.py b/tests/integration/butterfree/pipelines/conftest.py index 73da163e..5f304972 100644 --- a/tests/integration/butterfree/pipelines/conftest.py +++ b/tests/integration/butterfree/pipelines/conftest.py @@ -132,7 +132,8 @@ def fixed_windows_output_feature_set_date_dataframe(spark_context, spark_session @pytest.fixture() def feature_set_pipeline( - spark_context, spark_session, + spark_context, + spark_session, ): feature_set_pipeline = FeatureSetPipeline( diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index d67e0a38..79125339 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -50,10 +50,11 @@ def create_temp_view(dataframe: DataFrame, name): def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): - spark.sql(f"create database if not exists {table_reader_db}") + spark.sql(f"drop schema {table_reader_db} cascade") + spark.sql(f"create database {table_reader_db}") spark.sql(f"use {table_reader_db}") spark.sql( - f"create table if not exists {table_reader_db}.{table_reader_table} " # noqa + f"create table {table_reader_db}.{table_reader_table} " # noqa f"as select * from {table_reader_id}" # noqa ) @@ -74,7 +75,10 @@ def create_ymd(dataframe): class TestFeatureSetPipeline: def test_feature_set_pipeline( - self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe, + self, + mocked_df, + spark_session, + fixed_windows_output_feature_set_dataframe, ): # arrange @@ -90,7 +94,7 @@ def test_feature_set_pipeline( table_reader_table=table_reader_table, ) - path = "test_folder/historical/entity/feature_set" + path = "spark-warehouse/test.db/test_folder/historical/entity/feature_set" dbconfig = MetastoreConfig() dbconfig.get_options = Mock( @@ -138,7 +142,9 @@ def test_feature_set_pipeline( description="unit test", dtype=DataType.FLOAT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ), ], @@ -237,7 +243,12 @@ def test_pipeline_with_hooks(self, spark_session): test_pipeline = FeatureSetPipeline( source=Source( - readers=[TableReader(id="reader", table="test",).add_post_hook(hook1)], + readers=[ + TableReader( + id="reader", + table="test", + ).add_post_hook(hook1) + ], query="select * from reader", ).add_post_hook(hook1), feature_set=FeatureSet( @@ -263,7 +274,9 @@ def test_pipeline_with_hooks(self, spark_session): ) .add_pre_hook(hook1) .add_post_hook(hook1), - sink=Sink(writers=[historical_writer],).add_pre_hook(hook1), + sink=Sink( + writers=[historical_writer], + ).add_pre_hook(hook1), ) # act @@ -325,11 +338,13 @@ def test_pipeline_interval_run( db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE") path = "test_folder/historical/entity/feature_set" + read_path = "spark-warehouse/test.db/" + path spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") - 
spark_session.sql(f"create database if not exists {db}") + spark_session.sql(f"drop schema {db} cascade") + spark_session.sql(f"create database {db}") spark_session.sql( - f"create table if not exists {db}.feature_set_interval " + f"create table {db}.feature_set_interval " f"(id int, timestamp timestamp, feature int, " f"run_id int, year int, month int, day int);" ) @@ -340,7 +355,7 @@ def test_pipeline_interval_run( ) historical_writer = HistoricalFeatureStoreWriter( - db_config=dbconfig, interval_mode=True + db_config=dbconfig, interval_mode=True, row_count_validation=False ) first_run_hook = RunHook(id=1) @@ -356,9 +371,10 @@ def test_pipeline_interval_run( test_pipeline = FeatureSetPipeline( source=Source( readers=[ - TableReader(id="id", table="input_data",).with_incremental_strategy( - IncrementalStrategy("ts") - ), + TableReader( + id="id", + table="input_data", + ).with_incremental_strategy(IncrementalStrategy("ts")), ], query="select * from id ", ), @@ -366,48 +382,56 @@ def test_pipeline_interval_run( name="feature_set_interval", entity="entity", description="", - keys=[KeyFeature(name="id", description="", dtype=DataType.INTEGER,)], + keys=[ + KeyFeature( + name="id", + description="", + dtype=DataType.INTEGER, + ) + ], timestamp=TimestampFeature(from_column="ts"), features=[ Feature(name="feature", description="", dtype=DataType.INTEGER), Feature(name="run_id", description="", dtype=DataType.INTEGER), ], ), - sink=Sink([historical_writer],), + sink=Sink( + [historical_writer], + ), ) # act and assert dbconfig.get_path_with_partitions = Mock( return_value=[ - "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", - "test_folder/historical/entity/feature_set/year=2016/month=4/day=12", - "test_folder/historical/entity/feature_set/year=2016/month=4/day=13", + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=11", # noqa + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=12", # noqa + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=13", # noqa ] ) test_pipeline.feature_set.add_pre_hook(first_run_hook) test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11") - first_run_output_df = spark_session.read.parquet(path) + first_run_output_df = spark_session.read.parquet(read_path) assert_dataframe_equality(first_run_output_df, first_run_target_df) dbconfig.get_path_with_partitions = Mock( return_value=[ - "test_folder/historical/entity/feature_set/year=2016/month=4/day=14", + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=14", # noqa ] ) test_pipeline.feature_set.add_pre_hook(second_run_hook) test_pipeline.run_for_date("2016-04-14") - second_run_output_df = spark_session.read.parquet(path) + second_run_output_df = spark_session.read.parquet(read_path) assert_dataframe_equality(second_run_output_df, second_run_target_df) dbconfig.get_path_with_partitions = Mock( return_value=[ - "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=11", # noqa ] ) test_pipeline.feature_set.add_pre_hook(third_run_hook) test_pipeline.run_for_date("2016-04-11") - third_run_output_df = spark_session.read.parquet(path) + third_run_output_df = spark_session.read.parquet(read_path) assert_dataframe_equality(third_run_output_df, third_run_target_df) # tear down - shutil.rmtree("test_folder") + 
shutil.rmtree("spark-warehouse/test.db/test_folder") diff --git a/tests/integration/butterfree/transform/test_aggregated_feature_set.py b/tests/integration/butterfree/transform/test_aggregated_feature_set.py index bc3ebb6c..41307761 100644 --- a/tests/integration/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/integration/butterfree/transform/test_aggregated_feature_set.py @@ -19,7 +19,9 @@ def divide(df, fs, column1, column2): class TestAggregatedFeatureSet: def test_construct_without_window( - self, feature_set_dataframe, target_df_without_window, + self, + feature_set_dataframe, + target_df_without_window, ): # given @@ -157,7 +159,9 @@ def test_construct_rolling_windows_without_end_date( ) ], timestamp=TimestampFeature(), - ).with_windows(definitions=["1 day", "1 week"],) + ).with_windows( + definitions=["1 day", "1 week"], + ) # act & assert with pytest.raises(ValueError): @@ -201,7 +205,9 @@ def test_h3_feature_set(self, h3_input_df, h3_target_df): assert_dataframe_equality(output_df, h3_target_df) def test_construct_with_pivot( - self, feature_set_df_pivot, target_df_pivot_agg, + self, + feature_set_df_pivot, + target_df_pivot_agg, ): # given @@ -243,7 +249,9 @@ def test_construct_with_pivot( assert_dataframe_equality(output_df, target_df_pivot_agg) def test_construct_rolling_windows_with_date_boundaries( - self, feature_set_dates_dataframe, rolling_windows_output_date_boundaries, + self, + feature_set_dates_dataframe, + rolling_windows_output_date_boundaries, ): # given diff --git a/tests/integration/butterfree/transform/test_feature_set.py b/tests/integration/butterfree/transform/test_feature_set.py index 25f70b6e..6c5f7f1d 100644 --- a/tests/integration/butterfree/transform/test_feature_set.py +++ b/tests/integration/butterfree/transform/test_feature_set.py @@ -51,7 +51,9 @@ def test_construct( description="unit test", dtype=DataType.FLOAT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ), ], @@ -92,7 +94,11 @@ def test_construct_with_date_boundaries( entity="entity", description="description", features=[ - Feature(name="feature", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature", + description="test", + dtype=DataType.FLOAT, + ), ], keys=[ KeyFeature( diff --git a/tests/mocks/entities/first/first_pipeline.py b/tests/mocks/entities/first/first_pipeline.py index 90cfba96..938c880c 100644 --- a/tests/mocks/entities/first/first_pipeline.py +++ b/tests/mocks/entities/first/first_pipeline.py @@ -15,7 +15,13 @@ class FirstPipeline(FeatureSetPipeline): def __init__(self): super(FirstPipeline, self).__init__( source=Source( - readers=[TableReader(id="t", database="db", table="table",)], + readers=[ + TableReader( + id="t", + database="db", + table="table", + ) + ], query=f"select * from t", # noqa ), feature_set=FeatureSet( @@ -23,7 +29,11 @@ def __init__(self): entity="entity", description="description", features=[ - Feature(name="feature1", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature1", + description="test", + dtype=DataType.FLOAT, + ), Feature( name="feature2", description="another test", @@ -32,7 +42,9 @@ def __init__(self): ], keys=[ KeyFeature( - name="id", description="identifier", dtype=DataType.BIGINT, + name="id", + description="identifier", + dtype=DataType.BIGINT, ) ], timestamp=TimestampFeature(), diff --git a/tests/mocks/entities/second/deeper/second_pipeline.py 
b/tests/mocks/entities/second/deeper/second_pipeline.py index 12c53cf3..a59ba2e5 100644 --- a/tests/mocks/entities/second/deeper/second_pipeline.py +++ b/tests/mocks/entities/second/deeper/second_pipeline.py @@ -15,7 +15,13 @@ class SecondPipeline(FeatureSetPipeline): def __init__(self): super(SecondPipeline, self).__init__( source=Source( - readers=[TableReader(id="t", database="db", table="table",)], + readers=[ + TableReader( + id="t", + database="db", + table="table", + ) + ], query=f"select * from t", # noqa ), feature_set=FeatureSet( @@ -24,7 +30,9 @@ def __init__(self): description="description", features=[ Feature( - name="feature1", description="test", dtype=DataType.STRING, + name="feature1", + description="test", + dtype=DataType.STRING, ), Feature( name="feature2", @@ -34,7 +42,9 @@ def __init__(self): ], keys=[ KeyFeature( - name="id", description="identifier", dtype=DataType.BIGINT, + name="id", + description="identifier", + dtype=DataType.BIGINT, ) ], timestamp=TimestampFeature(), diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index 12d8ac9d..b2418a7c 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -69,7 +69,8 @@ def test_read( assert target_df.collect() == result_df.collect() @pytest.mark.parametrize( - "format, path", [(None, "path/to/file"), ("csv", 123)], + "format, path", + [(None, "path/to/file"), ("csv", 123)], ) def test_read_invalid_params(self, format: Optional[str], path: Any) -> None: # arrange @@ -115,7 +116,8 @@ def test_read_table( assert target_df == result_df @pytest.mark.parametrize( - "database, table", [("database", None), ("database", 123)], + "database, table", + [("database", None), ("database", 123)], ) def test_read_table_invalid_params( self, database: str, table: Optional[int] @@ -128,7 +130,8 @@ def test_read_table_invalid_params( spark_client.read_table(table, database) # type: ignore @pytest.mark.parametrize( - "format, mode", [("parquet", "append"), ("csv", "overwrite")], + "format, mode", + [("parquet", "append"), ("csv", "overwrite")], ) def test_write_dataframe( self, format: str, mode: str, mocked_spark_write: Mock @@ -137,7 +140,8 @@ def test_write_dataframe( mocked_spark_write.save.assert_called_with(format=format, mode=mode) @pytest.mark.parametrize( - "format, mode", [(None, "append"), ("parquet", 1)], + "format, mode", + [(None, "append"), ("parquet", 1)], ) def test_write_dataframe_invalid_params( self, target_df: DataFrame, format: Optional[str], mode: Union[str, int] @@ -266,7 +270,7 @@ def test_create_temporary_view( def test_add_table_partitions(self, mock_spark_sql: Mock): # arrange target_command = ( - f"ALTER TABLE `db`.`table` ADD IF NOT EXISTS " + f"ALTER TABLE `db`.`table` ADD IF NOT EXISTS " # noqa f"PARTITION ( year = 2020, month = 8, day = 14 ) " f"PARTITION ( year = 2020, month = 8, day = 15 ) " f"PARTITION ( year = 2020, month = 8, day = 16 )" diff --git a/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py b/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py index 669fd033..fed20f2d 100644 --- a/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py +++ b/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py @@ -28,7 +28,8 @@ def test_filter(self, feature_set_dataframe, spark_context, spark_session): assert result_df.collect() == target_df.collect() @pytest.mark.parametrize( - "condition", [None, 100], + 
"condition", + [None, 100], ) def test_filter_with_invalidations( self, feature_set_dataframe, condition, spark_context, spark_session diff --git a/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py b/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py index e716f9d6..cfe730d3 100644 --- a/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py +++ b/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py @@ -9,7 +9,9 @@ class TestPivotTransform: def test_pivot_transformation( - self, input_df, pivot_df, + self, + input_df, + pivot_df, ): result_df = pivot( dataframe=input_df, @@ -20,10 +22,15 @@ def test_pivot_transformation( ) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_df, + ) def test_pivot_transformation_with_forward_fill( - self, input_df, pivot_ffill_df, + self, + input_df, + pivot_ffill_df, ): result_df = pivot( dataframe=input_df, @@ -35,10 +42,15 @@ def test_pivot_transformation_with_forward_fill( ) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_ffill_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_ffill_df, + ) def test_pivot_transformation_with_forward_fill_and_mock( - self, input_df, pivot_ffill_mock_df, + self, + input_df, + pivot_ffill_mock_df, ): result_df = pivot( dataframe=input_df, @@ -52,10 +64,15 @@ def test_pivot_transformation_with_forward_fill_and_mock( ) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_ffill_mock_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_ffill_mock_df, + ) def test_pivot_transformation_mock_without_type( - self, input_df, pivot_ffill_mock_df, + self, + input_df, + pivot_ffill_mock_df, ): with pytest.raises(AttributeError): _ = pivot( @@ -83,4 +100,7 @@ def test_apply_pivot_transformation(self, input_df, pivot_df): result_df = file_reader._apply_transformations(input_df) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_df, + ) diff --git a/tests/unit/butterfree/extract/readers/test_file_reader.py b/tests/unit/butterfree/extract/readers/test_file_reader.py index 9e1c42bc..136c8fd6 100644 --- a/tests/unit/butterfree/extract/readers/test_file_reader.py +++ b/tests/unit/butterfree/extract/readers/test_file_reader.py @@ -7,7 +7,15 @@ class TestFileReader: @pytest.mark.parametrize( - "path, format", [(None, "parquet"), ("path/to/file.json", 123), (123, None,)], + "path, format", + [ + (None, "parquet"), + ("path/to/file.json", 123), + ( + 123, + None, + ), + ], ) def test_init_invalid_params(self, path, format): # act and assert diff --git a/tests/unit/butterfree/extract/readers/test_reader.py b/tests/unit/butterfree/extract/readers/test_reader.py index 78160553..bcceacbd 100644 --- a/tests/unit/butterfree/extract/readers/test_reader.py +++ b/tests/unit/butterfree/extract/readers/test_reader.py @@ -148,7 +148,8 @@ def test_build_with_columns( # act file_reader.build( - client=spark_client, columns=[("col1", "new_col1"), ("col2", "new_col2")], + client=spark_client, + columns=[("col1", "new_col1"), ("col2", "new_col2")], ) result_df = spark_session.sql("select * from test") diff --git a/tests/unit/butterfree/extract/readers/test_table_reader.py b/tests/unit/butterfree/extract/readers/test_table_reader.py index 65f3be23..1a2f56f2 100644 --- 
a/tests/unit/butterfree/extract/readers/test_table_reader.py +++ b/tests/unit/butterfree/extract/readers/test_table_reader.py @@ -5,7 +5,14 @@ class TestTableReader: @pytest.mark.parametrize( - "database, table", [("database", 123), (123, None,)], + "database, table", + [ + ("database", 123), + ( + 123, + None, + ), + ], ) def test_init_invalid_params(self, database, table): # act and assert diff --git a/tests/unit/butterfree/extract/test_source.py b/tests/unit/butterfree/extract/test_source.py index 53af8b65..842d2210 100644 --- a/tests/unit/butterfree/extract/test_source.py +++ b/tests/unit/butterfree/extract/test_source.py @@ -14,7 +14,8 @@ def test_construct(self, mocker, target_df): # when source_selector = Source( - readers=[reader], query=f"select * from {reader_id}", # noqa + readers=[reader], + query=f"select * from {reader_id}", # noqa ) result_df = source_selector.construct(spark_client) @@ -32,7 +33,8 @@ def test_is_cached(self, mocker, target_df): # when source_selector = Source( - readers=[reader], query=f"select * from {reader_id}", # noqa + readers=[reader], + query=f"select * from {reader_id}", # noqa ) result_df = source_selector.construct(spark_client) diff --git a/tests/unit/butterfree/load/conftest.py b/tests/unit/butterfree/load/conftest.py index 4dcf25c9..d0bb2c3b 100644 --- a/tests/unit/butterfree/load/conftest.py +++ b/tests/unit/butterfree/load/conftest.py @@ -20,7 +20,11 @@ def feature_set(): ] ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN) features = [ - Feature(name="feature", description="Description", dtype=DataType.BIGINT,) + Feature( + name="feature", + description="Description", + dtype=DataType.BIGINT, + ) ] return FeatureSet( "feature_set", diff --git a/tests/unit/butterfree/load/processing/test_json_transform.py b/tests/unit/butterfree/load/processing/test_json_transform.py index 73949eea..78320d10 100644 --- a/tests/unit/butterfree/load/processing/test_json_transform.py +++ b/tests/unit/butterfree/load/processing/test_json_transform.py @@ -3,7 +3,9 @@ class TestJsonTransform: def test_json_transformation( - self, input_df, json_df, + self, + input_df, + json_df, ): result_df = json_transform(dataframe=input_df) diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index dcd96714..237158b7 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -45,10 +45,18 @@ def feature_set(): entity="entity", description="description", features=[ - Feature(name="feature_float", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature_float", + description="test", + dtype=DataType.FLOAT, + ), ], keys=[ - KeyFeature(name="id", description="The device ID", dtype=DataType.BIGINT,) + KeyFeature( + name="id", + description="The device ID", + dtype=DataType.BIGINT, + ) ], timestamp=TimestampFeature(), ) diff --git a/tests/unit/butterfree/pipelines/conftest.py b/tests/unit/butterfree/pipelines/conftest.py index 47e65efb..f17e5f41 100644 --- a/tests/unit/butterfree/pipelines/conftest.py +++ b/tests/unit/butterfree/pipelines/conftest.py @@ -23,7 +23,13 @@ def feature_set_pipeline(): spark_client=SparkClient(), source=Mock( spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], + readers=[ + TableReader( + id="source_a", + database="db", + table="table", + ) + ], query="select * from source_a", ), feature_set=Mock( @@ -57,7 +63,10 @@ def 
feature_set_pipeline(): ), ], ), - sink=Mock(spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)],), + sink=Mock( + spec=Sink, + writers=[HistoricalFeatureStoreWriter(db_config=None)], + ), ) return test_pipeline diff --git a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py index 7bae6606..5a67e77d 100644 --- a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py @@ -22,6 +22,20 @@ from butterfree.transform.utils import Function +def get_reader(): + table_reader = TableReader( + id="source_a", + database="db", + table="table", + ) + + return table_reader + + +def get_historical_writer(): + return HistoricalFeatureStoreWriter(db_config=None) + + class TestFeatureSetPipeline: def test_feature_set_args(self): # arrange and act @@ -38,8 +52,12 @@ def test_feature_set_args(self): pipeline = FeatureSetPipeline( source=Source( readers=[ - TableReader(id="source_a", database="db", table="table",), - FileReader(id="source_b", path="path", format="parquet",), + get_reader(), + FileReader( + id="source_b", + path="path", + format="parquet", + ), ], query="select a.*, b.specific_feature " "from source_a left join source_b on a.id=b.id", @@ -131,7 +149,7 @@ def test_source_raise(self): source=Mock( spark_client=SparkClient(), readers=[ - TableReader(id="source_a", database="db", table="table",), + get_reader(), ], query="select * from source_a", ), @@ -167,7 +185,8 @@ def test_source_raise(self): ], ), sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], + spec=Sink, + writers=[get_historical_writer()], ), ) @@ -180,7 +199,7 @@ def test_feature_set_raise(self): source=Mock( spec=Source, readers=[ - TableReader(id="source_a", database="db", table="table",), + get_reader(), ], query="select * from source_a", ), @@ -215,7 +234,8 @@ def test_feature_set_raise(self): ], ), sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], + spec=Sink, + writers=[get_historical_writer()], ), ) @@ -226,7 +246,7 @@ def test_sink_raise(self): source=Mock( spec=Source, readers=[ - TableReader(id="source_a", database="db", table="table",), + get_reader(), ], query="select * from source_a", ), @@ -250,7 +270,9 @@ def test_sink_raise(self): key_columns=["user_id"], timestamp_column="ts", ), - sink=Mock(writers=[HistoricalFeatureStoreWriter(db_config=None)],), + sink=Mock( + writers=[get_historical_writer()], + ), ) def test_run_agg_with_end_date(self, spark_session, feature_set_pipeline): diff --git a/tests/unit/butterfree/reports/test_metadata.py b/tests/unit/butterfree/reports/test_metadata.py index 6f26cc55..093721df 100644 --- a/tests/unit/butterfree/reports/test_metadata.py +++ b/tests/unit/butterfree/reports/test_metadata.py @@ -16,49 +16,63 @@ from butterfree.transform.utils import Function +def get_pipeline(): + + return FeatureSetPipeline( + source=Source( + readers=[ + TableReader( + id="source_a", + database="db", + table="table", + ), + FileReader( + id="source_b", + path="path", + format="parquet", + ), + ], + query="select a.*, b.specific_feature " + "from source_a left join source_b on a.id=b.id", + ), + feature_set=FeatureSet( + name="feature_set", + entity="entity", + description="description", + keys=[ + KeyFeature( + name="user_id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(from_column="ts"), + features=[ + Feature( + 
name="page_viewed__rent_per_month", + description="Average of something.", + transformation=SparkFunctionTransform( + functions=[ + Function(functions.avg, DataType.FLOAT), + Function(functions.stddev_pop, DataType.DOUBLE), + ], + ), + ), + ], + ), + sink=Sink( + writers=[ + HistoricalFeatureStoreWriter(db_config=None), + OnlineFeatureStoreWriter(db_config=None), + ], + ), + ) + + class TestMetadata: def test_json(self): - pipeline = FeatureSetPipeline( - source=Source( - readers=[ - TableReader(id="source_a", database="db", table="table",), - FileReader(id="source_b", path="path", format="parquet",), - ], - query="select a.*, b.specific_feature " - "from source_a left join source_b on a.id=b.id", - ), - feature_set=FeatureSet( - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.DOUBLE), - ], - ), - ), - ], - ), - sink=Sink( - writers=[ - HistoricalFeatureStoreWriter(db_config=None), - OnlineFeatureStoreWriter(db_config=None), - ], - ), - ) + + pipeline = get_pipeline() target_json = [ { @@ -102,47 +116,8 @@ def test_json(self): assert json == target_json def test_markdown(self): - pipeline = FeatureSetPipeline( - source=Source( - readers=[ - TableReader(id="source_a", database="db", table="table",), - FileReader(id="source_b", path="path", format="parquet",), - ], - query="select a.*, b.specific_feature " - "from source_a left join source_b on a.id=b.id", - ), - feature_set=FeatureSet( - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.DOUBLE), - ], - ), - ), - ], - ), - sink=Sink( - writers=[ - HistoricalFeatureStoreWriter(db_config=None), - OnlineFeatureStoreWriter(db_config=None), - ], - ), - ) + + pipeline = get_pipeline() target_md = ( "\n# Feature_set\n\n## Description\n\n\ndescription \n\n" diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index ab760640..fcf60132 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -455,6 +455,12 @@ def agg_feature_set(): ), ), ], - keys=[KeyFeature(name="id", description="description", dtype=DataType.BIGINT,)], + keys=[ + KeyFeature( + name="id", + description="description", + dtype=DataType.BIGINT, + ) + ], timestamp=TimestampFeature(), ) diff --git a/tests/unit/butterfree/transform/features/test_feature.py b/tests/unit/butterfree/transform/features/test_feature.py index 14a89f2c..01bb41e5 100644 --- a/tests/unit/butterfree/transform/features/test_feature.py +++ b/tests/unit/butterfree/transform/features/test_feature.py @@ -98,7 +98,9 @@ def test_feature_transform_with_from_column_and_column_name_exists( def test_feature_transform_with_dtype(self, feature_set_dataframe): test_feature = 
Feature( - name="feature", description="unit test", dtype=DataType.TIMESTAMP, + name="feature", + description="unit test", + dtype=DataType.TIMESTAMP, ) df = test_feature.transform(feature_set_dataframe) diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 73320cf5..38ec249a 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -44,7 +44,10 @@ def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe): ).construct(dataframe, spark_client) def test_agg_feature_set_with_window( - self, dataframe, rolling_windows_agg_dataframe, agg_feature_set, + self, + dataframe, + rolling_windows_agg_dataframe, + agg_feature_set, ): spark_client = SparkClient() @@ -61,7 +64,10 @@ def test_agg_feature_set_with_window( assert_dataframe_equality(output_df, rolling_windows_agg_dataframe) def test_agg_feature_set_with_smaller_slide( - self, dataframe, rolling_windows_hour_slide_agg_dataframe, agg_feature_set, + self, + dataframe, + rolling_windows_hour_slide_agg_dataframe, + agg_feature_set, ): spark_client = SparkClient() @@ -366,7 +372,9 @@ def test_define_start_date(self, agg_feature_set): assert start_date == "2020-07-27" def test_feature_set_start_date( - self, timestamp_c, feature_set_with_distinct_dataframe, + self, + timestamp_c, + feature_set_with_distinct_dataframe, ): fs = AggregatedFeatureSet( name="name", diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index 43d937be..e907dc0a 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -3,12 +3,6 @@ import pytest from pyspark.sql import functions as F from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType -from tests.unit.butterfree.transform.conftest import ( - feature_add, - feature_divide, - key_id, - timestamp_c, -) from butterfree.clients import SparkClient from butterfree.constants import DataType @@ -20,6 +14,12 @@ SQLExpressionTransform, ) from butterfree.transform.utils import Function +from tests.unit.butterfree.transform.conftest import ( + feature_add, + feature_divide, + key_id, + timestamp_c, +) class TestFeatureSet: @@ -70,7 +70,14 @@ class TestFeatureSet: None, [feature_add, feature_divide], ), - ("name", "entity", "description", [key_id], timestamp_c, [None],), + ( + "name", + "entity", + "description", + [key_id], + timestamp_c, + [None], + ), ], ) def test_cannot_instantiate( diff --git a/tests/unit/butterfree/transform/transformations/conftest.py b/tests/unit/butterfree/transform/transformations/conftest.py index 8f3c13bf..41bc63d5 100644 --- a/tests/unit/butterfree/transform/transformations/conftest.py +++ b/tests/unit/butterfree/transform/transformations/conftest.py @@ -62,7 +62,7 @@ def target_df_spark(spark_context, spark_session): "timestamp": "2016-04-11 11:31:11", "feature1": 200, "feature2": 200, - "feature__cos": 0.4871876750070059, + "feature__cos": 0.48718767500700594, }, { "id": 1, diff --git a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py index 6cdebf74..f0ae2f85 100644 --- a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py @@ -44,7 
+44,10 @@ def test_output_columns(self): assert all( [ a == b - for a, b in zip(df_columns, ["feature1__avg", "feature1__stddev_pop"],) + for a, b in zip( + df_columns, + ["feature1__avg", "feature1__stddev_pop"], + ) ] ) diff --git a/tests/unit/butterfree/transform/transformations/test_custom_transform.py b/tests/unit/butterfree/transform/transformations/test_custom_transform.py index 4198d9bd..d87cc7cb 100644 --- a/tests/unit/butterfree/transform/transformations/test_custom_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_custom_transform.py @@ -21,7 +21,9 @@ def test_feature_transform(self, feature_set_dataframe): description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ) @@ -44,7 +46,9 @@ def test_output_columns(self, feature_set_dataframe): description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ) @@ -59,7 +63,9 @@ def test_custom_transform_output(self, feature_set_dataframe): description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ) diff --git a/tests/unit/butterfree/transform/transformations/test_h3_transform.py b/tests/unit/butterfree/transform/transformations/test_h3_transform.py index 4b3308eb..d4ad6493 100644 --- a/tests/unit/butterfree/transform/transformations/test_h3_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_h3_transform.py @@ -64,9 +64,9 @@ def test_import_error(self): for m in modules: del sys.modules[m] with pytest.raises(ModuleNotFoundError, match="you must install"): - from butterfree.transform.transformations.h3_transform import ( # noqa - H3HashTransform, # noqa - ) # noqa + from butterfree.transform.transformations.h3_transform import ( # noqa; noqa + H3HashTransform, + ) def test_with_stack(self, h3_input_df, h3_with_stack_target_df): # arrange diff --git a/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py b/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py index fe8bca85..cf88657a 100644 --- a/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py @@ -126,7 +126,9 @@ def test_feature_transform_output_row_windows( transformation=SparkFunctionTransform( functions=[Function(functions.avg, DataType.DOUBLE)], ).with_window( - partition_by="id", mode="row_windows", window_definition=["2 events"], + partition_by="id", + mode="row_windows", + window_definition=["2 events"], ), ) diff --git a/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py b/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py index 9cc2e687..814f8301 100644 --- a/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py @@ -43,7 +43,15 @@ def test_output_columns(self): df_columns = test_feature.get_output_columns() - assert all([a == b for a, b in zip(df_columns, ["feature1_over_feature2"],)]) + assert all( + [ + a == b + for a, b in zip( + df_columns, + 
["feature1_over_feature2"], + ) + ] + ) def test_feature_transform_output(self, feature_set_dataframe): test_feature = Feature( From 5af8a05a841c02114e08d90455e9cd772a1980c3 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 3 Jun 2024 15:03:56 -0300 Subject: [PATCH 60/86] fix: sphinx version (#356) * fix: sphinx --- Makefile | 2 +- requirements.dev.txt | 6 +++--- setup.cfg | 6 ++++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index ba0d0ead..3164f503 100644 --- a/Makefile +++ b/Makefile @@ -152,7 +152,7 @@ package: ## update Butterfree API docs update-docs: cd ./docs; rm -rf source/butterfree.* - cd ./docs; sphinx-apidoc -T -E -o source/ ../butterfree + cd ./docs; sphinx-apidoc -o source/ ../butterfree cd ./docs; make coverage .PHONY: docs diff --git a/requirements.dev.txt b/requirements.dev.txt index 89025669..bf4b4b2b 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -2,9 +2,9 @@ h3==3.7.7 jupyter==1.0.0 twine==3.1.1 mypy==1.10.0 -sphinx==3.5.4 -sphinxemoji==0.1.8 -sphinx-rtd-theme==0.5.2 +sphinx==6.2.1 +sphinxemoji==0.3.1 +sphinx-rtd-theme==1.3.0 recommonmark==0.7.1 pyarrow==16.1.0 setuptools==70.0.0 diff --git a/setup.cfg b/setup.cfg index 849d35cf..8206c6ae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,3 +41,9 @@ disallow_any_generics = True disallow_untyped_defs = True check_untyped_defs = True disallow_untyped_calls = True + +[build_sphinx] +all-files = 1 +source-dir = docs/source +build-dir = docs/build +warning-is-error = 0 From cbda73d974d7e2058de8823800f2df0fa8bf6160 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 7 Jun 2024 13:46:16 -0300 Subject: [PATCH 61/86] fix: publish and dev versions (#359) * fix: publish, versions, tests --- .github/workflows/publish.yml | 3 +- Makefile | 4 +- butterfree/_cli/main.py | 2 +- butterfree/_cli/migrate.py | 6 ++- butterfree/clients/__init__.py | 1 + butterfree/clients/abstract_client.py | 5 ++- butterfree/clients/cassandra_client.py | 5 ++- butterfree/clients/spark_client.py | 25 +++++++----- butterfree/configs/db/cassandra_config.py | 25 ++++++------ butterfree/configs/db/kafka_config.py | 17 ++++---- butterfree/configs/db/metastore_config.py | 8 ++-- butterfree/configs/environment.py | 5 ++- butterfree/constants/__init__.py | 1 + butterfree/constants/migrations.py | 1 + butterfree/dataframe_service/__init__.py | 1 + .../dataframe_service/incremental_strategy.py | 17 ++++++-- butterfree/dataframe_service/repartition.py | 13 ++++--- butterfree/extract/__init__.py | 1 + butterfree/extract/pre_processing/__init__.py | 1 + .../explode_json_column_transform.py | 1 + .../pre_processing/filter_transform.py | 1 + .../pre_processing/forward_fill_transform.py | 5 ++- .../extract/pre_processing/pivot_transform.py | 7 ++-- .../pre_processing/replace_transform.py | 1 + butterfree/extract/readers/__init__.py | 1 + butterfree/extract/readers/file_reader.py | 7 ++-- butterfree/extract/readers/kafka_reader.py | 7 ++-- butterfree/extract/readers/reader.py | 10 +++-- butterfree/extract/readers/table_reader.py | 4 +- butterfree/extract/source.py | 7 +++- butterfree/hooks/__init__.py | 1 + .../hooks/schema_compatibility/__init__.py | 1 + .../spark_table_schema_compatibility_hook.py | 6 ++- butterfree/load/processing/__init__.py | 1 + butterfree/load/processing/json_transform.py | 1 + butterfree/load/sink.py | 1 + .../historical_feature_store_writer.py | 19 +++++---- .../writers/online_feature_store_writer.py | 20 ++++++---- butterfree/load/writers/writer.py | 16 +++++--- 
.../database_migration/database_migration.py | 14 ++++--- .../database_migration/metastore_migration.py | 4 +- butterfree/pipelines/__init__.py | 1 + butterfree/pipelines/feature_set_pipeline.py | 23 +++++------ butterfree/reports/__init__.py | 1 + butterfree/testing/dataframe/__init__.py | 5 ++- .../transform/aggregated_feature_set.py | 19 ++++----- butterfree/transform/feature_set.py | 9 +++-- butterfree/transform/features/feature.py | 9 +++-- butterfree/transform/features/key_feature.py | 6 ++- .../transform/features/timestamp_feature.py | 11 ++++-- .../transformations/aggregated_transform.py | 7 +++- .../transformations/custom_transform.py | 2 +- .../spark_function_transform.py | 11 ++++-- .../transformations/transform_component.py | 1 + .../user_defined_functions/mode.py | 1 + .../most_frequent_set.py | 1 + butterfree/transform/utils/__init__.py | 1 + butterfree/transform/utils/date_range.py | 6 +-- butterfree/transform/utils/function.py | 4 +- butterfree/transform/utils/window_spec.py | 5 ++- butterfree/validations/basic_validaton.py | 4 +- butterfree/validations/validation.py | 4 +- docs/source/conf.py | 1 + mypy.ini | 39 ------------------- requirements.lint.txt | 2 +- requirements.txt | 2 +- .../butterfree/pipelines/conftest.py | 5 ++- .../test_aggregated_transform.py | 2 +- 68 files changed, 262 insertions(+), 196 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 0957a958..d33e4aa0 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -9,8 +9,7 @@ jobs: Pipeline: if: github.ref == 'refs/heads/master' - runs-on: ubuntu-22.04 - container: quintoandar/python-3-7-java + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 diff --git a/Makefile b/Makefile index 3164f503..bf9ccd64 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,8 @@ VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d .PHONY: environment ## create virtual environment for butterfree environment: - @pyenv install -s 3.7.13 - @pyenv virtualenv 3.7.13 butterfree + @pyenv install -s 3.9.19 + @pyenv virtualenv 3.9.19 butterfree @pyenv local butterfree @PYTHONPATH=. 
python -m pip install --upgrade pip diff --git a/butterfree/_cli/main.py b/butterfree/_cli/main.py index 636fdb25..b8b12f14 100644 --- a/butterfree/_cli/main.py +++ b/butterfree/_cli/main.py @@ -2,7 +2,7 @@ from butterfree._cli import migrate -app = typer.Typer() +app = typer.Typer(no_args_is_help=True) app.add_typer(migrate.app, name="migrate") if __name__ == "__main__": diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index ed62f1a2..f5161509 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -16,7 +16,9 @@ from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline -app = typer.Typer(help="Apply the automatic migrations in a database.") +app = typer.Typer( + help="Apply the automatic migrations in a database.", no_args_is_help=True +) logger = __logger("migrate", True) @@ -89,7 +91,7 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: instances.add(value) logger.info("Creating instances...") - return set(value() for value in instances) + return set(value() for value in instances) # type: ignore PATH = typer.Argument( diff --git a/butterfree/clients/__init__.py b/butterfree/clients/__init__.py index 5f6f0ffa..7e8d1a95 100644 --- a/butterfree/clients/__init__.py +++ b/butterfree/clients/__init__.py @@ -1,4 +1,5 @@ """Holds connection clients.""" + from butterfree.clients.abstract_client import AbstractClient from butterfree.clients.cassandra_client import CassandraClient from butterfree.clients.spark_client import SparkClient diff --git a/butterfree/clients/abstract_client.py b/butterfree/clients/abstract_client.py index ce5d33b6..b9027bd8 100644 --- a/butterfree/clients/abstract_client.py +++ b/butterfree/clients/abstract_client.py @@ -1,6 +1,7 @@ """Abstract class for database clients.""" + from abc import ABC, abstractmethod -from typing import Any +from typing import Any, Optional class AbstractClient(ABC): @@ -25,7 +26,7 @@ def sql(self, query: str) -> Any: pass @abstractmethod - def get_schema(self, table: str, database: str = None) -> Any: + def get_schema(self, table: str, database: Optional[str] = None) -> Any: """Returns desired table schema. Attributes: diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 5a723155..714e8248 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -1,4 +1,5 @@ """CassandraClient entity.""" + from ssl import CERT_REQUIRED, PROTOCOL_TLSv1 from typing import Dict, List, Optional @@ -102,7 +103,9 @@ def sql(self, query: str) -> ResponseFuture: """ return self.conn.execute(query) - def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: + def get_schema( + self, table: str, database: Optional[str] = None + ) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index e2b868ca..933c2165 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -69,7 +69,7 @@ def read( return df_reader.format(format).load(path=path, **options) # type: ignore - def read_table(self, table: str, database: str = None) -> DataFrame: + def read_table(self, table: str, database: Optional[str] = None) -> DataFrame: """Use the SparkSession.read interface to read a metastore table. 
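# A minimal usage sketch of the SparkClient surface touched above, assuming a
# metastore table `datalake.events` already exists; the table and database
# names are illustrative assumptions, not project fixtures.
from butterfree.clients import SparkClient

spark_client = SparkClient()
events_df = spark_client.read_table("events", database="datalake")  # batch read from the metastore
events_df.createOrReplaceTempView("events")
recent_df = spark_client.sql("select * from events where year = 2024")  # plain Spark SQL through the client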
Args: @@ -179,9 +179,9 @@ def write_table( database: Optional[str], table_name: str, path: str, - format_: str = None, - mode: str = None, - partition_by: List[str] = None, + format_: Optional[str] = None, + mode: Optional[str] = None, + partition_by: Optional[List[str]] = None, **options: Any, ) -> None: """Receive a spark DataFrame and write it as a table in metastore. @@ -231,7 +231,10 @@ def create_temporary_view(dataframe: DataFrame, name: str) -> Any: return dataframe.writeStream.format("memory").queryName(name).start() def add_table_partitions( - self, partitions: List[Dict[str, Any]], table: str, database: str = None + self, + partitions: List[Dict[str, Any]], + table: str, + database: Optional[str] = None, ) -> None: """Add partitions to an existing table. @@ -259,9 +262,11 @@ def add_table_partitions( key_values_expr = [ ", ".join( [ - "{} = {}".format(k, v) - if not isinstance(v, str) - else "{} = '{}'".format(k, v) + ( + "{} = {}".format(k, v) + if not isinstance(v, str) + else "{} = '{}'".format(k, v) + ) for k, v in partition.items() ] ) @@ -314,7 +319,9 @@ def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]: return converted_schema - def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: + def get_schema( + self, table: str, database: Optional[str] = None + ) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index a038cb17..d60bb697 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -1,4 +1,5 @@ """Holds configurations to read and write with Spark to Cassandra DB.""" + from typing import Any, Dict, List, Optional from butterfree.configs import environment @@ -32,18 +33,18 @@ class CassandraConfig(AbstractWriteConfig): def __init__( self, - username: str = None, - password: str = None, - host: str = None, - keyspace: str = None, - mode: str = None, - format_: str = None, - stream_processing_time: str = None, - stream_output_mode: str = None, - stream_checkpoint_path: str = None, - read_consistency_level: str = None, - write_consistency_level: str = None, - local_dc: str = None, + username: Optional[str] = None, + password: Optional[str] = None, + host: Optional[str] = None, + keyspace: Optional[str] = None, + mode: Optional[str] = None, + format_: Optional[str] = None, + stream_processing_time: Optional[str] = None, + stream_output_mode: Optional[str] = None, + stream_checkpoint_path: Optional[str] = None, + read_consistency_level: Optional[str] = None, + write_consistency_level: Optional[str] = None, + local_dc: Optional[str] = None, ): self.username = username self.password = password diff --git a/butterfree/configs/db/kafka_config.py b/butterfree/configs/db/kafka_config.py index 79cad15b..e0c14baf 100644 --- a/butterfree/configs/db/kafka_config.py +++ b/butterfree/configs/db/kafka_config.py @@ -1,4 +1,5 @@ """Holds configurations to read and write with Spark to Kafka.""" + from typing import Any, Dict, List, Optional from butterfree.configs import environment @@ -25,13 +26,13 @@ class KafkaConfig(AbstractWriteConfig): def __init__( self, - kafka_topic: str = None, - kafka_connection_string: str = None, - mode: str = None, - format_: str = None, - stream_processing_time: str = None, - stream_output_mode: str = None, - stream_checkpoint_path: str = None, + kafka_topic: Optional[str] = None, + kafka_connection_string: Optional[str] = None, + mode: Optional[str] = 
None, + format_: Optional[str] = None, + stream_processing_time: Optional[str] = None, + stream_output_mode: Optional[str] = None, + stream_checkpoint_path: Optional[str] = None, ): self.kafka_topic = kafka_topic self.kafka_connection_string = kafka_connection_string @@ -147,4 +148,4 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: Kafka schema. """ - pass + return [{}] diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index ff7ed01d..323aded0 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -25,10 +25,10 @@ class MetastoreConfig(AbstractWriteConfig): def __init__( self, - path: str = None, - mode: str = None, - format_: str = None, - file_system: str = None, + path: Optional[str] = None, + mode: Optional[str] = None, + format_: Optional[str] = None, + file_system: Optional[str] = None, ): self.path = path self.mode = mode diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index f56efc5d..f6ba18a5 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -1,4 +1,5 @@ """Holds functions for managing the running environment.""" + import os from typing import Optional @@ -34,7 +35,9 @@ def __init__(self, variable_name: str): ) -def get_variable(variable_name: str, default_value: str = None) -> Optional[str]: +def get_variable( + variable_name: str, default_value: Optional[str] = None +) -> Optional[str]: """Gets an environment variable. The variable comes from it's explicitly declared value in the running diff --git a/butterfree/constants/__init__.py b/butterfree/constants/__init__.py index ec70d41b..aa0c76e6 100644 --- a/butterfree/constants/__init__.py +++ b/butterfree/constants/__init__.py @@ -1,4 +1,5 @@ """Holds constant attributes that are common for Butterfree.""" + from butterfree.constants.data_type import DataType __all__ = ["DataType"] diff --git a/butterfree/constants/migrations.py b/butterfree/constants/migrations.py index b1c0947d..f31d0841 100644 --- a/butterfree/constants/migrations.py +++ b/butterfree/constants/migrations.py @@ -1,4 +1,5 @@ """Migrations' Constants.""" + from butterfree.constants import columns PARTITION_BY = [ diff --git a/butterfree/dataframe_service/__init__.py b/butterfree/dataframe_service/__init__.py index c227dae2..5fd02d45 100644 --- a/butterfree/dataframe_service/__init__.py +++ b/butterfree/dataframe_service/__init__.py @@ -1,4 +1,5 @@ """Dataframe optimization components regarding Butterfree.""" + from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy from butterfree.dataframe_service.partitioning import extract_partition_values from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df diff --git a/butterfree/dataframe_service/incremental_strategy.py b/butterfree/dataframe_service/incremental_strategy.py index 6554d3b7..957064f1 100644 --- a/butterfree/dataframe_service/incremental_strategy.py +++ b/butterfree/dataframe_service/incremental_strategy.py @@ -2,6 +2,8 @@ from __future__ import annotations +from typing import Optional + from pyspark.sql import DataFrame @@ -18,7 +20,7 @@ class IncrementalStrategy: filter can properly work with the defined upper and lower bounds. 
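# A minimal sketch of the IncrementalStrategy API defined above, assuming a
# DataFrame `events_df` with a millisecond epoch column named "ts"; the
# DataFrame and the dates are illustrative assumptions.
from butterfree.dataframe_service import IncrementalStrategy

strategy = IncrementalStrategy().from_milliseconds("ts")

# get_expression builds the SQL predicate for the defined boundaries, e.g.
# "date(from_unixtime(ts/ 1000.0)) <= date('2016-04-13')" when only end_date is set.
predicate = strategy.get_expression(end_date="2016-04-13")

# filter_with_incremental_strategy applies the same predicate to the DataFrame.
bounded_df = strategy.filter_with_incremental_strategy(
    dataframe=events_df, start_date="2016-04-11", end_date="2016-04-13"
)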
""" - def __init__(self, column: str = None): + def __init__(self, column: Optional[str] = None): self.column = column def from_milliseconds(self, column_name: str) -> IncrementalStrategy: @@ -32,7 +34,9 @@ def from_milliseconds(self, column_name: str) -> IncrementalStrategy: """ return IncrementalStrategy(column=f"from_unixtime({column_name}/ 1000.0)") - def from_string(self, column_name: str, mask: str = None) -> IncrementalStrategy: + def from_string( + self, column_name: str, mask: Optional[str] = None + ) -> IncrementalStrategy: """Create a column expression from ts column defined as a simple string. Args: @@ -66,7 +70,9 @@ def from_year_month_day_partitions( f"'-', string({day_column}))" ) - def get_expression(self, start_date: str = None, end_date: str = None) -> str: + def get_expression( + self, start_date: Optional[str] = None, end_date: Optional[str] = None + ) -> str: """Get the incremental filter expression using the defined dates. Both arguments can be set to defined a specific date interval, but it's @@ -95,7 +101,10 @@ def get_expression(self, start_date: str = None, end_date: str = None) -> str: return f"date({self.column}) <= date('{end_date}')" def filter_with_incremental_strategy( - self, dataframe: DataFrame, start_date: str = None, end_date: str = None + self, + dataframe: DataFrame, + start_date: Optional[str] = None, + end_date: Optional[str] = None, ) -> DataFrame: """Filters the dataframe according to the date boundaries. diff --git a/butterfree/dataframe_service/repartition.py b/butterfree/dataframe_service/repartition.py index 8635557f..e84202ba 100644 --- a/butterfree/dataframe_service/repartition.py +++ b/butterfree/dataframe_service/repartition.py @@ -1,5 +1,6 @@ """Module where there are repartition methods.""" -from typing import List + +from typing import List, Optional from pyspark.sql.dataframe import DataFrame @@ -10,7 +11,7 @@ def _num_partitions_definition( - num_processors: int = None, num_partitions: int = None + num_processors: Optional[int] = None, num_partitions: Optional[int] = None ) -> int: num_partitions = ( num_processors * PARTITION_PROCESSOR_RATIO @@ -24,8 +25,8 @@ def _num_partitions_definition( def repartition_df( dataframe: DataFrame, partition_by: List[str], - num_partitions: int = None, - num_processors: int = None, + num_partitions: Optional[int] = None, + num_processors: Optional[int] = None, ) -> DataFrame: """Partition the DataFrame. @@ -47,8 +48,8 @@ def repartition_sort_df( dataframe: DataFrame, partition_by: List[str], order_by: List[str], - num_processors: int = None, - num_partitions: int = None, + num_processors: Optional[int] = None, + num_partitions: Optional[int] = None, ) -> DataFrame: """Partition and Sort the DataFrame. 
diff --git a/butterfree/extract/__init__.py b/butterfree/extract/__init__.py index bb056255..64c8ae4a 100644 --- a/butterfree/extract/__init__.py +++ b/butterfree/extract/__init__.py @@ -1,4 +1,5 @@ """The Source Component of a Feature Set.""" + from butterfree.extract.source import Source __all__ = ["Source"] diff --git a/butterfree/extract/pre_processing/__init__.py b/butterfree/extract/pre_processing/__init__.py index 72b37c4d..e142de6d 100644 --- a/butterfree/extract/pre_processing/__init__.py +++ b/butterfree/extract/pre_processing/__init__.py @@ -1,4 +1,5 @@ """Pre Processing Components regarding Readers.""" + from butterfree.extract.pre_processing.explode_json_column_transform import ( explode_json_column, ) diff --git a/butterfree/extract/pre_processing/explode_json_column_transform.py b/butterfree/extract/pre_processing/explode_json_column_transform.py index db79b5ce..76c90f73 100644 --- a/butterfree/extract/pre_processing/explode_json_column_transform.py +++ b/butterfree/extract/pre_processing/explode_json_column_transform.py @@ -1,4 +1,5 @@ """Explode json column for dataframes.""" + from pyspark.sql.dataframe import DataFrame, StructType from pyspark.sql.functions import from_json, get_json_object diff --git a/butterfree/extract/pre_processing/filter_transform.py b/butterfree/extract/pre_processing/filter_transform.py index 78e5df78..a7e4fff8 100644 --- a/butterfree/extract/pre_processing/filter_transform.py +++ b/butterfree/extract/pre_processing/filter_transform.py @@ -1,4 +1,5 @@ """Module where filter DataFrames coming from readers.""" + from pyspark.sql.dataframe import DataFrame diff --git a/butterfree/extract/pre_processing/forward_fill_transform.py b/butterfree/extract/pre_processing/forward_fill_transform.py index 96d9bcdd..2d3a232d 100644 --- a/butterfree/extract/pre_processing/forward_fill_transform.py +++ b/butterfree/extract/pre_processing/forward_fill_transform.py @@ -1,6 +1,7 @@ """Forward Fill Transform for dataframes.""" + import sys -from typing import List, Union +from typing import List, Optional, Union from pyspark.sql import DataFrame, Window, functions @@ -10,7 +11,7 @@ def forward_fill( partition_by: Union[str, List[str]], order_by: Union[str, List[str]], fill_column: str, - filled_column: str = None, + filled_column: Optional[str] = None, ) -> DataFrame: """Applies a forward fill to a single column. diff --git a/butterfree/extract/pre_processing/pivot_transform.py b/butterfree/extract/pre_processing/pivot_transform.py index 078b4746..f255f457 100644 --- a/butterfree/extract/pre_processing/pivot_transform.py +++ b/butterfree/extract/pre_processing/pivot_transform.py @@ -1,5 +1,6 @@ """Pivot Transform for dataframes.""" -from typing import Callable, List, Union + +from typing import Callable, List, Optional, Union from pyspark.sql import DataFrame, functions from pyspark.sql.types import DataType @@ -13,8 +14,8 @@ def pivot( pivot_column: str, agg_column: str, aggregation: Callable, - mock_value: Union[float, str] = None, - mock_type: Union[DataType, str] = None, + mock_value: Optional[Union[float, str]] = None, + mock_type: Optional[Union[DataType, str]] = None, with_forward_fill: bool = False, ) -> DataFrame: """Defines a pivot transformation. 
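# A hedged sketch of the pivot pre-processing defined above; `readings_df`,
# its columns, and the group_by_columns argument are assumptions for
# illustration, with functions.first standing in for any Spark aggregation.
from pyspark.sql import functions

from butterfree.extract.pre_processing.pivot_transform import pivot

pivoted_df = pivot(
    dataframe=readings_df,
    group_by_columns=["id", "timestamp"],  # rows kept as-is
    pivot_column="sensor_type",            # distinct values become new columns
    agg_column="value",                    # values aggregated into each new column
    aggregation=functions.first,
    with_forward_fill=True,                # optionally fill gaps forward
)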
diff --git a/butterfree/extract/pre_processing/replace_transform.py b/butterfree/extract/pre_processing/replace_transform.py index a7dd1d67..3127c6d9 100644 --- a/butterfree/extract/pre_processing/replace_transform.py +++ b/butterfree/extract/pre_processing/replace_transform.py @@ -1,4 +1,5 @@ """Replace transformer for dataframes.""" + from itertools import chain from typing import Dict diff --git a/butterfree/extract/readers/__init__.py b/butterfree/extract/readers/__init__.py index 37da63a6..8c7bd74e 100644 --- a/butterfree/extract/readers/__init__.py +++ b/butterfree/extract/readers/__init__.py @@ -1,4 +1,5 @@ """The Reader Component of a Source.""" + from butterfree.extract.readers.file_reader import FileReader from butterfree.extract.readers.kafka_reader import KafkaReader from butterfree.extract.readers.table_reader import TableReader diff --git a/butterfree/extract/readers/file_reader.py b/butterfree/extract/readers/file_reader.py index 8cf15599..da046f08 100644 --- a/butterfree/extract/readers/file_reader.py +++ b/butterfree/extract/readers/file_reader.py @@ -1,5 +1,6 @@ """FileReader entity.""" -from typing import Any, Dict + +from typing import Any, Dict, Optional from pyspark.sql import DataFrame from pyspark.sql.types import StructType @@ -75,8 +76,8 @@ def __init__( id: str, path: str, format: str, - schema: StructType = None, - format_options: Dict[Any, Any] = None, + schema: Optional[StructType] = None, + format_options: Optional[Dict[Any, Any]] = None, stream: bool = False, ): super().__init__(id) diff --git a/butterfree/extract/readers/kafka_reader.py b/butterfree/extract/readers/kafka_reader.py index 1b8042bc..44731d20 100644 --- a/butterfree/extract/readers/kafka_reader.py +++ b/butterfree/extract/readers/kafka_reader.py @@ -1,5 +1,6 @@ """KafkaSource entity.""" -from typing import Any, Dict + +from typing import Any, Dict, Optional from pyspark.sql.dataframe import DataFrame, StructType from pyspark.sql.functions import col, struct @@ -107,8 +108,8 @@ def __init__( id: str, topic: str, value_schema: StructType, - connection_string: str = None, - topic_options: Dict[Any, Any] = None, + connection_string: Optional[str] = None, + topic_options: Optional[Dict[Any, Any]] = None, stream: bool = True, ): super().__init__(id) diff --git a/butterfree/extract/readers/reader.py b/butterfree/extract/readers/reader.py index 597c870f..5053d82c 100644 --- a/butterfree/extract/readers/reader.py +++ b/butterfree/extract/readers/reader.py @@ -21,7 +21,9 @@ class Reader(ABC, HookableComponent): """ - def __init__(self, id: str, incremental_strategy: IncrementalStrategy = None): + def __init__( + self, id: str, incremental_strategy: Optional[IncrementalStrategy] = None + ): super().__init__() self.id = id self.transformations: List[Dict[str, Any]] = [] @@ -82,9 +84,9 @@ def consume(self, client: SparkClient) -> DataFrame: def build( self, client: SparkClient, - columns: List[Any] = None, - start_date: str = None, - end_date: str = None, + columns: Optional[List[Any]] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, ) -> None: """Register the data got from the reader in the Spark metastore. 
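# A minimal sketch of how Reader.build is exercised in the tests earlier in
# this patch: build() registers the reader output as a temporary view named
# after the reader id, which a Source query can then select from. The file
# path and column names are illustrative assumptions.
from butterfree.clients import SparkClient
from butterfree.extract.readers import FileReader

spark_client = SparkClient()
reader = FileReader(id="events", path="data/events.parquet", format="parquet")

reader.build(
    client=spark_client,
    columns=[("col1", "new_col1")],  # optional (column, alias) projection
)
events_df = spark_client.sql("select * from events")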
diff --git a/butterfree/extract/readers/table_reader.py b/butterfree/extract/readers/table_reader.py index 343f25f3..b5decfc1 100644 --- a/butterfree/extract/readers/table_reader.py +++ b/butterfree/extract/readers/table_reader.py @@ -1,5 +1,7 @@ """TableSource entity.""" +from typing import Optional + from pyspark.sql import DataFrame from butterfree.clients import SparkClient @@ -44,7 +46,7 @@ class TableReader(Reader): __name__ = "Table Reader" - def __init__(self, id: str, table: str, database: str = None): + def __init__(self, id: str, table: str, database: Optional[str] = None): super().__init__(id) if not isinstance(table, str): raise ValueError( diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 281ed15a..bfc15271 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -1,6 +1,6 @@ """Holds the SourceSelector class.""" -from typing import List +from typing import List, Optional from pyspark.sql import DataFrame @@ -70,7 +70,10 @@ def __init__( self.eager_evaluation = eager_evaluation def construct( - self, client: SparkClient, start_date: str = None, end_date: str = None + self, + client: SparkClient, + start_date: Optional[str] = None, + end_date: Optional[str] = None, ) -> DataFrame: """Construct an entry point dataframe for a feature set. diff --git a/butterfree/hooks/__init__.py b/butterfree/hooks/__init__.py index 90bedeb2..e4a32170 100644 --- a/butterfree/hooks/__init__.py +++ b/butterfree/hooks/__init__.py @@ -1,4 +1,5 @@ """Holds Hooks definitions.""" + from butterfree.hooks.hook import Hook from butterfree.hooks.hookable_component import HookableComponent diff --git a/butterfree/hooks/schema_compatibility/__init__.py b/butterfree/hooks/schema_compatibility/__init__.py index edf748bf..a00adef8 100644 --- a/butterfree/hooks/schema_compatibility/__init__.py +++ b/butterfree/hooks/schema_compatibility/__init__.py @@ -1,4 +1,5 @@ """Holds Schema Compatibility Hooks definitions.""" + from butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook import ( # noqa CassandraTableSchemaCompatibilityHook, ) diff --git a/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py index b08dd56a..eea50c06 100644 --- a/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py +++ b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py @@ -1,5 +1,7 @@ """Spark table schema compatibility Hook definition.""" +from typing import Optional + from pyspark.sql import DataFrame from butterfree.clients import SparkClient @@ -18,7 +20,9 @@ class SparkTableSchemaCompatibilityHook(Hook): database: database name. """ - def __init__(self, spark_client: SparkClient, table: str, database: str = None): + def __init__( + self, spark_client: SparkClient, table: str, database: Optional[str] = None + ): self.spark_client = spark_client self.table_expression = (f"`{database}`." 
if database else "") + f"`{table}`" diff --git a/butterfree/load/processing/__init__.py b/butterfree/load/processing/__init__.py index e2ad5157..06c5cb45 100644 --- a/butterfree/load/processing/__init__.py +++ b/butterfree/load/processing/__init__.py @@ -1,4 +1,5 @@ """Pre Processing Components regarding Readers.""" + from butterfree.load.processing.json_transform import json_transform __all__ = ["json_transform"] diff --git a/butterfree/load/processing/json_transform.py b/butterfree/load/processing/json_transform.py index 19ddecae..598064db 100644 --- a/butterfree/load/processing/json_transform.py +++ b/butterfree/load/processing/json_transform.py @@ -1,4 +1,5 @@ """Json conversion for writers.""" + from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import struct, to_json diff --git a/butterfree/load/sink.py b/butterfree/load/sink.py index 7c0328d6..59b001a5 100644 --- a/butterfree/load/sink.py +++ b/butterfree/load/sink.py @@ -1,4 +1,5 @@ """Holds the Sink class.""" + from typing import List, Optional from pyspark.sql.dataframe import DataFrame diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 1a64afdf..c01fee1d 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Historical Feature Store writer class.""" import os -from typing import Any +from typing import Any, Optional from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import dayofmonth, month, year @@ -106,13 +106,13 @@ class HistoricalFeatureStoreWriter(Writer): def __init__( self, - db_config: AbstractWriteConfig = None, - database: str = None, - num_partitions: int = None, + db_config: Optional[AbstractWriteConfig] = None, + database: Optional[str] = None, + num_partitions: Optional[int] = None, validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD, debug_mode: bool = False, interval_mode: bool = False, - check_schema_hook: Hook = None, + check_schema_hook: Optional[Hook] = None, row_count_validation: bool = True, ): super(HistoricalFeatureStoreWriter, self).__init__( @@ -152,7 +152,8 @@ def write( dataframe = self._apply_transformations(dataframe) if self.interval_mode: - partition_overwrite_mode = spark_client.conn.conf.get( + + partition_overwrite_mode = spark_client.conn.conf.get( # type: ignore "spark.sql.sources.partitionOverwriteMode" ).lower() @@ -249,7 +250,11 @@ def _create_partitions(self, dataframe: DataFrame) -> DataFrame: return repartition_df(dataframe, self.PARTITION_BY, self.num_partitions) def check_schema( - self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + self, + client: Any, + dataframe: DataFrame, + table_name: str, + database: Optional[str] = None, ) -> DataFrame: """Instantiate the schema check hook to check schema between dataframe and database. 
diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index d0bcde94..bce5a375 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Online Feature Store writer class.""" import os -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union from pyspark.sql import DataFrame, Window from pyspark.sql.functions import col, row_number @@ -80,12 +80,12 @@ class OnlineFeatureStoreWriter(Writer): def __init__( self, - db_config: AbstractWriteConfig = None, - database: str = None, - debug_mode: bool = False, - write_to_entity: bool = False, - interval_mode: bool = False, - check_schema_hook: Hook = None, + db_config: Optional[AbstractWriteConfig] = None, + database: Optional[str] = None, + debug_mode: Optional[bool] = False, + write_to_entity: Optional[bool] = False, + interval_mode: Optional[bool] = False, + check_schema_hook: Optional[Hook] = None, ): super(OnlineFeatureStoreWriter, self).__init__( db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity @@ -256,7 +256,11 @@ def get_db_schema(self, feature_set: FeatureSet) -> List[Dict[Any, Any]]: return db_schema def check_schema( - self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + self, + client: Any, + dataframe: DataFrame, + table_name: str, + database: Optional[str] = None, ) -> DataFrame: """Instantiate the schema check hook to check schema between dataframe and database. diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 1dae795c..780b9ec2 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from functools import reduce -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional from pyspark.sql.dataframe import DataFrame @@ -23,10 +23,10 @@ class Writer(ABC, HookableComponent): def __init__( self, db_config: AbstractWriteConfig, - debug_mode: bool = False, - interval_mode: bool = False, - write_to_entity: bool = False, - row_count_validation: bool = True, + debug_mode: Optional[bool] = False, + interval_mode: Optional[bool] = False, + write_to_entity: Optional[bool] = False, + row_count_validation: Optional[bool] = True, ) -> None: super().__init__() self.db_config = db_config @@ -90,7 +90,11 @@ def write( @abstractmethod def check_schema( - self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + self, + client: Any, + dataframe: DataFrame, + table_name: str, + database: Optional[str] = None, ) -> DataFrame: """Instantiate the schema check hook to check schema between dataframe and database. 
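The abstract check_schema signature above defines the contract that both feature store writers implement. A stripped-down sketch of that contract, using hypothetical stand-in classes rather than the real butterfree ones and assuming only what the diff shows (a client, a dataframe, a table name and an optional database):

from abc import ABC, abstractmethod
from typing import Any, Optional


class SchemaCheckingWriter(ABC):
    """Hypothetical stand-in for the Writer ABC in this diff."""

    @abstractmethod
    def check_schema(
        self,
        client: Any,
        dataframe: Any,
        table_name: str,
        database: Optional[str] = None,
    ) -> Any:
        """Check schema compatibility between a dataframe and a target table."""


class LoggingWriter(SchemaCheckingWriter):
    """Toy implementation that only reports which table it would check."""

    def check_schema(
        self,
        client: Any,
        dataframe: Any,
        table_name: str,
        database: Optional[str] = None,
    ) -> Any:
        target = f"{database}.{table_name}" if database else table_name
        print(f"checking schema of {target}")
        return dataframe


LoggingWriter().check_schema(client=None, dataframe=[], table_name="orders")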
diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 468c028e..351a4724 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,8 +1,9 @@ """Migration entity.""" + from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Optional, Set from butterfree.clients import AbstractClient from butterfree.configs.logger import __logger @@ -106,7 +107,10 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: pass def _get_queries( - self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None + self, + schema_diff: Set[Diff], + table_name: str, + write_on_entity: Optional[bool] = None, ) -> Any: """Create the desired queries for migration. @@ -162,8 +166,8 @@ def create_query( self, fs_schema: List[Dict[str, Any]], table_name: str, - db_schema: List[Dict[str, Any]] = None, - write_on_entity: bool = None, + db_schema: Optional[List[Dict[str, Any]]] = None, + write_on_entity: Optional[bool] = None, ) -> Any: """Create a query regarding a data source. @@ -246,7 +250,7 @@ def _get_diff( return schema_diff def _get_schema( - self, table_name: str, database: str = None + self, table_name: str, database: Optional[str] = None ) -> List[Dict[str, Any]]: """Get a table schema in the respective database. diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 8c6c211a..07e2bd89 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -1,6 +1,6 @@ """Metastore Migration entity.""" -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from butterfree.clients import SparkClient from butterfree.configs import environment @@ -32,7 +32,7 @@ class MetastoreMigration(DatabaseMigration): def __init__( self, - database: str = None, + database: Optional[str] = None, ) -> None: self._db_config = MetastoreConfig() self.database = database or environment.get_variable( diff --git a/butterfree/pipelines/__init__.py b/butterfree/pipelines/__init__.py index a868e48f..8bbc5c39 100644 --- a/butterfree/pipelines/__init__.py +++ b/butterfree/pipelines/__init__.py @@ -1,4 +1,5 @@ """ETL Pipelines.""" + from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline __all__ = ["FeatureSetPipeline"] diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index 8aec54ec..8ba1a636 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -1,5 +1,6 @@ """FeatureSetPipeline entity.""" -from typing import List + +from typing import List, Optional from butterfree.clients import SparkClient from butterfree.dataframe_service import repartition_sort_df @@ -135,7 +136,7 @@ def __init__( source: Source, feature_set: FeatureSet, sink: Sink, - spark_client: SparkClient = None, + spark_client: Optional[SparkClient] = None, ): self.source = source self.feature_set = feature_set @@ -190,11 +191,11 @@ def spark_client(self, spark_client: SparkClient) -> None: def run( self, - end_date: str = None, - partition_by: List[str] = None, - order_by: List[str] = None, - num_processors: int = None, 
- start_date: str = None, + end_date: Optional[str] = None, + partition_by: Optional[List[str]] = None, + order_by: Optional[List[str]] = None, + num_processors: Optional[int] = None, + start_date: Optional[str] = None, ) -> None: """Runs the defined feature set pipeline. @@ -243,10 +244,10 @@ def run( def run_for_date( self, - execution_date: str = None, - partition_by: List[str] = None, - order_by: List[str] = None, - num_processors: int = None, + execution_date: Optional[str] = None, + partition_by: Optional[List[str]] = None, + order_by: Optional[List[str]] = None, + num_processors: Optional[int] = None, ) -> None: """Runs the defined feature set pipeline for a specific date. diff --git a/butterfree/reports/__init__.py b/butterfree/reports/__init__.py index 4b57dafc..d272943d 100644 --- a/butterfree/reports/__init__.py +++ b/butterfree/reports/__init__.py @@ -1,4 +1,5 @@ """Reports module.""" + from butterfree.reports.metadata import Metadata __all__ = ["Metadata"] diff --git a/butterfree/testing/dataframe/__init__.py b/butterfree/testing/dataframe/__init__.py index 15481a54..5b465bc6 100644 --- a/butterfree/testing/dataframe/__init__.py +++ b/butterfree/testing/dataframe/__init__.py @@ -1,6 +1,7 @@ """Methods to assert properties regarding Apache Spark Dataframes.""" + from json import dumps -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from pyspark import SparkContext from pyspark.sql import Column, DataFrame, SparkSession @@ -72,7 +73,7 @@ def create_df_from_collection( data: List[Dict[Any, Any]], spark_context: SparkContext, spark_session: SparkSession, - schema: StructType = None, + schema: Optional[StructType] = None, ) -> DataFrame: """Creates a dataframe from a list of dicts.""" return spark_session.read.json( diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index c86a95c3..6706bf8c 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -1,4 +1,5 @@ """AggregatedFeatureSet entity.""" + import itertools from datetime import datetime, timedelta from functools import reduce @@ -261,8 +262,8 @@ def _has_aggregated_transform_only(features: List[Feature]) -> bool: @staticmethod def _build_feature_column_name( feature_column: str, - pivot_value: Union[float, str] = None, - window: Window = None, + pivot_value: Optional[Union[float, str]] = None, + window: Optional[Window] = None, ) -> str: base_name = feature_column if pivot_value is not None: @@ -311,7 +312,7 @@ def with_distinct(self, subset: List, keep: str = "last") -> "AggregatedFeatureS return self def with_windows( - self, definitions: List[str], slide: str = None + self, definitions: List[str], slide: Optional[str] = None ) -> "AggregatedFeatureSet": """Create a list with windows defined.""" self._windows = [ @@ -367,7 +368,7 @@ def _dataframe_join( right: DataFrame, on: List[str], how: str, - num_processors: int = None, + num_processors: Optional[int] = None, ) -> DataFrame: # make both tables co-partitioned to improve join performance left = repartition_df(left, partition_by=on, num_processors=num_processors) @@ -379,7 +380,7 @@ def _aggregate( dataframe: DataFrame, features: List[Feature], window: Optional[Window] = None, - num_processors: int = None, + num_processors: Optional[int] = None, ) -> DataFrame: aggregations = [ c.function for f in features for c in f.transformation.aggregations @@ -512,7 +513,7 @@ def _get_biggest_window_in_days(definitions: List[str]) -> 
float: ) return max(windows_list) / (60 * 60 * 24) - def define_start_date(self, start_date: str = None) -> Optional[str]: + def define_start_date(self, start_date: Optional[str] = None) -> Optional[str]: """Get aggregated feature set start date. Args: @@ -539,9 +540,9 @@ def construct( self, dataframe: DataFrame, client: SparkClient, - end_date: str = None, - num_processors: int = None, - start_date: str = None, + end_date: Optional[str] = None, + num_processors: Optional[int] = None, + start_date: Optional[str] = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. diff --git a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index 469a353a..369eaf29 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -1,4 +1,5 @@ """FeatureSet entity.""" + import itertools from functools import reduce from typing import Any, Dict, List, Optional @@ -389,7 +390,7 @@ def _filter_duplicated_rows(self, df: DataFrame) -> DataFrame: return df.select([column for column in self.columns]) - def define_start_date(self, start_date: str = None) -> Optional[str]: + def define_start_date(self, start_date: Optional[str] = None) -> Optional[str]: """Get feature set start date. Args: @@ -404,9 +405,9 @@ def construct( self, dataframe: DataFrame, client: SparkClient, - end_date: str = None, - num_processors: int = None, - start_date: str = None, + end_date: Optional[str] = None, + num_processors: Optional[int] = None, + start_date: Optional[str] = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. diff --git a/butterfree/transform/features/feature.py b/butterfree/transform/features/feature.py index 612fc4a2..cfd8a2f6 100644 --- a/butterfree/transform/features/feature.py +++ b/butterfree/transform/features/feature.py @@ -1,6 +1,7 @@ """Feature entity.""" + import warnings -from typing import Any, List +from typing import Any, List, Optional from pyspark.sql import DataFrame from pyspark.sql.functions import col @@ -41,9 +42,9 @@ def __init__( self, name: str, description: str, - dtype: DataType = None, - from_column: str = None, - transformation: TransformComponent = None, + dtype: Optional[DataType] = None, + from_column: Optional[str] = None, + transformation: Optional[TransformComponent] = None, ) -> None: self.name = name self.description = description diff --git a/butterfree/transform/features/key_feature.py b/butterfree/transform/features/key_feature.py index a7ad350c..74626d6f 100644 --- a/butterfree/transform/features/key_feature.py +++ b/butterfree/transform/features/key_feature.py @@ -1,5 +1,7 @@ """KeyFeature entity.""" +from typing import Optional + from butterfree.constants.data_type import DataType from butterfree.transform.features.feature import Feature from butterfree.transform.transformations import TransformComponent @@ -31,8 +33,8 @@ def __init__( name: str, description: str, dtype: DataType, - from_column: str = None, - transformation: TransformComponent = None, + from_column: Optional[str] = None, + transformation: Optional[TransformComponent] = None, ) -> None: super(KeyFeature, self).__init__( name=name, diff --git a/butterfree/transform/features/timestamp_feature.py b/butterfree/transform/features/timestamp_feature.py index b131eaee..aa30dfc4 100644 --- a/butterfree/transform/features/timestamp_feature.py +++ b/butterfree/transform/features/timestamp_feature.py @@ -1,4 +1,7 @@ """TimestampFeature entity.""" + +from typing import Optional + from pyspark.sql import DataFrame 
from pyspark.sql.functions import to_timestamp @@ -38,10 +41,10 @@ class TimestampFeature(Feature): def __init__( self, - from_column: str = None, - transformation: TransformComponent = None, + from_column: Optional[str] = None, + transformation: Optional[TransformComponent] = None, from_ms: bool = False, - mask: str = None, + mask: Optional[str] = None, ) -> None: description = "Time tag for the state of all features." super(TimestampFeature, self).__init__( @@ -70,7 +73,7 @@ def transform(self, dataframe: DataFrame) -> DataFrame: ts_column = ts_column / 1000 dataframe = dataframe.withColumn( - column_name, to_timestamp(ts_column, self.mask) + column_name, to_timestamp(ts_column, self.mask) # type: ignore ) return super().transform(dataframe) diff --git a/butterfree/transform/transformations/aggregated_transform.py b/butterfree/transform/transformations/aggregated_transform.py index a9581ef0..406ca72a 100644 --- a/butterfree/transform/transformations/aggregated_transform.py +++ b/butterfree/transform/transformations/aggregated_transform.py @@ -1,6 +1,7 @@ """Aggregated Transform entity.""" + from collections import namedtuple -from typing import List, Tuple +from typing import List, Optional, Tuple from pyspark.sql import DataFrame from pyspark.sql.functions import col, expr, when @@ -56,7 +57,9 @@ class AggregatedTransform(TransformComponent): NotImplementedError: ... """ - def __init__(self, functions: List[Function], filter_expression: str = None): + def __init__( + self, functions: List[Function], filter_expression: Optional[str] = None + ): super(AggregatedTransform, self).__init__() self.functions = functions self.filter_expression = filter_expression diff --git a/butterfree/transform/transformations/custom_transform.py b/butterfree/transform/transformations/custom_transform.py index 7860fdc2..a1231012 100644 --- a/butterfree/transform/transformations/custom_transform.py +++ b/butterfree/transform/transformations/custom_transform.py @@ -69,7 +69,7 @@ def transformer(self) -> Callable[..., Any]: @transformer.setter def transformer(self, method: Callable[..., Any]) -> None: - if not method: + if method is None: raise ValueError("A method must be provided to CustomTransform") self._transformer = method diff --git a/butterfree/transform/transformations/spark_function_transform.py b/butterfree/transform/transformations/spark_function_transform.py index 8fb24dd7..34384518 100644 --- a/butterfree/transform/transformations/spark_function_transform.py +++ b/butterfree/transform/transformations/spark_function_transform.py @@ -1,5 +1,6 @@ """Spark Function Transform entity.""" -from typing import Any, List + +from typing import Any, List, Optional from pyspark.sql import DataFrame @@ -87,8 +88,8 @@ def with_window( self, partition_by: str, window_definition: List[str], - order_by: str = None, - mode: str = None, + order_by: Optional[str] = None, + mode: Optional[str] = None, ) -> "SparkFunctionTransform": """Create a list with windows defined.""" if mode is not None: @@ -103,7 +104,9 @@ def with_window( ] return self - def _get_output_name(self, function: object, window: Window = None) -> str: + def _get_output_name( + self, function: object, window: Optional[Window] = None + ) -> str: base_name = ( "__".join([self._parent.name, function.__name__]) if hasattr(function, "__name__") diff --git a/butterfree/transform/transformations/transform_component.py b/butterfree/transform/transformations/transform_component.py index 7ecec332..94bc19f8 100644 --- 
a/butterfree/transform/transformations/transform_component.py +++ b/butterfree/transform/transformations/transform_component.py @@ -1,4 +1,5 @@ """Transform Abstract Class.""" + from abc import ABC, abstractmethod from typing import Any, List diff --git a/butterfree/transform/transformations/user_defined_functions/mode.py b/butterfree/transform/transformations/user_defined_functions/mode.py index 65790b93..5b6c7f17 100644 --- a/butterfree/transform/transformations/user_defined_functions/mode.py +++ b/butterfree/transform/transformations/user_defined_functions/mode.py @@ -1,4 +1,5 @@ """Method to compute mode aggregation.""" + import pandas as pd from pyspark.sql.functions import pandas_udf from pyspark.sql.types import StringType diff --git a/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py b/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py index 20ccd3ba..6dd6779f 100644 --- a/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py +++ b/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py @@ -1,4 +1,5 @@ """Method to compute most frequent set aggregation.""" + from typing import Any import pandas as pd diff --git a/butterfree/transform/utils/__init__.py b/butterfree/transform/utils/__init__.py index abf7ed3f..66004a37 100644 --- a/butterfree/transform/utils/__init__.py +++ b/butterfree/transform/utils/__init__.py @@ -1,4 +1,5 @@ """This module holds utils to be used by transformations.""" + from butterfree.transform.utils.function import Function from butterfree.transform.utils.window_spec import Window diff --git a/butterfree/transform/utils/date_range.py b/butterfree/transform/utils/date_range.py index 78e0e6e3..4bdd2977 100644 --- a/butterfree/transform/utils/date_range.py +++ b/butterfree/transform/utils/date_range.py @@ -1,7 +1,7 @@ """Utils for date range generation.""" from datetime import datetime -from typing import Union +from typing import Optional, Union from pyspark.sql import DataFrame, functions @@ -14,7 +14,7 @@ def get_date_range( client: SparkClient, start_date: Union[str, datetime], end_date: Union[str, datetime], - step: int = None, + step: Optional[int] = None, ) -> DataFrame: """Create a date range dataframe. 
@@ -44,7 +44,7 @@ def get_date_range( for c in ("start_date", "end_date") ] ) - start_date, end_date = date_df.first() + start_date, end_date = date_df.first() # type: ignore return client.conn.range( start_date, end_date + day_in_seconds, step # type: ignore ).select(functions.col("id").cast(DataType.TIMESTAMP.spark).alias(TIMESTAMP_COLUMN)) diff --git a/butterfree/transform/utils/function.py b/butterfree/transform/utils/function.py index fcf6679f..951a232c 100644 --- a/butterfree/transform/utils/function.py +++ b/butterfree/transform/utils/function.py @@ -32,9 +32,9 @@ def func(self) -> Callable: @func.setter def func(self, value: Callable) -> None: """Definitions to be used in the transformation.""" - if not value: + if value is None: raise ValueError("Function must not be empty.") - if not callable(value): + if callable(value) is False: raise TypeError("Function must be callable.") self._func = value diff --git a/butterfree/transform/utils/window_spec.py b/butterfree/transform/utils/window_spec.py index 53ecd2fd..b95dd73a 100644 --- a/butterfree/transform/utils/window_spec.py +++ b/butterfree/transform/utils/window_spec.py @@ -1,4 +1,5 @@ """Holds function for defining window in DataFrames.""" + from typing import Any, List, Optional, Union from pyspark import sql @@ -69,8 +70,8 @@ def __init__( window_definition: str, partition_by: Optional[Union[Column, str, List[str]]] = None, order_by: Optional[Union[Column, str]] = None, - mode: str = None, - slide: str = None, + mode: Optional[str] = None, + slide: Optional[str] = None, ): self.partition_by = partition_by self.order_by = order_by or TIMESTAMP_COLUMN diff --git a/butterfree/validations/basic_validaton.py b/butterfree/validations/basic_validaton.py index d3a5558c..01bc9ec2 100644 --- a/butterfree/validations/basic_validaton.py +++ b/butterfree/validations/basic_validaton.py @@ -1,5 +1,7 @@ """Validation implementing basic checks over the dataframe.""" +from typing import Optional + from pyspark.sql.dataframe import DataFrame from butterfree.constants.columns import TIMESTAMP_COLUMN @@ -14,7 +16,7 @@ class BasicValidation(Validation): """ - def __init__(self, dataframe: DataFrame = None): + def __init__(self, dataframe: Optional[DataFrame] = None): super().__init__(dataframe) def check(self) -> None: diff --git a/butterfree/validations/validation.py b/butterfree/validations/validation.py index 9915906c..551859d8 100644 --- a/butterfree/validations/validation.py +++ b/butterfree/validations/validation.py @@ -1,5 +1,7 @@ """Abstract Validation class.""" + from abc import ABC, abstractmethod +from typing import Optional from pyspark.sql.dataframe import DataFrame @@ -12,7 +14,7 @@ class Validation(ABC): """ - def __init__(self, dataframe: DataFrame = None): + def __init__(self, dataframe: Optional[DataFrame] = None): self.dataframe = dataframe def input(self, dataframe: DataFrame) -> "Validation": diff --git a/docs/source/conf.py b/docs/source/conf.py index 77fdc125..0a537739 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,5 @@ """Sphinx Configuration.""" + # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. 
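On the validation changes in custom_transform.py and function.py above, where `if not method:` and `if not value:` become identity checks against None: a truthiness test conflates a missing argument with one that is merely falsy, so a perfectly valid callable could be rejected. A standalone illustration (the class below is made up for the example, not library code):

class CountingAggregator:
    """Callable object that is falsy until it has been invoked at least once."""

    def __init__(self) -> None:
        self.calls = 0

    def __call__(self, value: float) -> float:
        self.calls += 1
        return value * 2

    def __len__(self) -> int:
        return self.calls


func = CountingAggregator()
assert callable(func) and not func  # falsy, yet a valid transformation function
assert func is not None             # the check the patch switches to
print(func(21.0))                   # 42.0

The `callable(value) is False` form in the Function setter is behaviourally equivalent to the previous `not callable(value)`; the substantive change is the `is None` check on the line above it.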
diff --git a/mypy.ini b/mypy.ini index fc293149..eb867a47 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,42 +9,3 @@ show_error_codes = True show_error_context = True disable_error_code = attr-defined, list-item, operator pretty = True - -[mypy-butterfree.pipelines.*] -ignore_errors = True - -[mypy-butterfree.load.*] -ignore_errors = True - -[mypy-butterfree.transform.*] -ignore_errors = True - -[mypy-butterfree.extract.*] -ignore_errors = True - -[mypy-butterfree.config.*] -ignore_errors = True - -[mypy-butterfree.clients.*] -ignore_errors = True - -[mypy-butterfree.configs.*] -ignore_errors = True - -[mypy-butterfree.dataframe_service.*] -ignore_errors = True - -[mypy-butterfree.validations.*] -ignore_errors = True - -[mypy-butterfree.migrations.*] -ignore_errors = True - -[mypy-butterfree.testing.*] -ignore_errors = True - -[mypy-butterfree.hooks.*] -ignore_errors = True - -[mypy-butterfree._cli.*] -ignore_errors = True diff --git a/requirements.lint.txt b/requirements.lint.txt index 66641a95..1ad6499d 100644 --- a/requirements.lint.txt +++ b/requirements.lint.txt @@ -1,4 +1,4 @@ -black==21.12b0 +black==24.3.0 flake8==4.0.1 flake8-isort==4.1.1 flake8-docstrings==1.5.0 diff --git a/requirements.txt b/requirements.txt index f3af4254..0af8a62a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,6 @@ mdutils>=1.2.2,<2.0 pandas>=0.24,<2.0 parameters-validation>=1.1.5,<2.0 pyspark==3.5.1 -typer==0.3.2 +typer==0.4.2 typing-extensions>3.7.4,<5 boto3==1.17.* diff --git a/tests/integration/butterfree/pipelines/conftest.py b/tests/integration/butterfree/pipelines/conftest.py index 5f304972..1466a8d9 100644 --- a/tests/integration/butterfree/pipelines/conftest.py +++ b/tests/integration/butterfree/pipelines/conftest.py @@ -139,7 +139,10 @@ def feature_set_pipeline( feature_set_pipeline = FeatureSetPipeline( source=Source( readers=[ - TableReader(id="b_source", table="b_table",).with_incremental_strategy( + TableReader( + id="b_source", + table="b_table", + ).with_incremental_strategy( incremental_strategy=IncrementalStrategy(column="timestamp") ), ], diff --git a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py index f0ae2f85..96ff682a 100644 --- a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py @@ -67,7 +67,7 @@ def test_blank_aggregation(self, feature_set_dataframe): name="feature1", description="unit test", transformation=AggregatedTransform( - functions=[Function(func="", data_type="")] + functions=[Function(func=None, data_type="")] ), ) From 2a5a6e8be2fa5493da2666261c4af9a304bbdef7 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 14 Jun 2024 10:52:08 -0300 Subject: [PATCH 62/86] feat(MLOP-2236): add NTZ (#360) * feat: NTZ and new tests --- butterfree/constants/data_type.py | 2 + .../transform/features/timestamp_feature.py | 3 +- .../pipelines/test_feature_set_pipeline.py | 2 +- .../features/test_timestamp_feature.py | 80 ++++++++++++++++++- 4 files changed, 84 insertions(+), 3 deletions(-) diff --git a/butterfree/constants/data_type.py b/butterfree/constants/data_type.py index e99525f7..6166f1fc 100644 --- a/butterfree/constants/data_type.py +++ b/butterfree/constants/data_type.py @@ -12,6 +12,7 @@ IntegerType, LongType, StringType, + TimestampNTZType, TimestampType, ) from typing_extensions import final @@ -21,6 +22,7 @@ class DataType(Enum): """Holds 
constants for data types within Butterfree.""" + TIMESTAMP_NTZ = (TimestampNTZType(), "timestamp", "TIMESTAMP_NTZ") TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP") BINARY = (BinaryType(), "boolean", "BINARY") BOOLEAN = (BooleanType(), "boolean", "BOOLEAN") diff --git a/butterfree/transform/features/timestamp_feature.py b/butterfree/transform/features/timestamp_feature.py index aa30dfc4..b4aee71e 100644 --- a/butterfree/transform/features/timestamp_feature.py +++ b/butterfree/transform/features/timestamp_feature.py @@ -41,6 +41,7 @@ class TimestampFeature(Feature): def __init__( self, + dtype: Optional[DataType] = DataType.TIMESTAMP, from_column: Optional[str] = None, transformation: Optional[TransformComponent] = None, from_ms: bool = False, @@ -51,7 +52,7 @@ def __init__( name=TIMESTAMP_COLUMN, description=description, from_column=from_column, - dtype=DataType.TIMESTAMP, + dtype=dtype, transformation=transformation, ) self.from_ms = from_ms diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index 79125339..16eb08e2 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -50,7 +50,7 @@ def create_temp_view(dataframe: DataFrame, name): def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): - spark.sql(f"drop schema {table_reader_db} cascade") + spark.sql(f"drop schema if exists {table_reader_db} cascade") spark.sql(f"create database {table_reader_db}") spark.sql(f"use {table_reader_db}") spark.sql( diff --git a/tests/unit/butterfree/transform/features/test_timestamp_feature.py b/tests/unit/butterfree/transform/features/test_timestamp_feature.py index a5a688c2..42ab40a2 100644 --- a/tests/unit/butterfree/transform/features/test_timestamp_feature.py +++ b/tests/unit/butterfree/transform/features/test_timestamp_feature.py @@ -1,18 +1,26 @@ -from pyspark.sql.types import StringType +from datetime import datetime +import pytz +from pyspark.sql.types import StringType, StructField, StructType + +from butterfree.clients import SparkClient from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN from butterfree.transform.features import TimestampFeature +# from pyspark.sql.types import * + class TestTimestampFeature: def test_args_without_transformation(self): test_key = TimestampFeature(from_column="ts") + test_key_ntz = TimestampFeature(dtype=DataType.TIMESTAMP_NTZ, from_column="ts") assert test_key.name == TIMESTAMP_COLUMN assert test_key.from_column == "ts" assert test_key.dtype == DataType.TIMESTAMP + assert test_key_ntz.dtype == DataType.TIMESTAMP_NTZ def test_transform(self, feature_set_dataframe): @@ -70,3 +78,73 @@ def test_transform_mask(self, feature_set_dataframe_date): assert df[0]["timestamp"] == "2020-02-07 00:00:00" assert df[1]["timestamp"] == "2020-02-08 00:00:00" + + def test_timezone_configs(self): + + spark = SparkClient() + now = datetime.now() + + # Testing a new timezone + spark.conn.conf.set("spark.sql.session.timeZone", "GMT-5") + + time_list = [(now, now)] + rdd = spark.conn.sparkContext.parallelize(time_list) + + schema = StructType( + [ + StructField("ts", DataType.TIMESTAMP.spark, True), + StructField("ts_ntz", DataType.TIMESTAMP_NTZ.spark, True), + ] + ) + df = spark.conn.createDataFrame(rdd, schema) + df.createOrReplaceTempView("temp_tz_table") + + df1 = spark.conn.sql("""SELECT ts, 
ts_ntz FROM temp_tz_table""") + df2 = df1.withColumns( + {"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())} + ) + df2_vals = df2.collect()[0] + + assert df2_vals.ts != df2_vals.ts_ntz + + # New TZ. Column with TZ must have a != value; Column NTZ must keep its value + spark.conn.conf.set("spark.sql.session.timeZone", "GMT-7") + + df3 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""") + df4 = df3.withColumns( + {"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())} + ) + df4_vals = df4.collect()[0] + + assert df4_vals.ts != df2_vals.ts + assert df4_vals.ts_ntz == df2_vals.ts_ntz + + def test_timezone(self): + + spark = SparkClient() + + my_date = datetime.now(pytz.timezone("US/Pacific")) + + datetime_mask = "%Y-%m-%d %H:%M" + + data = [ + {"id": 1, TIMESTAMP_COLUMN: str(my_date), "feature": 100}, + {"id": 2, TIMESTAMP_COLUMN: str(my_date), "feature": 200}, + ] + + df = spark.conn.read.json(spark.conn._sc.parallelize(data, 1)) + df.createOrReplaceTempView("time_table") + + df2 = spark.sql("SELECT TIMESTAMP AS ts FROM time_table") + + time_value = datetime.fromisoformat(df2.collect()[0].ts).strftime(datetime_mask) + + df_different_timezone = df2.withColumn( + "ts", df2.ts.cast(DataType.TIMESTAMP.spark) + ) + df_no_timezone = df2.withColumn("ts", df2.ts.cast(DataType.TIMESTAMP_NTZ.spark)) + + assert ( + df_different_timezone.collect()[0].ts.strftime(datetime_mask) != time_value + ) + assert df_no_timezone.collect()[0].ts.strftime(datetime_mask) == time_value From 6363e03a2d4e8ce4c21102ba1814515fe672029d Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 20 Jun 2024 10:46:15 -0300 Subject: [PATCH 63/86] fix: cassandra configs (#364) * fix: to lower case * pin numpy --- butterfree/configs/db/cassandra_config.py | 30 ++++++++++++----------- requirements.txt | 1 + 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index d60bb697..d576359c 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -228,26 +228,28 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ cassandra_mapping = { - "TimestampType": "timestamp", - "BinaryType": "boolean", - "BooleanType": "boolean", - "DateType": "timestamp", - "DecimalType": "decimal", - "DoubleType": "double", - "FloatType": "float", - "IntegerType": "int", - "LongType": "bigint", - "StringType": "text", - "ArrayType(LongType,true)": "frozen>", - "ArrayType(StringType,true)": "frozen>", - "ArrayType(FloatType,true)": "frozen>", + "timestamptype": "timestamp", + "binarytype": "boolean", + "booleantype": "boolean", + "datetype": "timestamp", + "decimaltype": "decimal", + "doubletype": "double", + "floattype": "float", + "integertype": "int", + "longtype": "bigint", + "stringtype": "text", + "arraytype(longtype,true)": "frozen>", + "arraytype(stringtype,true)": "frozen>", + "arraytype(floattype,true)": "frozen>", } cassandra_schema = [] for features in schema: cassandra_schema.append( { "column_name": features["column_name"], - "type": cassandra_mapping[str(features["type"]).replace("()", "")], + "type": cassandra_mapping[ + str(features["type"]).replace("()", "").lower() + ], "primary_key": features["primary_key"], } ) diff --git a/requirements.txt b/requirements.txt index 0af8a62a..f3968c60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ pyspark==3.5.1 typer==0.4.2 typing-extensions>3.7.4,<5 boto3==1.17.* 
+numpy==1.26.4 From 81c2c178444620b1f1576390ffa13a2be242068a Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 28 Jun 2024 10:33:10 -0300 Subject: [PATCH 64/86] fix: Cassandra config keys (#366) * fix: keys --- butterfree/configs/db/cassandra_config.py | 6 +++--- .../migrations/database_migration/conftest.py | 14 +++++++++++++- .../database_migration/test_cassandra_migration.py | 2 ++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index d576359c..6d7f9a20 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -238,9 +238,9 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: "integertype": "int", "longtype": "bigint", "stringtype": "text", - "arraytype(longtype,true)": "frozen>", - "arraytype(stringtype,true)": "frozen>", - "arraytype(floattype,true)": "frozen>", + "arraytype(longtype, true)": "frozen>", + "arraytype(stringtype, true)": "frozen>", + "arraytype(floattype, true)": "frozen>", } cassandra_schema = [] for features in schema: diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index 237158b7..3d3662d8 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -1,4 +1,11 @@ -from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType +from pyspark.sql.types import ( + ArrayType, + DoubleType, + FloatType, + LongType, + StringType, + TimestampType, +) from pytest import fixture from butterfree.constants import DataType @@ -30,6 +37,11 @@ def fs_schema(): {"column_name": "id", "type": LongType(), "primary_key": True}, {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, + { + "column_name": "array_feature", + "type": ArrayType(StringType(), True), + "primary_key": False, + }, { "column_name": "feature1__avg_over_1_week_rolling_windows", "type": FloatType(), diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py index 5666cc47..5e89b65b 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -33,9 +33,11 @@ def test_create_table_query(self, fs_schema): expected_query = [ "CREATE TABLE test.table_name " "(id LongType, timestamp TimestampType, new_feature FloatType, " + "array_feature ArrayType(StringType(), True), " "feature1__avg_over_1_week_rolling_windows FloatType, " "PRIMARY KEY (id, timestamp));" ] + query = cassandra_migration.create_query(fs_schema, "table_name") assert query, expected_query From b1949cd3fa4da68595442d28328819d2196e9ddc Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 28 Jun 2024 16:44:07 -0300 Subject: [PATCH 65/86] fix: new type (#368) --- butterfree/configs/db/cassandra_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index 6d7f9a20..919fee8e 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -229,6 +229,7 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, 
Any]]: """ cassandra_mapping = { "timestamptype": "timestamp", + "timestampntztype": "timestamp", "binarytype": "boolean", "booleantype": "boolean", "datetype": "timestamp", From 12d5e982474ad36fd2acfa1594a1535b894fc451 Mon Sep 17 00:00:00 2001 From: Fernando Barbosa Date: Fri, 16 Aug 2024 12:00:45 -0300 Subject: [PATCH 66/86] Delete .checklist.yaml (#371) --- .checklist.yaml | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 .checklist.yaml diff --git a/.checklist.yaml b/.checklist.yaml deleted file mode 100644 index f0c21171..00000000 --- a/.checklist.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: quintoandar.com.br/checklist/v2 -kind: ServiceChecklist -metadata: - name: butterfree -spec: - description: >- - A solution for Feature Stores. - - costCenter: C055 - department: engineering - lifecycle: production - docs: true - - ownership: - team: data_products_mlops - line: tech_platform - owner: otavio.cals@quintoandar.com.br - - libraries: - - name: butterfree - type: common-usage - path: https://quintoandar.github.io/python-package-server/ - description: A lib to build Feature Stores. - registries: - - github-packages - tier: T0 - - channels: - squad: 'mlops' - alerts: 'data-products-reports' From 35dd929a9fb64a08a76edfd6f652970d371e0d29 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 19 Aug 2024 10:57:23 -0300 Subject: [PATCH 67/86] Add Delta support (#370) * feat: delta --- .github/workflows/publish.yml | 4 +- .github/workflows/skip_lint.yml | 17 -- .github/workflows/staging.yml | 3 +- .github/workflows/test.yml | 2 +- CHANGELOG.md | 22 +++ Makefile | 3 +- butterfree/clients/spark_client.py | 1 + butterfree/load/writers/__init__.py | 3 +- butterfree/load/writers/delta_writer.py | 162 ++++++++++++++++++ .../historical_feature_store_writer.py | 27 ++- butterfree/load/writers/writer.py | 2 + docs/source/butterfree.clients.rst | 6 + docs/source/butterfree.configs.db.rst | 8 + docs/source/butterfree.configs.rst | 29 ++++ docs/source/butterfree.constants.rst | 65 +++++++ .../butterfree.extract.pre_processing.rst | 10 ++ docs/source/butterfree.extract.readers.rst | 8 + docs/source/butterfree.extract.rst | 2 + docs/source/butterfree.load.processing.rst | 2 + docs/source/butterfree.load.rst | 2 + docs/source/butterfree.load.writers.rst | 6 + docs/source/butterfree.pipelines.rst | 2 + docs/source/butterfree.reports.rst | 2 + docs/source/butterfree.transform.features.rst | 6 + docs/source/butterfree.transform.rst | 4 + .../butterfree.transform.transformations.rst | 14 ++ ...transformations.user_defined_functions.rst | 4 + docs/source/butterfree.transform.utils.rst | 6 + docs/source/butterfree.validations.rst | 4 + mypy.ini | 39 +++++ requirements.txt | 1 + setup.py | 2 +- .../load/writers/test_delta_writer.py | 83 +++++++++ .../test_historical_feature_store_writer.py | 25 +++ tests/unit/butterfree/transform/conftest.py | 38 ++-- 35 files changed, 560 insertions(+), 54 deletions(-) delete mode 100644 .github/workflows/skip_lint.yml create mode 100644 butterfree/load/writers/delta_writer.py create mode 100644 tests/unit/butterfree/load/writers/test_delta_writer.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d33e4aa0..8b4d9c73 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -4,11 +4,9 @@ on: paths: - 'setup.py' - jobs: Pipeline: if: github.ref == 'refs/heads/master' - runs-on: ubuntu-latest steps: @@ -19,7 +17,7 @@ jobs: - uses: actions/setup-java@v4 with: - java-version: '11' + 
java-version: '17' distribution: microsoft - uses: vemonet/setup-spark@v1 diff --git a/.github/workflows/skip_lint.yml b/.github/workflows/skip_lint.yml deleted file mode 100644 index 1c768a23..00000000 --- a/.github/workflows/skip_lint.yml +++ /dev/null @@ -1,17 +0,0 @@ -# This step is used only because we want to mark the runner-linter check as required -# for PRs to develop, but not for the merge queue to merge into develop, -# github does not have this functionality yet - -name: 'Skip github-actions/runner-linter check at merge queue' - -on: - merge_group: - -jobs: - empty_job: - name: 'github-actions/runner-linter' - runs-on: github-actions-developers-runner - steps: - - name: Skip github-actions/runner-linter check at merge queue - run: | - echo "Done" diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 573049ca..9885ba68 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -7,7 +7,6 @@ on: jobs: Pipeline: if: github.ref == 'refs/heads/staging' - runs-on: ubuntu-latest steps: @@ -18,7 +17,7 @@ jobs: - uses: actions/setup-java@v4 with: - java-version: '11' + java-version: '17' distribution: microsoft - uses: vemonet/setup-spark@v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d588c853..96ad666f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,7 +19,7 @@ jobs: - uses: actions/setup-java@v4 with: - java-version: '11' + java-version: '17' distribution: microsoft - uses: vemonet/setup-spark@v1 diff --git a/CHANGELOG.md b/CHANGELOG.md index ad9f4863..fe9f9a8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.3.5](https://github.com/quintoandar/butterfree/releases/tag/1.3.5) +* Auto create feature sets ([#368](https://github.com/quintoandar/butterfree/pull/368)) + +## [1.3.4](https://github.com/quintoandar/butterfree/releases/tag/1.3.4) +* Fix Cassandra Config and tests ([#366](https://github.com/quintoandar/butterfree/pull/366)) + +## [1.3.3](https://github.com/quintoandar/butterfree/releases/tag/1.3.3) +* Fix Cassandra Config and Numpy version ([#364](https://github.com/quintoandar/butterfree/pull/364)) + +## [1.3.2](https://github.com/quintoandar/butterfree/releases/tag/1.3.2) +* Fix publish script ([#362](https://github.com/quintoandar/butterfree/pull/362)) + +## [1.3.2](https://github.com/quintoandar/butterfree/releases/tag/1.3.2) +* Fix publish script ([#360](https://github.com/quintoandar/butterfree/pull/362)) + +## [1.3.1](https://github.com/quintoandar/butterfree/releases/tag/1.3.1) +* Timestamp NTZ available ([#360](https://github.com/quintoandar/butterfree/pull/360)) + +## [1.3.0](https://github.com/quintoandar/butterfree/releases/tag/1.3.0) +* Bump versions ([#355](https://github.com/quintoandar/butterfree/pull/355)) +* Sphinx version ([#356](https://github.com/quintoandar/butterfree/pull/356)) + ## [1.2.4](https://github.com/quintoandar/butterfree/releases/tag/1.2.4) * Auto create feature sets ([#351](https://github.com/quintoandar/butterfree/pull/351)) diff --git a/Makefile b/Makefile index bf9ccd64..a93104ab 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ minimum-requirements: .PHONY: requirements ## install all requirements -requirements: requirements-test requirements-lint dev-requirements minimum-requirements +requirements: minimum-requirements dev-requirements requirements-test requirements-lint .PHONY: ci-install ci-install: @@ -146,6 
+146,7 @@ package-name: .PHONY: package ## build butterfree package wheel package: + @PYTHONPATH=. pip3 install wheel @PYTHONPATH=. python -m setup sdist bdist_wheel .PHONY: update-docs diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 933c2165..f4b6ea65 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -30,6 +30,7 @@ def conn(self) -> SparkSession: """ if not self._session: self._session = SparkSession.builder.getOrCreate() + return self._session def read( diff --git a/butterfree/load/writers/__init__.py b/butterfree/load/writers/__init__.py index 72945d27..f1f0e449 100644 --- a/butterfree/load/writers/__init__.py +++ b/butterfree/load/writers/__init__.py @@ -1,8 +1,9 @@ """Holds data loaders for historical and online feature store.""" +from butterfree.load.writers.delta_writer import DeltaWriter from butterfree.load.writers.historical_feature_store_writer import ( HistoricalFeatureStoreWriter, ) from butterfree.load.writers.online_feature_store_writer import OnlineFeatureStoreWriter -__all__ = ["HistoricalFeatureStoreWriter", "OnlineFeatureStoreWriter"] +__all__ = ["HistoricalFeatureStoreWriter", "OnlineFeatureStoreWriter", "DeltaWriter"] diff --git a/butterfree/load/writers/delta_writer.py b/butterfree/load/writers/delta_writer.py new file mode 100644 index 00000000..933f1adb --- /dev/null +++ b/butterfree/load/writers/delta_writer.py @@ -0,0 +1,162 @@ +from delta.tables import DeltaTable +from pyspark.sql.dataframe import DataFrame + +from butterfree.clients import SparkClient +from butterfree.configs.logger import __logger + +logger = __logger("delta_writer", True) + + +class DeltaWriter: + """Control operations on Delta Tables. + + Resposible for merging and optimizing. + """ + + @staticmethod + def _get_full_table_name(table, database): + if database: + return "{}.{}".format(database, table) + else: + return table + + @staticmethod + def _convert_to_delta(client: SparkClient, table: str): + logger.info(f"Converting {table} to Delta...") + client.conn.sql(f"CONVERT TO DELTA {table}") + logger.info("Conversion done.") + + @staticmethod + def merge( + client: SparkClient, + database: str, + table: str, + merge_on: list, + source_df: DataFrame, + when_not_matched_insert_condition: str = None, + when_matched_update_condition: str = None, + when_matched_delete_condition: str = None, + ): + """ + Merge a source dataframe to a Delta table. + + By default, it will update when matched, and insert when + not matched (simple upsert). + + You can change this behavior by setting: + - when_not_matched_insert_condition: it will only insert + when this specified condition is true + - when_matched_update_condition: it will only update when this + specified condition is true. You can refer to the columns + in the source dataframe as source., and the columns + in the target table as target.. + - when_matched_delete_condition: it will add an operation to delete, + but only if this condition is true. Again, source and + target dataframe columns can be referred to respectively as + source. and target. 
+ """ + try: + full_table_name = DeltaWriter._get_full_table_name(table, database) + + table_exists = client.conn.catalog.tableExists(full_table_name) + + if table_exists: + pd_df = client.conn.sql( + f"DESCRIBE TABLE EXTENDED {full_table_name}" + ).toPandas() + provider = ( + pd_df.reset_index() + .groupby(["col_name"])["data_type"] + .aggregate("first") + .Provider + ) + table_is_delta = provider.lower() == "delta" + + if not table_is_delta: + DeltaWriter()._convert_to_delta(client, full_table_name) + + # For schema evolution + client.conn.conf.set( + "spark.databricks.delta.schema.autoMerge.enabled", "true" + ) + + target_table = DeltaTable.forName(client.conn, full_table_name) + join_condition = " AND ".join( + [f"source.{col} = target.{col}" for col in merge_on] + ) + merge_builder = target_table.alias("target").merge( + source_df.alias("source"), join_condition + ) + if when_matched_delete_condition: + merge_builder = merge_builder.whenMatchedDelete( + condition=when_matched_delete_condition + ) + + merge_builder.whenMatchedUpdateAll( + condition=when_matched_update_condition + ).whenNotMatchedInsertAll( + condition=when_not_matched_insert_condition + ).execute() + except Exception as e: + logger.error(f"Merge operation on {full_table_name} failed: {e}") + + @staticmethod + def vacuum(table: str, retention_hours: int, client: SparkClient): + """Vacuum a Delta table. + + Vacuum remove unused files (files not managed by Delta + files + that are not in the latest state). + After vacuum it's impossible to time travel to versions + older than the `retention` time. + Default retention is 7 days. Lower retentions will be warned, + unless it's set to false. + Set spark.databricks.delta.retentionDurationCheck.enabled + to false for low retentions. + https://docs.databricks.com/en/sql/language-manual/delta-vacuum.html + """ + + command = f"VACUUM {table} RETAIN {retention_hours} HOURS" + logger.info(f"Running vacuum with command {command}") + client.conn.sql(command) + logger.info(f"Vacuum successful for table {table}") + + @staticmethod + def optimize( + client: SparkClient, + table: str = None, + z_order: list = None, + date_column: str = "timestamp", + from_date: str = None, + auto_compact: bool = False, + optimize_write: bool = False, + ): + """Optimize a Delta table. + + For auto-compaction and optimize write DBR >= 14.3 LTS + and Delta >= 3.1.0 are MANDATORY. + For z-ordering DBR >= 13.3 LTS and Delta >= 2.0.0 are MANDATORY. + Auto-compaction (recommended) reduces the small file problem + (overhead due to lots of metadata). + Z-order by columns that is commonly used in queries + predicates and has a high cardinality. 
+ https://docs.delta.io/latest/optimizations-oss.html + """ + + if auto_compact: + client.conf.set("spark.databricks.delta.autoCompact.enabled", "true") + + if optimize_write: + client.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true") + + if table: + command = f"OPTIMIZE {table}" + + if from_date: + command += f"WHERE {date_column} >= {from_date}" + + if z_order: + command += f" ZORDER BY {','.join(z_order)}" + + logger.info(f"Running optimize with command {command}...") + client.conn.sql(command) + logger.info(f"Optimize successful for table {table}.") diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index c01fee1d..0be7d6af 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -14,6 +14,7 @@ from butterfree.dataframe_service import repartition_df from butterfree.hooks import Hook from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook +from butterfree.load.writers.delta_writer import DeltaWriter from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -114,6 +115,7 @@ def __init__( interval_mode: bool = False, check_schema_hook: Optional[Hook] = None, row_count_validation: bool = True, + merge_on: list = None, ): super(HistoricalFeatureStoreWriter, self).__init__( db_config or MetastoreConfig(), @@ -121,6 +123,7 @@ def __init__( interval_mode, False, row_count_validation, + merge_on, ) self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" @@ -141,6 +144,7 @@ def write( feature_set: object processed with feature_set informations. dataframe: spark dataframe containing data from a feature set. spark_client: client for spark connections with external services. + merge_on: when filled, the writing is an upsert in a Delta table. 
If the debug_mode is set to True, a temporary table with a name in the format: historical_feature_store__{feature_set.name} will be created instead of writing @@ -174,13 +178,22 @@ def write( s3_key = os.path.join("historical", feature_set.entity, feature_set.name) - spark_client.write_table( - dataframe=dataframe, - database=self.database, - table_name=feature_set.name, - partition_by=self.PARTITION_BY, - **self.db_config.get_options(s3_key), - ) + if self.merge_on: + DeltaWriter.merge( + client=spark_client, + database=self.database, + table=feature_set.name, + merge_on=self.merge_on, + source_df=dataframe, + ) + else: + spark_client.write_table( + dataframe=dataframe, + database=self.database, + table_name=feature_set.name, + partition_by=self.PARTITION_BY, + **self.db_config.get_options(s3_key), + ) def _assert_validation_count( self, table_name: str, written_count: int, dataframe_count: int diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 780b9ec2..a99514ae 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -27,6 +27,7 @@ def __init__( interval_mode: Optional[bool] = False, write_to_entity: Optional[bool] = False, row_count_validation: Optional[bool] = True, + merge_on: Optional[list] = None, ) -> None: super().__init__() self.db_config = db_config @@ -35,6 +36,7 @@ def __init__( self.interval_mode = interval_mode self.write_to_entity = write_to_entity self.row_count_validation = row_count_validation + self.merge_on = merge_on def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any diff --git a/docs/source/butterfree.clients.rst b/docs/source/butterfree.clients.rst index 1bfaa86d..b1e1029a 100644 --- a/docs/source/butterfree.clients.rst +++ b/docs/source/butterfree.clients.rst @@ -4,18 +4,24 @@ butterfree.clients package Submodules ---------- +butterfree.clients.abstract\_client module +------------------------------------------ .. automodule:: butterfree.clients.abstract_client :members: :undoc-members: :show-inheritance: +butterfree.clients.cassandra\_client module +------------------------------------------- .. automodule:: butterfree.clients.cassandra_client :members: :undoc-members: :show-inheritance: +butterfree.clients.spark\_client module +--------------------------------------- .. automodule:: butterfree.clients.spark_client :members: diff --git a/docs/source/butterfree.configs.db.rst b/docs/source/butterfree.configs.db.rst index 3bb9f8b8..6e23dc1c 100644 --- a/docs/source/butterfree.configs.db.rst +++ b/docs/source/butterfree.configs.db.rst @@ -4,24 +4,32 @@ butterfree.configs.db package Submodules ---------- +butterfree.configs.db.abstract\_config module +--------------------------------------------- .. automodule:: butterfree.configs.db.abstract_config :members: :undoc-members: :show-inheritance: +butterfree.configs.db.cassandra\_config module +---------------------------------------------- .. automodule:: butterfree.configs.db.cassandra_config :members: :undoc-members: :show-inheritance: +butterfree.configs.db.kafka\_config module +------------------------------------------ .. automodule:: butterfree.configs.db.kafka_config :members: :undoc-members: :show-inheritance: +butterfree.configs.db.metastore\_config module +---------------------------------------------- .. 
automodule:: butterfree.configs.db.metastore_config :members: diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index f3cf2aa2..18a82795 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -12,12 +12,41 @@ Subpackages Submodules ---------- +butterfree.configs.environment module +------------------------------------- .. automodule:: butterfree.configs.environment :members: :undoc-members: :show-inheritance: +butterfree.configs.logger module +-------------------------------- + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: .. automodule:: butterfree.configs.logger :members: diff --git a/docs/source/butterfree.constants.rst b/docs/source/butterfree.constants.rst index d0e72fed..e90b195e 100644 --- a/docs/source/butterfree.constants.rst +++ b/docs/source/butterfree.constants.rst @@ -4,18 +4,56 @@ butterfree.constants package Submodules ---------- +butterfree.constants.columns module +----------------------------------- .. automodule:: butterfree.constants.columns :members: :undoc-members: :show-inheritance: +butterfree.constants.data\_type module +-------------------------------------- .. automodule:: butterfree.constants.data_type :members: :undoc-members: :show-inheritance: +butterfree.constants.migrations module +-------------------------------------- + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + +butterfree.constants.spark\_constants module +-------------------------------------------- + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + .. automodule:: butterfree.constants.migrations :members: @@ -28,6 +66,33 @@ Submodules :undoc-members: :show-inheritance: +butterfree.constants.window\_definitions module +----------------------------------------------- + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: .. 
automodule:: butterfree.constants.window_definitions :members: diff --git a/docs/source/butterfree.extract.pre_processing.rst b/docs/source/butterfree.extract.pre_processing.rst index 172e6fb3..e8e66e3d 100644 --- a/docs/source/butterfree.extract.pre_processing.rst +++ b/docs/source/butterfree.extract.pre_processing.rst @@ -4,30 +4,40 @@ butterfree.extract.pre\_processing package Submodules ---------- +butterfree.extract.pre\_processing.explode\_json\_column\_transform module +-------------------------------------------------------------------------- .. automodule:: butterfree.extract.pre_processing.explode_json_column_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.filter\_transform module +----------------------------------------------------------- .. automodule:: butterfree.extract.pre_processing.filter_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.forward\_fill\_transform module +------------------------------------------------------------------ .. automodule:: butterfree.extract.pre_processing.forward_fill_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.pivot\_transform module +---------------------------------------------------------- .. automodule:: butterfree.extract.pre_processing.pivot_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.replace\_transform module +------------------------------------------------------------ .. automodule:: butterfree.extract.pre_processing.replace_transform :members: diff --git a/docs/source/butterfree.extract.readers.rst b/docs/source/butterfree.extract.readers.rst index a67d47e9..40df200e 100644 --- a/docs/source/butterfree.extract.readers.rst +++ b/docs/source/butterfree.extract.readers.rst @@ -4,24 +4,32 @@ butterfree.extract.readers package Submodules ---------- +butterfree.extract.readers.file\_reader module +---------------------------------------------- .. automodule:: butterfree.extract.readers.file_reader :members: :undoc-members: :show-inheritance: +butterfree.extract.readers.kafka\_reader module +----------------------------------------------- .. automodule:: butterfree.extract.readers.kafka_reader :members: :undoc-members: :show-inheritance: +butterfree.extract.readers.reader module +---------------------------------------- .. automodule:: butterfree.extract.readers.reader :members: :undoc-members: :show-inheritance: +butterfree.extract.readers.table\_reader module +----------------------------------------------- .. automodule:: butterfree.extract.readers.table_reader :members: diff --git a/docs/source/butterfree.extract.rst b/docs/source/butterfree.extract.rst index a59d2e29..455f02d5 100644 --- a/docs/source/butterfree.extract.rst +++ b/docs/source/butterfree.extract.rst @@ -13,6 +13,8 @@ Subpackages Submodules ---------- +butterfree.extract.source module +-------------------------------- .. automodule:: butterfree.extract.source :members: diff --git a/docs/source/butterfree.load.processing.rst b/docs/source/butterfree.load.processing.rst index 4c5d2a2e..d16182cb 100644 --- a/docs/source/butterfree.load.processing.rst +++ b/docs/source/butterfree.load.processing.rst @@ -4,6 +4,8 @@ butterfree.load.processing package Submodules ---------- +butterfree.load.processing.json\_transform module +------------------------------------------------- .. 
automodule:: butterfree.load.processing.json_transform :members: diff --git a/docs/source/butterfree.load.rst b/docs/source/butterfree.load.rst index e38934a5..e4b56fbc 100644 --- a/docs/source/butterfree.load.rst +++ b/docs/source/butterfree.load.rst @@ -13,6 +13,8 @@ Subpackages Submodules ---------- +butterfree.load.sink module +--------------------------- .. automodule:: butterfree.load.sink :members: diff --git a/docs/source/butterfree.load.writers.rst b/docs/source/butterfree.load.writers.rst index 6ff438de..2a173c9a 100644 --- a/docs/source/butterfree.load.writers.rst +++ b/docs/source/butterfree.load.writers.rst @@ -4,18 +4,24 @@ butterfree.load.writers package Submodules ---------- +butterfree.load.writers.historical\_feature\_store\_writer module +----------------------------------------------------------------- .. automodule:: butterfree.load.writers.historical_feature_store_writer :members: :undoc-members: :show-inheritance: +butterfree.load.writers.online\_feature\_store\_writer module +------------------------------------------------------------- .. automodule:: butterfree.load.writers.online_feature_store_writer :members: :undoc-members: :show-inheritance: +butterfree.load.writers.writer module +------------------------------------- .. automodule:: butterfree.load.writers.writer :members: diff --git a/docs/source/butterfree.pipelines.rst b/docs/source/butterfree.pipelines.rst index e0c31996..e70a4d89 100644 --- a/docs/source/butterfree.pipelines.rst +++ b/docs/source/butterfree.pipelines.rst @@ -4,6 +4,8 @@ butterfree.pipelines package Submodules ---------- +butterfree.pipelines.feature\_set\_pipeline module +-------------------------------------------------- .. automodule:: butterfree.pipelines.feature_set_pipeline :members: diff --git a/docs/source/butterfree.reports.rst b/docs/source/butterfree.reports.rst index 850db914..a95a7e7f 100644 --- a/docs/source/butterfree.reports.rst +++ b/docs/source/butterfree.reports.rst @@ -4,6 +4,8 @@ butterfree.reports package Submodules ---------- +butterfree.reports.metadata module +---------------------------------- .. automodule:: butterfree.reports.metadata :members: diff --git a/docs/source/butterfree.transform.features.rst b/docs/source/butterfree.transform.features.rst index f6c69095..837e0fcf 100644 --- a/docs/source/butterfree.transform.features.rst +++ b/docs/source/butterfree.transform.features.rst @@ -4,18 +4,24 @@ butterfree.transform.features package Submodules ---------- +butterfree.transform.features.feature module +-------------------------------------------- .. automodule:: butterfree.transform.features.feature :members: :undoc-members: :show-inheritance: +butterfree.transform.features.key\_feature module +------------------------------------------------- .. automodule:: butterfree.transform.features.key_feature :members: :undoc-members: :show-inheritance: +butterfree.transform.features.timestamp\_feature module +------------------------------------------------------- .. automodule:: butterfree.transform.features.timestamp_feature :members: diff --git a/docs/source/butterfree.transform.rst b/docs/source/butterfree.transform.rst index 02f8d4c6..12c346ae 100644 --- a/docs/source/butterfree.transform.rst +++ b/docs/source/butterfree.transform.rst @@ -14,12 +14,16 @@ Subpackages Submodules ---------- +butterfree.transform.aggregated\_feature\_set module +---------------------------------------------------- .. 
automodule:: butterfree.transform.aggregated_feature_set :members: :undoc-members: :show-inheritance: +butterfree.transform.feature\_set module +---------------------------------------- .. automodule:: butterfree.transform.feature_set :members: diff --git a/docs/source/butterfree.transform.transformations.rst b/docs/source/butterfree.transform.transformations.rst index 0978edcf..f17818d3 100644 --- a/docs/source/butterfree.transform.transformations.rst +++ b/docs/source/butterfree.transform.transformations.rst @@ -12,42 +12,56 @@ Subpackages Submodules ---------- +butterfree.transform.transformations.aggregated\_transform module +----------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.aggregated_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.custom\_transform module +------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.custom_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.h3\_transform module +--------------------------------------------------------- .. automodule:: butterfree.transform.transformations.h3_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.spark\_function\_transform module +---------------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.spark_function_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.sql\_expression\_transform module +---------------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.sql_expression_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.stack\_transform module +------------------------------------------------------------ .. automodule:: butterfree.transform.transformations.stack_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.transform\_component module +---------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.transform_component :members: diff --git a/docs/source/butterfree.transform.transformations.user_defined_functions.rst b/docs/source/butterfree.transform.transformations.user_defined_functions.rst index f93c7e98..b79e8138 100644 --- a/docs/source/butterfree.transform.transformations.user_defined_functions.rst +++ b/docs/source/butterfree.transform.transformations.user_defined_functions.rst @@ -4,12 +4,16 @@ butterfree.transform.transformations.user\_defined\_functions package Submodules ---------- +butterfree.transform.transformations.user\_defined\_functions.mode module +------------------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.user_defined_functions.mode :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.user\_defined\_functions.most\_frequent\_set module +---------------------------------------------------------------------------------------- .. 
automodule:: butterfree.transform.transformations.user_defined_functions.most_frequent_set :members: diff --git a/docs/source/butterfree.transform.utils.rst b/docs/source/butterfree.transform.utils.rst index 82e9038b..d1d7206c 100644 --- a/docs/source/butterfree.transform.utils.rst +++ b/docs/source/butterfree.transform.utils.rst @@ -4,18 +4,24 @@ butterfree.transform.utils package Submodules ---------- +butterfree.transform.utils.date\_range module +--------------------------------------------- .. automodule:: butterfree.transform.utils.date_range :members: :undoc-members: :show-inheritance: +butterfree.transform.utils.function module +------------------------------------------ .. automodule:: butterfree.transform.utils.function :members: :undoc-members: :show-inheritance: +butterfree.transform.utils.window\_spec module +---------------------------------------------- .. automodule:: butterfree.transform.utils.window_spec :members: diff --git a/docs/source/butterfree.validations.rst b/docs/source/butterfree.validations.rst index 35f5d199..2aa0053e 100644 --- a/docs/source/butterfree.validations.rst +++ b/docs/source/butterfree.validations.rst @@ -4,12 +4,16 @@ butterfree.validations package Submodules ---------- +butterfree.validations.basic\_validaton module +---------------------------------------------- .. automodule:: butterfree.validations.basic_validaton :members: :undoc-members: :show-inheritance: +butterfree.validations.validation module +---------------------------------------- .. automodule:: butterfree.validations.validation :members: diff --git a/mypy.ini b/mypy.ini index eb867a47..fc293149 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,3 +9,42 @@ show_error_codes = True show_error_context = True disable_error_code = attr-defined, list-item, operator pretty = True + +[mypy-butterfree.pipelines.*] +ignore_errors = True + +[mypy-butterfree.load.*] +ignore_errors = True + +[mypy-butterfree.transform.*] +ignore_errors = True + +[mypy-butterfree.extract.*] +ignore_errors = True + +[mypy-butterfree.config.*] +ignore_errors = True + +[mypy-butterfree.clients.*] +ignore_errors = True + +[mypy-butterfree.configs.*] +ignore_errors = True + +[mypy-butterfree.dataframe_service.*] +ignore_errors = True + +[mypy-butterfree.validations.*] +ignore_errors = True + +[mypy-butterfree.migrations.*] +ignore_errors = True + +[mypy-butterfree.testing.*] +ignore_errors = True + +[mypy-butterfree.hooks.*] +ignore_errors = True + +[mypy-butterfree._cli.*] +ignore_errors = True diff --git a/requirements.txt b/requirements.txt index f3968c60..9c9eea64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ typer==0.4.2 typing-extensions>3.7.4,<5 boto3==1.17.* numpy==1.26.4 +delta-spark==3.2.0 diff --git a/setup.py b/setup.py index 42ef57c8..bc4f0b45 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.4" +__version__ = "1.3.5" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/load/writers/test_delta_writer.py b/tests/unit/butterfree/load/writers/test_delta_writer.py new file mode 100644 index 00000000..550f6d05 --- /dev/null +++ b/tests/unit/butterfree/load/writers/test_delta_writer.py @@ -0,0 +1,83 @@ +import os +from unittest import mock + +import pytest + +from butterfree.clients import SparkClient +from butterfree.load.writers import DeltaWriter + +DELTA_LOCATION = "spark-warehouse" + + +class TestDeltaWriter: + + 
def __checkFileExists(self, file_name: str = "test_delta_table") -> bool: + return os.path.exists(os.path.join(DELTA_LOCATION, file_name)) + + @pytest.fixture + def merge_builder_mock(self): + builder = mock.MagicMock() + builder.whenMatchedDelete.return_value = builder + builder.whenMatchedUpdateAll.return_value = builder + builder.whenNotMatchedInsertAll.return_value = builder + return builder + + def test_merge(self, feature_set_dataframe, merge_builder_mock): + + client = SparkClient() + delta_writer = DeltaWriter() + delta_writer.merge = mock.MagicMock() + + DeltaWriter().merge( + client=client, + database=None, + table="test_delta_table", + merge_on=["id"], + source_df=feature_set_dataframe, + ) + + assert merge_builder_mock.execute.assert_called_once + + # Step 2 + source = client.conn.createDataFrame( + [(1, "test3"), (2, "test4"), (3, "test5")], ["id", "feature"] + ) + + DeltaWriter().merge( + client=client, + database=None, + table="test_delta_table", + merge_on=["id"], + source_df=source, + when_not_matched_insert_condition=None, + when_matched_update_condition="id > 2", + ) + + assert merge_builder_mock.execute.assert_called_once + + def test_optimize(self, mocker): + + client = SparkClient() + conn_mock = mocker.patch( + "butterfree.clients.SparkClient.conn", return_value=mock.Mock() + ) + dw = DeltaWriter() + + dw.optimize = mock.MagicMock(client) + dw.optimize(client, "a_table") + + conn_mock.assert_called_once + + def test_vacuum(self, mocker): + + client = SparkClient() + conn_mock = mocker.patch( + "butterfree.clients.SparkClient.conn", return_value=mock.Mock() + ) + dw = DeltaWriter() + retention_hours = 24 + dw.vacuum = mock.MagicMock(client) + + dw.vacuum("a_table", retention_hours, client) + + conn_mock.assert_called_once diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index 9e84aacd..d9d9181a 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -1,5 +1,6 @@ import datetime import random +from unittest import mock import pytest from pyspark.sql.functions import spark_partition_id @@ -145,6 +146,30 @@ def test_write_in_debug_mode_with_interval_mode( # then assert_dataframe_equality(historical_feature_set_dataframe, result_df) + def test_merge_from_historical_writer( + self, + feature_set, + feature_set_dataframe, + mocker, + ): + # given + spark_client = SparkClient() + + spark_client.write_table = mocker.stub("write_table") + writer = HistoricalFeatureStoreWriter(merge_on=["id", "timestamp"]) + + static_mock = mocker.patch( + "butterfree.load.writers.DeltaWriter.merge", return_value=mock.Mock() + ) + + writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) + + assert static_mock.call_count == 1 + def test_validate(self, historical_feature_set_dataframe, mocker, feature_set): # given spark_client = mocker.stub("spark_client") diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index fcf60132..104300c9 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -1,6 +1,6 @@ -import json from unittest.mock import Mock +import pyspark.pandas as ps from pyspark.sql import functions from pytest import fixture @@ -54,7 +54,8 @@ def make_dataframe(spark_context, spark_session): "nonfeature": 
0, }, ] - df = spark_session.read.json(spark_context.parallelize(data, 1)) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) return df @@ -70,9 +71,8 @@ def make_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, {"id": 1, "ts": 7, "feature1": None, "feature2": None, "feature3": None}, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) return df @@ -86,9 +86,8 @@ def make_output_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 4, "feature1": 0, "feature2": 1, "feature3": 1}, {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) return df @@ -127,9 +126,8 @@ def make_rolling_windows_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_week_rolling_windows": None, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -156,9 +154,8 @@ def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_day_rolling_windows": 500.0, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -205,9 +202,8 @@ def make_multiple_rolling_windows_hour_slide_agg_dataframe( "feature2__avg_over_3_days_rolling_windows": 500.0, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -257,7 +253,8 @@ def make_fs_dataframe_with_distinct(spark_context, spark_session): "h3": "86a8100efffffff", }, ] - df = spark_session.read.json(spark_context.parallelize(data, 1)) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -286,9 +283,8 @@ def make_target_df_distinct(spark_context, spark_session): "feature__sum_over_3_days_rolling_windows": None, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df From f6c5db6f7c444125a241cde5062c7ca6acd06dd2 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Wed, 21 Aug 2024 13:01:32 -0300 Subject: [PATCH 68/86] Fix dup code (#373) * fix: dedup code --- .../historical_feature_store_writer.py | 9 ++++ tests/unit/butterfree/transform/conftest.py | 53 ++++++------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 
0be7d6af..99bfe66a 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -93,6 +93,15 @@ class HistoricalFeatureStoreWriter(Writer): improve queries performance. The data is stored in partition folders in AWS S3 based on time (per year, month and day). + >>> spark_client = SparkClient() + >>> writer = HistoricalFeatureStoreWriter() + >>> writer.write(feature_set=feature_set, + ... dataframe=dataframe, + ... spark_client=spark_client + ... merge_on=["id", "timestamp"]) + + This procedure will skip dataframe write and will activate Delta Merge. + Use it when the table already exist. """ PARTITION_BY = [ diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index 104300c9..d66d1c39 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -16,6 +16,15 @@ from butterfree.transform.utils import Function +def create_dataframe(data, timestamp_col="ts"): + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() + df = df.withColumn( + TIMESTAMP_COLUMN, df[timestamp_col].cast(DataType.TIMESTAMP.spark) + ) + return df + + def make_dataframe(spark_context, spark_session): data = [ { @@ -54,11 +63,7 @@ def make_dataframe(spark_context, spark_session): "nonfeature": 0, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data) def make_filtering_dataframe(spark_context, spark_session): @@ -71,11 +76,7 @@ def make_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, {"id": 1, "ts": 7, "feature1": None, "feature2": None, "feature3": None}, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data) def make_output_filtering_dataframe(spark_context, spark_session): @@ -86,11 +87,7 @@ def make_output_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 4, "feature1": 0, "feature2": 1, "feature3": 1}, {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data) def make_rolling_windows_agg_dataframe(spark_context, spark_session): @@ -126,11 +123,7 @@ def make_rolling_windows_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_week_rolling_windows": None, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data, timestamp_col="timestamp") def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): @@ -154,11 +147,7 @@ def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_day_rolling_windows": 500.0, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data, timestamp_col="timestamp") def make_multiple_rolling_windows_hour_slide_agg_dataframe( @@ -202,11 +191,7 @@ def make_multiple_rolling_windows_hour_slide_agg_dataframe( 
"feature2__avg_over_3_days_rolling_windows": 500.0, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data, timestamp_col="timestamp") def make_fs(spark_context, spark_session): @@ -253,9 +238,7 @@ def make_fs_dataframe_with_distinct(spark_context, spark_session): "h3": "86a8100efffffff", }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + df = create_dataframe(data, "timestamp") return df @@ -283,9 +266,7 @@ def make_target_df_distinct(spark_context, spark_session): "feature__sum_over_3_days_rolling_windows": None, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + df = create_dataframe(data, "timestamp") return df From 11cc5d5c006a17f839f52517f4907c1b40f8d20e Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 16 Sep 2024 10:36:47 -0300 Subject: [PATCH 69/86] fix: performance improvements (#374) --- butterfree/_cli/migrate.py | 14 +++++++-- butterfree/extract/source.py | 14 ++++++--- butterfree/pipelines/feature_set_pipeline.py | 29 ++++++++++++----- .../transform/aggregated_feature_set.py | 31 +++++++++++-------- butterfree/transform/feature_set.py | 7 ++--- .../butterfree/transform/test_feature_set.py | 2 +- 6 files changed, 64 insertions(+), 33 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index f5161509..6bd5ca08 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -4,7 +4,7 @@ import os import pkgutil import sys -from typing import Set +from typing import Set, Type import boto3 import setuptools @@ -90,8 +90,18 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: instances.add(value) + def create_instance(cls: Type[FeatureSetPipeline]) -> FeatureSetPipeline: + sig = inspect.signature(cls.__init__) + parameters = sig.parameters + + if "run_date" in parameters: + run_date = datetime.datetime.today().strftime("%y-%m-%d") + return cls(run_date) + + return cls() + logger.info("Creating instances...") - return set(value() for value in instances) # type: ignore + return set(create_instance(value) for value in instances) # type: ignore PATH = typer.Argument( diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index bfc15271..9d50e94c 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -3,6 +3,7 @@ from typing import List, Optional from pyspark.sql import DataFrame +from pyspark.storagelevel import StorageLevel from butterfree.clients import SparkClient from butterfree.extract.readers.reader import Reader @@ -95,16 +96,21 @@ def construct( DataFrame with the query result against all readers. 
""" + # Step 1: Build temporary views for each reader for reader in self.readers: - reader.build( - client=client, start_date=start_date, end_date=end_date - ) # create temporary views for each reader + reader.build(client=client, start_date=start_date, end_date=end_date) + # Step 2: Execute SQL query on the combined readers dataframe = client.sql(self.query) + # Step 3: Cache the dataframe if necessary, using memory and disk storage if not dataframe.isStreaming and self.eager_evaluation: - dataframe.cache().count() + # Persist to ensure the DataFrame is stored in mem and disk (if necessary) + dataframe.persist(StorageLevel.MEMORY_AND_DISK) + # Trigger the cache/persist operation by performing an action + dataframe.count() + # Step 4: Run post-processing hooks on the dataframe post_hook_df = self.run_post_hooks(dataframe) return post_hook_df diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index 8ba1a636..d57459f3 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -2,6 +2,8 @@ from typing import List, Optional +from pyspark.storagelevel import StorageLevel + from butterfree.clients import SparkClient from butterfree.dataframe_service import repartition_sort_df from butterfree.extract import Source @@ -209,19 +211,25 @@ def run( soon. Use only if strictly necessary. """ + # Step 1: Construct input dataframe from the source. dataframe = self.source.construct( client=self.spark_client, start_date=self.feature_set.define_start_date(start_date), end_date=end_date, ) + # Step 2: Repartition and sort if required, avoid if not necessary. if partition_by: order_by = order_by or partition_by - dataframe = repartition_sort_df( - dataframe, partition_by, order_by, num_processors - ) - - dataframe = self.feature_set.construct( + current_partitions = dataframe.rdd.getNumPartitions() + optimal_partitions = num_processors or current_partitions + if current_partitions != optimal_partitions: + dataframe = repartition_sort_df( + dataframe, partition_by, order_by, num_processors + ) + + # Step 3: Construct the feature set dataframe using defined transformations. + transformed_dataframe = self.feature_set.construct( dataframe=dataframe, client=self.spark_client, start_date=start_date, @@ -229,15 +237,20 @@ def run( num_processors=num_processors, ) + if dataframe.storageLevel != StorageLevel.NONE: + dataframe.unpersist() # Clear the data from the cache (disk and memory) + + # Step 4: Load the data into the configured sink. self.sink.flush( - dataframe=dataframe, + dataframe=transformed_dataframe, feature_set=self.feature_set, spark_client=self.spark_client, ) - if not dataframe.isStreaming: + # Step 5: Validate the output if not streaming and data volume is reasonable. 
+ if not transformed_dataframe.isStreaming: self.sink.validate( - dataframe=dataframe, + dataframe=transformed_dataframe, feature_set=self.feature_set, spark_client=self.spark_client, ) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 6706bf8c..9f55ae93 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -387,6 +387,7 @@ def _aggregate( ] groupby = self.keys_columns.copy() + if window is not None: dataframe = dataframe.withColumn("window", window.get()) groupby.append("window") @@ -410,19 +411,23 @@ def _aggregate( "keep_rn", functions.row_number().over(partition_window) ).filter("keep_rn = 1") - # repartition to have all rows for each group at the same partition - # by doing that, we won't have to shuffle data on grouping by id - dataframe = repartition_df( - dataframe, - partition_by=groupby, - num_processors=num_processors, - ) + current_partitions = dataframe.rdd.getNumPartitions() + optimal_partitions = num_processors or current_partitions + + if current_partitions != optimal_partitions: + dataframe = repartition_df( + dataframe, + partition_by=groupby, + num_processors=optimal_partitions, + ) + grouped_data = dataframe.groupby(*groupby) - if self._pivot_column: + if self._pivot_column and self._pivot_values: grouped_data = grouped_data.pivot(self._pivot_column, self._pivot_values) aggregated = grouped_data.agg(*aggregations) + return self._with_renamed_columns(aggregated, features, window) def _with_renamed_columns( @@ -637,12 +642,12 @@ def construct( output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) - if not output_df.isStreaming: - if self.deduplicate_rows: - output_df = self._filter_duplicated_rows(output_df) - if self.eager_evaluation: - output_df.cache().count() + if not output_df.isStreaming and self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) post_hook_df = self.run_post_hooks(output_df) + if not output_df.isStreaming and self.eager_evaluation: + post_hook_df.cache().count() + return post_hook_df diff --git a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index 369eaf29..2c4b9b51 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -436,11 +436,8 @@ def construct( pre_hook_df, ).select(*self.columns) - if not output_df.isStreaming: - if self.deduplicate_rows: - output_df = self._filter_duplicated_rows(output_df) - if self.eager_evaluation: - output_df.cache().count() + if not output_df.isStreaming and self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) output_df = self.incremental_strategy.filter_with_incremental_strategy( dataframe=output_df, start_date=start_date, end_date=end_date diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index e907dc0a..37a69be2 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -220,7 +220,7 @@ def test_construct( + feature_divide.get_output_columns() ) assert_dataframe_equality(result_df, feature_set_dataframe) - assert result_df.is_cached + assert not result_df.is_cached def test_construct_invalid_df( self, key_id, timestamp_c, feature_add, feature_divide From 5f7028b081c38e7eaeb1aa5fd0f3597b56afaa26 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 3 Oct 2024 15:07:11 -0300 Subject: [PATCH 70/86] fix: 
version, format (#376) --- butterfree/_cli/migrate.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 6bd5ca08..207e7daf 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -95,7 +95,7 @@ def create_instance(cls: Type[FeatureSetPipeline]) -> FeatureSetPipeline: parameters = sig.parameters if "run_date" in parameters: - run_date = datetime.datetime.today().strftime("%y-%m-%d") + run_date = datetime.datetime.today().strftime("%Y-%m-%d") return cls(run_date) return cls() diff --git a/requirements.txt b/requirements.txt index 9c9eea64..84a87735 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,6 @@ parameters-validation>=1.1.5,<2.0 pyspark==3.5.1 typer==0.4.2 typing-extensions>3.7.4,<5 -boto3==1.17.* +boto3==1.35.* numpy==1.26.4 delta-spark==3.2.0 From 52d4911d631b280826341456e3386a39c40d42be Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 7 Oct 2024 21:18:49 -0300 Subject: [PATCH 71/86] fix: performance adjustments, migrate (#378) * fix: performance adjustments, migrate --- .../database_migration/cassandra_migration.py | 20 ++++++++++++-- .../transform/aggregated_feature_set.py | 27 ++++++++++--------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index 5a4f755f..4d50746c 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -78,6 +78,9 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> st def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates CQL statement to alter columns' types. + In Cassandra 3.4.x to 3.11.x alter type is not allowed. + This method creates a temp column to comply. + Args: columns: list of Diff objects with ALTER_TYPE kind. table_name: table name. @@ -86,10 +89,23 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: Alter column type query. 
""" - parsed_columns = self._get_parsed_columns([column]) + temp_column_name = f"{column.column}_temp" + + add_temp_column_query = ( + f"ALTER TABLE {table_name} ADD {temp_column_name} {column.value};" + ) + copy_data_to_temp_query = ( + f"UPDATE {table_name} SET {temp_column_name} = {column.column};" + ) + + drop_old_column_query = f"ALTER TABLE {table_name} DROP {column.column};" + rename_temp_column_query = ( + f"ALTER TABLE {table_name} RENAME {temp_column_name} TO {column.column};" + ) return ( - f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};" + f"{add_temp_column_query} {copy_data_to_temp_query} " + f"{drop_old_column_query} {rename_temp_column_query};" ) @staticmethod diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 9f55ae93..516b6fed 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -576,14 +576,16 @@ def construct( pre_hook_df = self.run_pre_hooks(dataframe) - output_df = reduce( - lambda df, feature: feature.transform(df), - self.keys + [self.timestamp], - pre_hook_df, + output_df = pre_hook_df + for feature in self.keys + [self.timestamp]: + output_df = feature.transform(output_df) + + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date ) if self._windows and end_date is not None: - # run aggregations for each window + # Run aggregations for each window agg_list = [ self._aggregate( dataframe=output_df, @@ -603,13 +605,12 @@ def construct( # keeping this logic to maintain the same behavior for already implemented # feature sets - if self._windows[0].slide == "1 day": base_df = self._get_base_dataframe( client=client, dataframe=output_df, end_date=end_date ) - # left join each aggregation result to our base dataframe + # Left join each aggregation result to our base dataframe output_df = reduce( lambda left, right: self._dataframe_join( left, @@ -635,19 +636,21 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) - output_df = self.incremental_strategy.filter_with_incremental_strategy( - dataframe=output_df, start_date=start_date, end_date=end_date - ) - output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) + if not output_df.isStreaming and self.deduplicate_rows: output_df = self._filter_duplicated_rows(output_df) post_hook_df = self.run_post_hooks(output_df) + # Eager evaluation, only if needed and managable if not output_df.isStreaming and self.eager_evaluation: - post_hook_df.cache().count() + # Small dataframes only + if output_df.count() < 1_000_000: + post_hook_df.cache().count() + else: + post_hook_df.cache() # Cache without materialization for large volumes return post_hook_df From 7f65873bf2494bd8c78dec014fb98140c08f4b62 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 10 Oct 2024 09:36:12 -0300 Subject: [PATCH 72/86] chore: level (#382) --- butterfree/pipelines/feature_set_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index d57459f3..cda233f7 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -237,7 +237,7 @@ def run( num_processors=num_processors, ) - if dataframe.storageLevel != StorageLevel.NONE: + if dataframe.storageLevel != StorageLevel(False, False, False, False, 
1): dataframe.unpersist() # Clear the data from the cache (disk and memory) # Step 4: Load the data into the configured sink. From ab551c089409e552b6ad5330407e360c7411838a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Albuquerque?= Date: Fri, 11 Oct 2024 16:09:47 -0300 Subject: [PATCH 73/86] feat(mlop-2456): add protection to host setting on cassandra_client (#385) * feat(mlop-2454): add protection to wrong host values and unit tests * trigger * fix(mlop-2454): using sonar cloud compilant * chore(mlop-2454): apply fmt * feat(mlop-2454): update exclude list * docs(mlops-2456): update docstring * feat(mlops-2456): add exclude to flake8 * chore(mlops-2456): rolling back test --- Makefile | 8 +-- butterfree/clients/cassandra_client.py | 46 +++++++++++++++- .../clients/test_cassandra_client.py | 52 ++++++++++++++++++- 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index a93104ab..2156f9b0 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ style-check: @echo "Code Style" @echo "==========" @echo "" - @python -m black --check -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . && echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1) + @python -m black --check -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/|venv/" . && echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1) .PHONY: quality-check ## run code quality checks with flake8 @@ -85,7 +85,7 @@ quality-check: @echo "Flake 8" @echo "=======" @echo "" - @python -m flake8 && echo "Success" + python -m flake8 --exclude="dist,build,pip,.pip,deps,.venv,venv,.git,.hg,.mypy_cache,.tox" && echo "Success" @echo "" .PHONY: type-check @@ -95,7 +95,7 @@ type-check: @echo "mypy" @echo "====" @echo "" - @python -m mypy butterfree + @python -m mypy --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/|venv/" butterfree .PHONY: checks ## run all code checks @@ -104,7 +104,7 @@ checks: style-check quality-check type-check .PHONY: apply-style ## fix stylistic errors with black apply-style: - @python -m black -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . + @python -m black -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/|venv/" . @python -m isort --atomic butterfree/ tests/ .PHONY: clean diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 714e8248..0b300844 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -1,7 +1,7 @@ """CassandraClient entity.""" from ssl import CERT_REQUIRED, PROTOCOL_TLSv1 -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union from cassandra.auth import PlainTextAuthProvider from cassandra.cluster import ( @@ -16,6 +16,12 @@ from typing_extensions import TypedDict from butterfree.clients import AbstractClient +from butterfree.configs.logger import __logger + +logger = __logger("cassandra_client") + +EMPTY_STRING_HOST_ERROR = "The value of Cassandra host is empty. 
Please fill correctly with your endpoints" # noqa: E501 +GENERIC_INVALID_HOST_ERROR = "The Cassandra host must be a valid string, a string that represents a list or list of strings" # noqa: E501 class CassandraColumn(TypedDict): @@ -53,12 +59,48 @@ def __init__( user: Optional[str] = None, password: Optional[str] = None, ) -> None: - self.host = host + self.host = self._validate_and_format_cassandra_host(host) + logger.info(f"The host setted is {self.host}") self.keyspace = keyspace self.user = user self.password = password self._session: Optional[Session] = None + def _validate_and_format_cassandra_host(self, host: Union[List, str]): + """ + Validate and format the provided Cassandra host input. + + This method checks if the input `host` is either a string, a list of strings, or + a list containing a single string with comma-separated values. It splits the string + by commas and trims whitespace, returning a list of hosts. If the input is already + a list of strings, it returns that list. If the input is empty or invalid, a + ValueError is raised. + + Args: + host (str | list): The Cassandra host input, which can be a comma-separated + string or a list of string endpoints. + + Returns: + list: A list of formatted Cassandra host strings. + + Raises: + ValueError: If the input is an empty list/string or if it is not a string + (or a representation of a list) or a list of strings. + """ # noqa: E501 + if isinstance(host, str): + if host: + return [item.strip() for item in host.split(",")] + else: + raise ValueError(EMPTY_STRING_HOST_ERROR) + + if isinstance(host, list): + if len(host) == 1 and isinstance(host[0], str): + return [item.strip() for item in host[0].split(",")] + elif all(isinstance(item, str) for item in host): + return host + + raise ValueError(GENERIC_INVALID_HOST_ERROR) + @property def conn(self, *, ssl_path: str = None) -> Session: # type: ignore """Establishes a Cassandra connection.""" diff --git a/tests/unit/butterfree/clients/test_cassandra_client.py b/tests/unit/butterfree/clients/test_cassandra_client.py index 0356e43f..9f634cf7 100644 --- a/tests/unit/butterfree/clients/test_cassandra_client.py +++ b/tests/unit/butterfree/clients/test_cassandra_client.py @@ -1,8 +1,14 @@ from typing import Any, Dict, List from unittest.mock import MagicMock +import pytest + from butterfree.clients import CassandraClient -from butterfree.clients.cassandra_client import CassandraColumn +from butterfree.clients.cassandra_client import ( + EMPTY_STRING_HOST_ERROR, + GENERIC_INVALID_HOST_ERROR, + CassandraColumn, +) def sanitize_string(query: str) -> str: @@ -86,3 +92,47 @@ def test_cassandra_create_table( query = cassandra_client.sql.call_args[0][0] assert sanitize_string(query) == sanitize_string(expected_query) + + def test_initialize_with_string_host(self): + client = CassandraClient(host="127.0.0.0, 127.0.0.1", keyspace="dummy_keyspace") + assert client.host == ["127.0.0.0", "127.0.0.1"] + + def test_initialize_with_list_host(self): + client = CassandraClient( + host=["127.0.0.0", "127.0.0.1"], keyspace="test_keyspace" + ) + assert client.host == ["127.0.0.0", "127.0.0.1"] + + def test_initialize_with_empty_string_host(self): + with pytest.raises( + ValueError, + match=EMPTY_STRING_HOST_ERROR, + ): + CassandraClient(host="", keyspace="test_keyspace") + + def test_initialize_with_none_host(self): + with pytest.raises( + ValueError, + match=GENERIC_INVALID_HOST_ERROR, + ): + CassandraClient(host=None, keyspace="test_keyspace") + + def test_initialize_with_invalid_host_type(self): + with 
pytest.raises( + ValueError, + match=GENERIC_INVALID_HOST_ERROR, + ): + CassandraClient(host=123, keyspace="test_keyspace") + + def test_initialize_with_invalid_list_host(self): + with pytest.raises( + ValueError, + match=GENERIC_INVALID_HOST_ERROR, + ): + CassandraClient(host=["127.0.0.0", 123], keyspace="test_keyspace") + + def test_initialize_with_list_of_string_hosts(self): + client = CassandraClient( + host=["127.0.0.0, 127.0.0.1"], keyspace="test_keyspace" + ) + assert client.host == ["127.0.0.0", "127.0.0.1"] From a11a699a578e14f6ca05a1cf992c2fc56ebf703d Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 14 Oct 2024 09:40:39 -0300 Subject: [PATCH 74/86] fix: rollback repartition (#386) * fix: rollback repartition --- butterfree/pipelines/feature_set_pipeline.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index cda233f7..f1c94ec2 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -221,12 +221,9 @@ def run( # Step 2: Repartition and sort if required, avoid if not necessary. if partition_by: order_by = order_by or partition_by - current_partitions = dataframe.rdd.getNumPartitions() - optimal_partitions = num_processors or current_partitions - if current_partitions != optimal_partitions: - dataframe = repartition_sort_df( - dataframe, partition_by, order_by, num_processors - ) + dataframe = repartition_sort_df( + dataframe, partition_by, order_by, num_processors + ) # Step 3: Construct the feature set dataframe using defined transformations. transformed_dataframe = self.feature_set.construct( @@ -237,7 +234,9 @@ def run( num_processors=num_processors, ) - if dataframe.storageLevel != StorageLevel(False, False, False, False, 1): + if transformed_dataframe.storageLevel != StorageLevel( + False, False, False, False, 1 + ): dataframe.unpersist() # Clear the data from the cache (disk and memory) # Step 4: Load the data into the configured sink. 
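Note on the writer changes in this stretch of the series: the hunks above wire the new Delta merge path end to end. merge_on is accepted by the Writer base class constructor, stored on HistoricalFeatureStoreWriter, and, when set, makes write() call DeltaWriter.merge instead of spark_client.write_table. A minimal usage sketch follows, assuming a Spark session with the delta-spark dependency available and a feature_set/dataframe pair already built upstream; both are placeholders here, not objects defined in these patches, and this is an illustration of the intended call pattern rather than code taken from the repository.

    from butterfree.clients import SparkClient
    from butterfree.load.writers import HistoricalFeatureStoreWriter

    spark_client = SparkClient()

    # Passing merge_on at construction time (as the unit tests do) routes
    # write() through DeltaWriter.merge, upserting on the given keys into
    # an existing Delta table instead of appending partitioned files.
    writer = HistoricalFeatureStoreWriter(merge_on=["id", "timestamp"])

    writer.write(
        feature_set=feature_set,  # placeholder: feature set built by a pipeline
        dataframe=dataframe,      # placeholder: that feature set's output DataFrame
        spark_client=spark_client,
    )

Leaving merge_on as None keeps the original behavior, i.e. the partitioned write_table call shown in the same hunk.
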
From b802f69292ea03a5938940dce2995202f77bf232 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 18 Oct 2024 09:50:16 -0300 Subject: [PATCH 75/86] fix: move incremental filter (#388) --- butterfree/transform/aggregated_feature_set.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 516b6fed..1230cd4d 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -580,10 +580,6 @@ def construct( for feature in self.keys + [self.timestamp]: output_df = feature.transform(output_df) - output_df = self.incremental_strategy.filter_with_incremental_strategy( - dataframe=output_df, start_date=start_date, end_date=end_date - ) - if self._windows and end_date is not None: # Run aggregations for each window agg_list = [ @@ -636,6 +632,10 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) From 51c4aed4b37e3daf34893bd58354c651675f2c74 Mon Sep 17 00:00:00 2001 From: "joao.albuquerque" Date: Wed, 4 Dec 2024 18:13:29 -0300 Subject: [PATCH 76/86] Revert "fix: move incremental filter (#388)" This reverts commit b802f69292ea03a5938940dce2995202f77bf232. --- butterfree/transform/aggregated_feature_set.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 1230cd4d..516b6fed 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -580,6 +580,10 @@ def construct( for feature in self.keys + [self.timestamp]: output_df = feature.transform(output_df) + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + if self._windows and end_date is not None: # Run aggregations for each window agg_list = [ @@ -632,10 +636,6 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) - output_df = self.incremental_strategy.filter_with_incremental_strategy( - dataframe=output_df, start_date=start_date, end_date=end_date - ) - output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) From 58256342c1fbcbf81fa24f063ceeefbe8171d626 Mon Sep 17 00:00:00 2001 From: "joao.albuquerque" Date: Wed, 4 Dec 2024 18:14:00 -0300 Subject: [PATCH 77/86] Revert "fix: rollback repartition (#386)" This reverts commit a11a699a578e14f6ca05a1cf992c2fc56ebf703d. --- butterfree/pipelines/feature_set_pipeline.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index f1c94ec2..cda233f7 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -221,9 +221,12 @@ def run( # Step 2: Repartition and sort if required, avoid if not necessary. 
if partition_by: order_by = order_by or partition_by - dataframe = repartition_sort_df( - dataframe, partition_by, order_by, num_processors - ) + current_partitions = dataframe.rdd.getNumPartitions() + optimal_partitions = num_processors or current_partitions + if current_partitions != optimal_partitions: + dataframe = repartition_sort_df( + dataframe, partition_by, order_by, num_processors + ) # Step 3: Construct the feature set dataframe using defined transformations. transformed_dataframe = self.feature_set.construct( @@ -234,9 +237,7 @@ def run( num_processors=num_processors, ) - if transformed_dataframe.storageLevel != StorageLevel( - False, False, False, False, 1 - ): + if dataframe.storageLevel != StorageLevel(False, False, False, False, 1): dataframe.unpersist() # Clear the data from the cache (disk and memory) # Step 4: Load the data into the configured sink. From dc1647bbe84dc4212d72359dddfe09d382b4ff94 Mon Sep 17 00:00:00 2001 From: "joao.albuquerque" Date: Wed, 4 Dec 2024 18:14:23 -0300 Subject: [PATCH 78/86] Revert "chore: level (#382)" This reverts commit 7f65873bf2494bd8c78dec014fb98140c08f4b62. --- butterfree/pipelines/feature_set_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index cda233f7..d57459f3 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -237,7 +237,7 @@ def run( num_processors=num_processors, ) - if dataframe.storageLevel != StorageLevel(False, False, False, False, 1): + if dataframe.storageLevel != StorageLevel.NONE: dataframe.unpersist() # Clear the data from the cache (disk and memory) # Step 4: Load the data into the configured sink. From a6a6615dd5fa6228dbe15fc381842648fc938f32 Mon Sep 17 00:00:00 2001 From: "joao.albuquerque" Date: Wed, 4 Dec 2024 18:14:43 -0300 Subject: [PATCH 79/86] Revert "fix: performance adjustments, migrate (#378)" This reverts commit 52d4911d631b280826341456e3386a39c40d42be. --- .../database_migration/cassandra_migration.py | 20 ++------------ .../transform/aggregated_feature_set.py | 27 +++++++++---------- 2 files changed, 14 insertions(+), 33 deletions(-) diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index 4d50746c..5a4f755f 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -78,9 +78,6 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> st def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates CQL statement to alter columns' types. - In Cassandra 3.4.x to 3.11.x alter type is not allowed. - This method creates a temp column to comply. - Args: columns: list of Diff objects with ALTER_TYPE kind. table_name: table name. @@ -89,23 +86,10 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: Alter column type query. 
""" - temp_column_name = f"{column.column}_temp" - - add_temp_column_query = ( - f"ALTER TABLE {table_name} ADD {temp_column_name} {column.value};" - ) - copy_data_to_temp_query = ( - f"UPDATE {table_name} SET {temp_column_name} = {column.column};" - ) - - drop_old_column_query = f"ALTER TABLE {table_name} DROP {column.column};" - rename_temp_column_query = ( - f"ALTER TABLE {table_name} RENAME {temp_column_name} TO {column.column};" - ) + parsed_columns = self._get_parsed_columns([column]) return ( - f"{add_temp_column_query} {copy_data_to_temp_query} " - f"{drop_old_column_query} {rename_temp_column_query};" + f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};" ) @staticmethod diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 516b6fed..9f55ae93 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -576,16 +576,14 @@ def construct( pre_hook_df = self.run_pre_hooks(dataframe) - output_df = pre_hook_df - for feature in self.keys + [self.timestamp]: - output_df = feature.transform(output_df) - - output_df = self.incremental_strategy.filter_with_incremental_strategy( - dataframe=output_df, start_date=start_date, end_date=end_date + output_df = reduce( + lambda df, feature: feature.transform(df), + self.keys + [self.timestamp], + pre_hook_df, ) if self._windows and end_date is not None: - # Run aggregations for each window + # run aggregations for each window agg_list = [ self._aggregate( dataframe=output_df, @@ -605,12 +603,13 @@ def construct( # keeping this logic to maintain the same behavior for already implemented # feature sets + if self._windows[0].slide == "1 day": base_df = self._get_base_dataframe( client=client, dataframe=output_df, end_date=end_date ) - # Left join each aggregation result to our base dataframe + # left join each aggregation result to our base dataframe output_df = reduce( lambda left, right: self._dataframe_join( left, @@ -636,21 +635,19 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) - if not output_df.isStreaming and self.deduplicate_rows: output_df = self._filter_duplicated_rows(output_df) post_hook_df = self.run_post_hooks(output_df) - # Eager evaluation, only if needed and managable if not output_df.isStreaming and self.eager_evaluation: - # Small dataframes only - if output_df.count() < 1_000_000: - post_hook_df.cache().count() - else: - post_hook_df.cache() # Cache without materialization for large volumes + post_hook_df.cache().count() return post_hook_df From 40f7766196029dae5d75c863d43457513fe2ca20 Mon Sep 17 00:00:00 2001 From: "joao.albuquerque" Date: Wed, 4 Dec 2024 18:22:44 -0300 Subject: [PATCH 80/86] Revert "fix: performance improvements (#374)" This reverts commit 11cc5d5c006a17f839f52517f4907c1b40f8d20e. 
--- butterfree/_cli/migrate.py | 14 ++------- butterfree/extract/source.py | 14 +++------ butterfree/pipelines/feature_set_pipeline.py | 29 +++++------------ .../transform/aggregated_feature_set.py | 31 ++++++++----------- butterfree/transform/feature_set.py | 7 +++-- .../butterfree/transform/test_feature_set.py | 2 +- 6 files changed, 33 insertions(+), 64 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 207e7daf..f5161509 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -4,7 +4,7 @@ import os import pkgutil import sys -from typing import Set, Type +from typing import Set import boto3 import setuptools @@ -90,18 +90,8 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: instances.add(value) - def create_instance(cls: Type[FeatureSetPipeline]) -> FeatureSetPipeline: - sig = inspect.signature(cls.__init__) - parameters = sig.parameters - - if "run_date" in parameters: - run_date = datetime.datetime.today().strftime("%Y-%m-%d") - return cls(run_date) - - return cls() - logger.info("Creating instances...") - return set(create_instance(value) for value in instances) # type: ignore + return set(value() for value in instances) # type: ignore PATH = typer.Argument( diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 9d50e94c..bfc15271 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -3,7 +3,6 @@ from typing import List, Optional from pyspark.sql import DataFrame -from pyspark.storagelevel import StorageLevel from butterfree.clients import SparkClient from butterfree.extract.readers.reader import Reader @@ -96,21 +95,16 @@ def construct( DataFrame with the query result against all readers. """ - # Step 1: Build temporary views for each reader for reader in self.readers: - reader.build(client=client, start_date=start_date, end_date=end_date) + reader.build( + client=client, start_date=start_date, end_date=end_date + ) # create temporary views for each reader - # Step 2: Execute SQL query on the combined readers dataframe = client.sql(self.query) - # Step 3: Cache the dataframe if necessary, using memory and disk storage if not dataframe.isStreaming and self.eager_evaluation: - # Persist to ensure the DataFrame is stored in mem and disk (if necessary) - dataframe.persist(StorageLevel.MEMORY_AND_DISK) - # Trigger the cache/persist operation by performing an action - dataframe.count() + dataframe.cache().count() - # Step 4: Run post-processing hooks on the dataframe post_hook_df = self.run_post_hooks(dataframe) return post_hook_df diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index d57459f3..8ba1a636 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -2,8 +2,6 @@ from typing import List, Optional -from pyspark.storagelevel import StorageLevel - from butterfree.clients import SparkClient from butterfree.dataframe_service import repartition_sort_df from butterfree.extract import Source @@ -211,25 +209,19 @@ def run( soon. Use only if strictly necessary. """ - # Step 1: Construct input dataframe from the source. dataframe = self.source.construct( client=self.spark_client, start_date=self.feature_set.define_start_date(start_date), end_date=end_date, ) - # Step 2: Repartition and sort if required, avoid if not necessary. 
if partition_by: order_by = order_by or partition_by - current_partitions = dataframe.rdd.getNumPartitions() - optimal_partitions = num_processors or current_partitions - if current_partitions != optimal_partitions: - dataframe = repartition_sort_df( - dataframe, partition_by, order_by, num_processors - ) - - # Step 3: Construct the feature set dataframe using defined transformations. - transformed_dataframe = self.feature_set.construct( + dataframe = repartition_sort_df( + dataframe, partition_by, order_by, num_processors + ) + + dataframe = self.feature_set.construct( dataframe=dataframe, client=self.spark_client, start_date=start_date, @@ -237,20 +229,15 @@ def run( num_processors=num_processors, ) - if dataframe.storageLevel != StorageLevel.NONE: - dataframe.unpersist() # Clear the data from the cache (disk and memory) - - # Step 4: Load the data into the configured sink. self.sink.flush( - dataframe=transformed_dataframe, + dataframe=dataframe, feature_set=self.feature_set, spark_client=self.spark_client, ) - # Step 5: Validate the output if not streaming and data volume is reasonable. - if not transformed_dataframe.isStreaming: + if not dataframe.isStreaming: self.sink.validate( - dataframe=transformed_dataframe, + dataframe=dataframe, feature_set=self.feature_set, spark_client=self.spark_client, ) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 9f55ae93..6706bf8c 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -387,7 +387,6 @@ def _aggregate( ] groupby = self.keys_columns.copy() - if window is not None: dataframe = dataframe.withColumn("window", window.get()) groupby.append("window") @@ -411,23 +410,19 @@ def _aggregate( "keep_rn", functions.row_number().over(partition_window) ).filter("keep_rn = 1") - current_partitions = dataframe.rdd.getNumPartitions() - optimal_partitions = num_processors or current_partitions - - if current_partitions != optimal_partitions: - dataframe = repartition_df( - dataframe, - partition_by=groupby, - num_processors=optimal_partitions, - ) - + # repartition to have all rows for each group at the same partition + # by doing that, we won't have to shuffle data on grouping by id + dataframe = repartition_df( + dataframe, + partition_by=groupby, + num_processors=num_processors, + ) grouped_data = dataframe.groupby(*groupby) - if self._pivot_column and self._pivot_values: + if self._pivot_column: grouped_data = grouped_data.pivot(self._pivot_column, self._pivot_values) aggregated = grouped_data.agg(*aggregations) - return self._with_renamed_columns(aggregated, features, window) def _with_renamed_columns( @@ -642,12 +637,12 @@ def construct( output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) - if not output_df.isStreaming and self.deduplicate_rows: - output_df = self._filter_duplicated_rows(output_df) + if not output_df.isStreaming: + if self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) + if self.eager_evaluation: + output_df.cache().count() post_hook_df = self.run_post_hooks(output_df) - if not output_df.isStreaming and self.eager_evaluation: - post_hook_df.cache().count() - return post_hook_df diff --git a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index 2c4b9b51..369eaf29 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -436,8 +436,11 @@ def construct( pre_hook_df, 
).select(*self.columns) - if not output_df.isStreaming and self.deduplicate_rows: - output_df = self._filter_duplicated_rows(output_df) + if not output_df.isStreaming: + if self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) + if self.eager_evaluation: + output_df.cache().count() output_df = self.incremental_strategy.filter_with_incremental_strategy( dataframe=output_df, start_date=start_date, end_date=end_date diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index 37a69be2..e907dc0a 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -220,7 +220,7 @@ def test_construct( + feature_divide.get_output_columns() ) assert_dataframe_equality(result_df, feature_set_dataframe) - assert not result_df.is_cached + assert result_df.is_cached def test_construct_invalid_df( self, key_id, timestamp_c, feature_add, feature_divide From b7c7d4873db6eb8aec73ce99240b3dd0c7f6e4b3 Mon Sep 17 00:00:00 2001 From: Lucas Cardozo <8867239+lecardozo@users.noreply.github.com> Date: Mon, 6 Jan 2025 16:14:13 +0000 Subject: [PATCH 81/86] fix(MLOP-2519): avoid configuring logger at lib level (#393) ## Why? :open_book: We should let the users handle logging configurations to avoid unwanted side effects. ## What? :wrench: - removing the `butterfree.configs.logger` module - replacing `__logger` references to `logging.getLogger(__name__)` ## Type of change Please delete options that are not relevant. - [x] Bug fix (non-breaking change which fixes an issue) --- butterfree/_cli/migrate.py | 4 ++-- butterfree/clients/cassandra_client.py | 4 ++-- butterfree/configs/logger.py | 24 ------------------- butterfree/load/writers/delta_writer.py | 5 ++-- .../database_migration/database_migration.py | 4 ++-- 5 files changed, 9 insertions(+), 32 deletions(-) delete mode 100644 butterfree/configs/logger.py diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 207e7daf..a718832d 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,6 +1,7 @@ import datetime import importlib import inspect +import logging import os import pkgutil import sys @@ -12,7 +13,6 @@ from botocore.exceptions import ClientError from butterfree.configs import environment -from butterfree.configs.logger import __logger from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline @@ -20,7 +20,7 @@ help="Apply the automatic migrations in a database.", no_args_is_help=True ) -logger = __logger("migrate", True) +logger = logging.getLogger(__name__) def __find_modules(path: str) -> Set[str]: diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 0b300844..c4d34521 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -1,5 +1,6 @@ """CassandraClient entity.""" +import logging from ssl import CERT_REQUIRED, PROTOCOL_TLSv1 from typing import Dict, List, Optional, Union @@ -16,9 +17,8 @@ from typing_extensions import TypedDict from butterfree.clients import AbstractClient -from butterfree.configs.logger import __logger -logger = __logger("cassandra_client") +logger = logging.getLogger(__name__) EMPTY_STRING_HOST_ERROR = "The value of Cassandra host is empty. 
Please fill correctly with your endpoints" # noqa: E501 GENERIC_INVALID_HOST_ERROR = "The Cassandra host must be a valid string, a string that represents a list or list of strings" # noqa: E501 diff --git a/butterfree/configs/logger.py b/butterfree/configs/logger.py deleted file mode 100644 index 60dab67c..00000000 --- a/butterfree/configs/logger.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Logger funcion.""" - -import logging - - -def __config(json_file_logs: bool = False) -> None: - - if json_file_logs: - return logging.basicConfig( - format='{"name": "%(name)s", "timestamp": "%(asctime)-15s", ' - '"level": "%(levelname)s", "message": "%(message)s"}', - level=logging.INFO, - filename="../logging.json", - ) - return logging.basicConfig( - format="%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >", - level=logging.INFO, - ) - - -def __logger(name: str, file_logs: bool = False) -> logging.Logger: - - __config(file_logs) - return logging.getLogger(name) diff --git a/butterfree/load/writers/delta_writer.py b/butterfree/load/writers/delta_writer.py index 933f1adb..45ce1c0c 100644 --- a/butterfree/load/writers/delta_writer.py +++ b/butterfree/load/writers/delta_writer.py @@ -1,10 +1,11 @@ +import logging + from delta.tables import DeltaTable from pyspark.sql.dataframe import DataFrame from butterfree.clients import SparkClient -from butterfree.configs.logger import __logger -logger = __logger("delta_writer", True) +logger = logging.getLogger(__name__) class DeltaWriter: diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 351a4724..aa07fb35 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,16 +1,16 @@ """Migration entity.""" +import logging from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto from typing import Any, Dict, List, Optional, Set from butterfree.clients import AbstractClient -from butterfree.configs.logger import __logger from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet -logger = __logger("database_migrate", True) +logger = logging.getLogger(__name__) @dataclass From b84ac5f53ebe4862a0af5c7f78c6936e2f678d34 Mon Sep 17 00:00:00 2001 From: michellyrds Date: Mon, 6 Jan 2025 15:57:56 -0300 Subject: [PATCH 82/86] pre-release 1.4.6 --- CHANGELOG.md | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe9f9a8a..fc5b6a15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
## [Unreleased] +## [1.4.6](https://github.com/quintoandar/butterfree/releases/tag/1.4.6) +* Fix(MLOP-2519): avoid configuring logger at lib level ([#393](https://github.com/quintoandar/butterfree/pull/393)) +* Fix: Rollback to latest stable release ([#391](https://github.com/quintoandar/butterfree/pull/391)) ## [1.3.5](https://github.com/quintoandar/butterfree/releases/tag/1.3.5) * Auto create feature sets ([#368](https://github.com/quintoandar/butterfree/pull/368)) diff --git a/setup.py b/setup.py index bc4f0b45..d51e1866 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.3.5" +__version__ = "1.4.6" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 8a3f70320069cbb9980f27b4611232ef12feba5a Mon Sep 17 00:00:00 2001 From: michellyrds Date: Mon, 6 Jan 2025 16:35:55 -0300 Subject: [PATCH 83/86] release 1.4.6 --- CHANGELOG.md | 26 ++++++++- docs/source/butterfree.automated.rst | 2 + docs/source/butterfree.configs.rst | 33 ----------- docs/source/butterfree.constants.rst | 55 ------------------- docs/source/butterfree.dataframe_service.rst | 6 ++ docs/source/butterfree.hooks.rst | 4 ++ .../butterfree.hooks.schema_compatibility.rst | 4 ++ docs/source/butterfree.load.writers.rst | 8 +++ ...tterfree.migrations.database_migration.rst | 6 ++ 9 files changed, 53 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc5b6a15..51cbfdfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,30 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
-## [Unreleased] ## [1.4.6](https://github.com/quintoandar/butterfree/releases/tag/1.4.6) -* Fix(MLOP-2519): avoid configuring logger at lib level ([#393](https://github.com/quintoandar/butterfree/pull/393)) -* Fix: Rollback to latest stable release ([#391](https://github.com/quintoandar/butterfree/pull/391)) + +### Fixed +* [MLOP-2519] avoid configuring logger at lib level ([#393](https://github.com/quintoandar/butterfree/pull/393)) +* Rollback to latest stable release ([#391](https://github.com/quintoandar/butterfree/pull/391)) + +## [1.4.5](https://github.com/quintoandar/butterfree/releases/tag/1.4.5) +* Rollback repartitions ([#386](https://github.com/quintoandar/butterfree/pull/386)) +* Add protection to host setting on cassandra_client ([#385](https://github.com/quintoandar/butterfree/pull/385)) + +## [1.4.4](https://github.com/quintoandar/butterfree/releases/tag/1.4.4) +* Fix Storage Level ([#382](https://github.com/quintoandar/butterfree/pull/382)) + +## [1.4.3](https://github.com/quintoandar/butterfree/releases/tag/1.4.3) +* Performance upgrade ([#378](https://github.com/quintoandar/butterfree/pull/378)) + +## [1.4.2](https://github.com/quintoandar/butterfree/releases/tag/1.4.2) +* Minor fixes ([#376](https://github.com/quintoandar/butterfree/pull/376)) + +## [1.4.1](https://github.com/quintoandar/butterfree/releases/tag/1.4.1) +* Performance Improvements ([#374](https://github.com/quintoandar/butterfree/pull/374)) + +## [1.4.0](https://github.com/quintoandar/butterfree/releases/tag/1.4.0) +* Add Delta support ([#370](https://github.com/quintoandar/butterfree/pull/370)) ## [1.3.5](https://github.com/quintoandar/butterfree/releases/tag/1.3.5) * Auto create feature sets ([#368](https://github.com/quintoandar/butterfree/pull/368)) diff --git a/docs/source/butterfree.automated.rst b/docs/source/butterfree.automated.rst index de290d9c..9c01ac54 100644 --- a/docs/source/butterfree.automated.rst +++ b/docs/source/butterfree.automated.rst @@ -4,6 +4,8 @@ butterfree.automated package Submodules ---------- +butterfree.automated.feature\_set\_creation module +-------------------------------------------------- .. automodule:: butterfree.automated.feature_set_creation :members: diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index 18a82795..81821ae7 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -20,39 +20,6 @@ butterfree.configs.environment module :undoc-members: :show-inheritance: -butterfree.configs.logger module --------------------------------- - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.constants.rst b/docs/source/butterfree.constants.rst index e90b195e..e5727fd1 100644 --- a/docs/source/butterfree.constants.rst +++ b/docs/source/butterfree.constants.rst @@ -31,36 +31,6 @@ butterfree.constants.migrations module butterfree.constants.spark\_constants module -------------------------------------------- -.. 
automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - .. automodule:: butterfree.constants.spark_constants :members: :undoc-members: @@ -69,31 +39,6 @@ butterfree.constants.spark\_constants module butterfree.constants.window\_definitions module ----------------------------------------------- -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - .. automodule:: butterfree.constants.window_definitions :members: :undoc-members: diff --git a/docs/source/butterfree.dataframe_service.rst b/docs/source/butterfree.dataframe_service.rst index 4343305b..ae9658a5 100644 --- a/docs/source/butterfree.dataframe_service.rst +++ b/docs/source/butterfree.dataframe_service.rst @@ -4,18 +4,24 @@ butterfree.dataframe\_service package Submodules ---------- +butterfree.dataframe\_service.incremental\_strategy module +---------------------------------------------------------- .. automodule:: butterfree.dataframe_service.incremental_strategy :members: :undoc-members: :show-inheritance: +butterfree.dataframe\_service.partitioning module +------------------------------------------------- .. automodule:: butterfree.dataframe_service.partitioning :members: :undoc-members: :show-inheritance: +butterfree.dataframe\_service.repartition module +------------------------------------------------ .. automodule:: butterfree.dataframe_service.repartition :members: diff --git a/docs/source/butterfree.hooks.rst b/docs/source/butterfree.hooks.rst index 72f13223..c633cade 100644 --- a/docs/source/butterfree.hooks.rst +++ b/docs/source/butterfree.hooks.rst @@ -12,12 +12,16 @@ Subpackages Submodules ---------- +butterfree.hooks.hook module +---------------------------- .. automodule:: butterfree.hooks.hook :members: :undoc-members: :show-inheritance: +butterfree.hooks.hookable\_component module +------------------------------------------- .. automodule:: butterfree.hooks.hookable_component :members: diff --git a/docs/source/butterfree.hooks.schema_compatibility.rst b/docs/source/butterfree.hooks.schema_compatibility.rst index a39c5b93..2d3de66c 100644 --- a/docs/source/butterfree.hooks.schema_compatibility.rst +++ b/docs/source/butterfree.hooks.schema_compatibility.rst @@ -4,12 +4,16 @@ butterfree.hooks.schema\_compatibility package Submodules ---------- +butterfree.hooks.schema\_compatibility.cassandra\_table\_schema\_compatibility\_hook module +------------------------------------------------------------------------------------------- .. 
automodule:: butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook :members: :undoc-members: :show-inheritance: +butterfree.hooks.schema\_compatibility.spark\_table\_schema\_compatibility\_hook module +--------------------------------------------------------------------------------------- .. automodule:: butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook :members: diff --git a/docs/source/butterfree.load.writers.rst b/docs/source/butterfree.load.writers.rst index 2a173c9a..b20eb85e 100644 --- a/docs/source/butterfree.load.writers.rst +++ b/docs/source/butterfree.load.writers.rst @@ -4,6 +4,14 @@ butterfree.load.writers package Submodules ---------- +butterfree.load.writers.delta\_writer module +-------------------------------------------- + +.. automodule:: butterfree.load.writers.delta_writer + :members: + :undoc-members: + :show-inheritance: + butterfree.load.writers.historical\_feature\_store\_writer module ----------------------------------------------------------------- diff --git a/docs/source/butterfree.migrations.database_migration.rst b/docs/source/butterfree.migrations.database_migration.rst index 892165df..32ba4d4d 100644 --- a/docs/source/butterfree.migrations.database_migration.rst +++ b/docs/source/butterfree.migrations.database_migration.rst @@ -4,18 +4,24 @@ butterfree.migrations.database\_migration package Submodules ---------- +butterfree.migrations.database\_migration.cassandra\_migration module +--------------------------------------------------------------------- .. automodule:: butterfree.migrations.database_migration.cassandra_migration :members: :undoc-members: :show-inheritance: +butterfree.migrations.database\_migration.database\_migration module +-------------------------------------------------------------------- .. automodule:: butterfree.migrations.database_migration.database_migration :members: :undoc-members: :show-inheritance: +butterfree.migrations.database\_migration.metastore\_migration module +--------------------------------------------------------------------- .. automodule:: butterfree.migrations.database_migration.metastore_migration :members: From 6b1b6e7566db8d5b5b738c9209a2bd1c90c5f908 Mon Sep 17 00:00:00 2001 From: michellyrds Date: Mon, 6 Jan 2025 16:54:21 -0300 Subject: [PATCH 84/86] chore: remove unnecessary files --- .checklist.yaml | 30 ------------------------------ .github/workflows/skip_lint.yml | 17 ----------------- logging.json | 0 3 files changed, 47 deletions(-) delete mode 100644 .checklist.yaml delete mode 100644 .github/workflows/skip_lint.yml delete mode 100644 logging.json diff --git a/.checklist.yaml b/.checklist.yaml deleted file mode 100644 index f0c21171..00000000 --- a/.checklist.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: quintoandar.com.br/checklist/v2 -kind: ServiceChecklist -metadata: - name: butterfree -spec: - description: >- - A solution for Feature Stores. - - costCenter: C055 - department: engineering - lifecycle: production - docs: true - - ownership: - team: data_products_mlops - line: tech_platform - owner: otavio.cals@quintoandar.com.br - - libraries: - - name: butterfree - type: common-usage - path: https://quintoandar.github.io/python-package-server/ - description: A lib to build Feature Stores. 
- registries: - - github-packages - tier: T0 - - channels: - squad: 'mlops' - alerts: 'data-products-reports' diff --git a/.github/workflows/skip_lint.yml b/.github/workflows/skip_lint.yml deleted file mode 100644 index 1c768a23..00000000 --- a/.github/workflows/skip_lint.yml +++ /dev/null @@ -1,17 +0,0 @@ -# This step is used only because we want to mark the runner-linter check as required -# for PRs to develop, but not for the merge queue to merge into develop, -# github does not have this functionality yet - -name: 'Skip github-actions/runner-linter check at merge queue' - -on: - merge_group: - -jobs: - empty_job: - name: 'github-actions/runner-linter' - runs-on: github-actions-developers-runner - steps: - - name: Skip github-actions/runner-linter check at merge queue - run: | - echo "Done" diff --git a/logging.json b/logging.json deleted file mode 100644 index e69de29b..00000000 From 38509bed1850ca8dc16d5db24ecf5fd75fe678a8 Mon Sep 17 00:00:00 2001 From: michellyrds Date: Mon, 6 Jan 2025 17:03:26 -0300 Subject: [PATCH 85/86] docs: make update-docs --- butterfree/pipelines/feature_set_pipeline.py | 2 +- docs/source/butterfree.configs.rst | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index 89364394..8ba1a636 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -267,4 +267,4 @@ def run_for_date( partition_by=partition_by, order_by=order_by, num_processors=num_processors, - ) \ No newline at end of file + ) diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index f4bda101..20432e45 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -12,8 +12,6 @@ Subpackages Submodules ---------- -butterfree.configs.environment module -------------------------------------- butterfree.configs.environment module ------------------------------------- @@ -22,6 +20,14 @@ butterfree.configs.environment module :undoc-members: :show-inheritance: +butterfree.configs.logger module +-------------------------------- + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- From 9b2b65df174b22b7ee0843facc2268aa3390259d Mon Sep 17 00:00:00 2001 From: michellyrds Date: Mon, 6 Jan 2025 17:17:14 -0300 Subject: [PATCH 86/86] fix: merge conflict with staging --- butterfree/extract/source.py | 3 --- butterfree/transform/aggregated_feature_set.py | 5 ++--- tests/unit/butterfree/transform/test_feature_set.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 8605c312..bfc15271 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -3,7 +3,6 @@ from typing import List, Optional from pyspark.sql import DataFrame -from pyspark.storagelevel import StorageLevel from butterfree.clients import SparkClient from butterfree.extract.readers.reader import Reader @@ -96,13 +95,11 @@ def construct( DataFrame with the query result against all readers. 
""" - # Step 1: Build temporary views for each reader for reader in self.readers: reader.build( client=client, start_date=start_date, end_date=end_date ) # create temporary views for each reader - # Step 2: Execute SQL query on the combined readers dataframe = client.sql(self.query) if not dataframe.isStreaming and self.eager_evaluation: diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 2b3ece98..6706bf8c 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -387,7 +387,6 @@ def _aggregate( ] groupby = self.keys_columns.copy() - if window is not None: dataframe = dataframe.withColumn("window", window.get()) groupby.append("window") @@ -420,11 +419,10 @@ def _aggregate( ) grouped_data = dataframe.groupby(*groupby) - if self._pivot_column and self._pivot_values: + if self._pivot_column: grouped_data = grouped_data.pivot(self._pivot_column, self._pivot_values) aggregated = grouped_data.agg(*aggregations) - return self._with_renamed_columns(aggregated, features, window) def _with_renamed_columns( @@ -600,6 +598,7 @@ def construct( # keeping this logic to maintain the same behavior for already implemented # feature sets + if self._windows[0].slide == "1 day": base_df = self._get_base_dataframe( client=client, dataframe=output_df, end_date=end_date diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index 37a69be2..e907dc0a 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -220,7 +220,7 @@ def test_construct( + feature_divide.get_output_columns() ) assert_dataframe_equality(result_df, feature_set_dataframe) - assert not result_df.is_cached + assert result_df.is_cached def test_construct_invalid_df( self, key_id, timestamp_c, feature_add, feature_divide