diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..2a5a1e5ec --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,32 @@ +This PR adds/fixes/changes... +- please summarize your changes to the code +- and make sure to include all changes to user-facing APIs + +JIRA Issue: - + +Priority for Review: - + +Related PRs: - + +**How Has This Been Tested?** + +- [ ] Unit Tests +- [ ] Integration Tests +- [ ] Manual Tests on VM + + +**Checklist For The Assigned Reviewer:** + +``` +- [ ] Checked if merge conflicts with master exist +- [ ] Checked if stylechecks for Java and Python pass +- [ ] Checked if all docstrings were added and/or updated appropriately +- [ ] Ran spellcheck on docstring +- [ ] Checked if guides & concepts need to be updated +- [ ] Checked if naming conventions for parameters and variables were followed +- [ ] Checked if private methods are properly declared and used +- [ ] Checked if hard-to-understand areas of code are commented +- [ ] Checked if tests are effective +- [ ] Built and deployed changes on dev VM and tested manually +- [x] (Checked if all type annotations were added and/or updated appropriately) +``` diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml new file mode 100644 index 000000000..616a46773 --- /dev/null +++ b/.github/workflows/java.yml @@ -0,0 +1,62 @@ +name: java + +on: pull_request + +jobs: + unit_tests: + name: Unit Tests + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: "8" + distribution: "adopt" + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Test + working-directory: ./java + run: mvn clean test + + unit_tests_local_tz: + name: Unit Tests (Local TZ) + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone Europe/Amsterdam + + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: "8" + distribution: "adopt" + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Test + working-directory: ./java + run: mvn clean test diff --git a/.github/workflows/mkdocs-main.yml b/.github/workflows/mkdocs-main.yml index 001f1fad1..e8e14b4ea 100644 --- a/.github/workflows/mkdocs-main.yml +++ b/.github/workflows/mkdocs-main.yml @@ -24,7 +24,24 @@ jobs: run: cp ../README.md . 
&& pip3 install -r ../requirements-docs.txt && pip3 install -e .[dev] - name: generate autodoc - run: python3 auto_doc.py + run: python3 ./python/auto_doc.py + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: "8" + distribution: "adopt" + + - name: Build java doc documentation + working-directory: ./java + run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../docs/javadoc - name: setup git run: | diff --git a/.github/workflows/mkdocs-release.yml b/.github/workflows/mkdocs-release.yml index e2b4b2b3f..f1c6bb814 100644 --- a/.github/workflows/mkdocs-release.yml +++ b/.github/workflows/mkdocs-release.yml @@ -2,7 +2,7 @@ name: mkdocs-release on: push: - branches: [branch-*\.*] + branches: [branch-*] jobs: publish-release: @@ -29,7 +29,25 @@ jobs: run: cp ../README.md . && pip3 install -r ../requirements-docs.txt && pip3 install -e .[dev] - name: generate autodoc - run: python3 auto_doc.py + run: python3 ./python/auto_doc.py + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: "8" + distribution: "adopt" + + - name: Build java doc documentation + working-directory: ./java + run: + mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../docs/javadoc - name: setup git run: | diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index 156847faf..000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: python - -on: pull_request - -env: - APP_API_KEY: ${{ secrets.APP_API_KEY }} - -jobs: - lint_stylecheck: - name: Lint and Stylecheck - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - - name: Get all changed files - id: get-changed-files - uses: tj-actions/changed-files@v44 - with: - files_yaml: | - src: - - 'python/**/*.py' - - '!python/tests/**/*.py' - test: - - 'python/tests/**/*.py' - - - name: install deps - run: pip install ruff==0.4.2 - - - name: ruff on python files - if: steps.get-changed-files.outputs.src_any_changed == 'true' - env: - SRC_ALL_CHANGED_FILES: ${{ steps.get-changed-files.outputs.src_all_changed_files }} - run: ruff check --output-format=github $SRC_ALL_CHANGED_FILES - - - name: ruff on test files - if: steps.get-changed-files.outputs.test_any_changed == 'true' - env: - TEST_ALL_CHANGED_FILES: ${{ steps.get-changed-files.outputs.test_all_changed_files }} - run: ruff check --output-format=github $TEST_ALL_CHANGED_FILES - - - name: ruff format --check $ALL_CHANGED_FILES - env: - ALL_CHANGED_FILES: ${{ steps.get-changed-files.outputs.all_changed_files }} - run: ruff format $ALL_CHANGED_FILES \ No newline at end of file diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 000000000..0c5f12c32 --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,242 @@ +name: python + +on: pull_request + +env: + APP_API_KEY: ${{ secrets.APP_API_KEY }} + ENABLE_HOPSWORKS_USAGE: "false" + +jobs: + lint_stylecheck: + name: Lint and 
Stylecheck + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Get all changed files + id: get-changed-files + uses: tj-actions/changed-files@v44 + with: + files_yaml: | + src: + - 'python/**/*.py' + - '!python/tests/**/*.py' + test: + - 'python/tests/**/*.py' + + - name: install deps + run: pip install ruff==0.4.2 + + - name: ruff on python files + if: steps.get-changed-files.outputs.src_any_changed == 'true' + env: + SRC_ALL_CHANGED_FILES: + ${{ steps.get-changed-files.outputs.src_all_changed_files }} + run: ruff check --output-format=github $SRC_ALL_CHANGED_FILES + + - name: ruff on test files + if: steps.get-changed-files.outputs.test_any_changed == 'true' + env: + TEST_ALL_CHANGED_FILES: + ${{ steps.get-changed-files.outputs.test_all_changed_files }} + run: ruff check --output-format=github $TEST_ALL_CHANGED_FILES + + - name: ruff format --check $ALL_CHANGED_FILES + env: + ALL_CHANGED_FILES: + ${{ steps.get-changed-files.outputs.all_changed_files }} + run: ruff format $ALL_CHANGED_FILES + + unit_tests: + name: Unit Tests + needs: lint_stylecheck + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev] + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + run: pytest python/tests + + unit_tests_no_opt: + name: Unit Tests (No Optional Dependencies) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev-no-opt] + + - name: Run Pytest suite + run: pytest python/tests + + unit_tests_pandas1: + name: Unit Tests (Pandas 1.x) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v4 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev-pandas1] + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + run: pytest python/tests + + unit_tests_local_tz: + name: Unit Tests (Local TZ) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone Europe/Amsterdam + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e 'python[python,dev]' + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + run: pytest python/tests + + unit_tests_typechecked: + name: Unit Tests (Typechecked) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - 
uses: actions/checkout@v3 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v4 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev,docs] + + - name: Run Pytest suite + env: + HOPSWORKS_RUN_WITH_TYPECHECK: "true" + run: pytest python/tests + continue-on-error: true + + unit_tests_windows: + name: Unit Tests (Windows) + needs: lint_stylecheck + runs-on: windows-latest + + steps: + - name: Set Timezone + run: tzutil /s "UTC" + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev] + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + run: pytest python/tests + + unit_tests_windows_local_tz: + name: Unit Tests (Windows) (Local TZ) + needs: lint_stylecheck + runs-on: windows-latest + + steps: + - name: Set Timezone + run: tzutil /s "W. Europe Standard Time" + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev] + + - name: Display Python version + run: python --version + + - name: Display pip freeze + run: pip freeze + + - name: Run Pytest suite + run: pytest python/tests diff --git a/.gitignore b/.gitignore index 6e96d8144..1581db87d 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,8 @@ coverage.xml .hypothesis/ .pytest_cache/ .ruff_cache/ +bigquery.json +metastore_db/ # Translations *.mo @@ -71,6 +73,9 @@ instance/ # Sphinx documentation docs/_build/ +# Mike Javadoc +docs/javadoc + # PyBuilder target/ @@ -128,3 +133,9 @@ target/ # mkdocs intemediate files docs/generated + +docs/CONTRIBUTING.md +docs/index.md + +# Test artifacts +keyFile.json diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 564734d53..e2801b11b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,16 +1,17 @@ ## Python development setup + --- - Fork and clone the repository -- Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda +- Create a new Python environment with your favourite environment manager (e.g. virtualenv or conda) and Python 3.9 (newer versions will return a library conflict in `auto_doc.py`) - Install repository in editable mode with development dependencies: - ```bash - cd python - pip install -e ".[dev]" - ``` + ```bash + cd python + pip install -e ".[dev]" + ``` - Install [pre-commit](https://pre-commit.com/) and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. The library uses pre-commit to ensure code-style and code formatting through [ruff](https://docs.astral.sh/ruff/). Run the following commands from the `python` directory: @@ -37,75 +38,67 @@ We follow a few best practices for writing the Python documentation: 1. Use the google docstring style: - ```python - """[One Line Summary] + ```python + """[One Line Summary] - [Extended Summary] + [Extended Summary] - [!!! example - import xyz - ] + [!!! example + import xyz + ] - # Arguments - arg1: Type[, optional]. Description[, defaults to `default`] - arg2: Type[, optional]. 
Description[, defaults to `default`] + # Arguments + arg1: Type[, optional]. Description[, defaults to `default`] + arg2: Type[, optional]. Description[, defaults to `default`] - # Returns - Type. Description. + # Returns + Type. Description. - # Raises - Exception. Description. - """ - ``` - - If Python 3 type annotations are used, they are inserted automatically. + # Raises + Exception. Description. + """ + ``` + If Python 3 type annotations are used, they are inserted automatically. 2. Hopsworks entity engine methods (e.g. ExecutionEngine etc.) only require a single line docstring. -3. Private REST Api implementations (e.g. GitRemoteApi etc.) should be fully documented with docstrings without defaults. -4. Public Api such as metadata objects and public REST Api implementations should be fully documented with defaults. +3. Private REST API implementations (e.g. FeatureGroupApi etc.) should be fully documented with docstrings without defaults. +4. Public API such as metadata objects and public REST API implementations should be fully documented with defaults. #### Setup and Build Documentation We use `mkdocs` together with `mike` ([for versioning](https://github.com/jimporter/mike/)) to build the documentation and a plugin called `keras-autodoc` to auto generate Python API documentation from docstrings. **Background about `mike`:** - `mike` builds the documentation and commits it as a new directory to the gh-pages branch. Each directory corresponds to one version of the documentation. Additionally, `mike` maintains a json in the root of gh-pages with the mappings of versions/aliases for each of the directories available. With aliases you can define extra names like `dev` or `latest`, to indicate stable and unstable releases. +`mike` builds the documentation and commits it as a new directory to the gh-pages branch. Each directory corresponds to one version of the documentation. Additionally, `mike` maintains a json in the root of gh-pages with the mappings of versions/aliases for each of the directories available. With aliases you can define extra names like `dev` or `latest`, to indicate stable and unstable releases. -1. Currently we are using our own version of `keras-autodoc` +1. Install Hopsworks with `dev-docs` extras: - ```bash - pip install git+https://github.com/logicalclocks/keras-autodoc - ``` + ```bash + pip install -e ".[dev-docs]" + ``` -2. Install HOPSWORKS with `docs` extras: - - ```bash - pip install -e .[dev,docs] - ``` +2. To build the docs, first run the auto doc script: -3. To build the docs, first run the auto doc script: - - ```bash - cd .. - python auto_doc.py - ``` + ```bash + python auto_doc.py + ``` ##### Option 1: Build only current version of docs -4. Either build the docs, or serve them dynamically: +3. Either build the docs, or serve them dynamically: - Note: Links and pictures might not resolve properly later on when checking with this build. - The reason for that is that the docs are deployed with versioning on docs.hopsworks.ai and - therefore another level is added to all paths, e.g. `docs.hopsworks.ai/[version-or-alias]`. - Using relative links should not be affected by this, however, building the docs with version - (Option 2) is recommended. + Note: Links and pictures might not resolve properly later on when checking with this build. + The reason for that is that the docs are deployed with versioning on docs.hopsworks.ai and + therefore another level is added to all paths, e.g. `docs.hopsworks.ai/[version-or-alias]`. 
+ Using relative links should not be affected by this, however, building the docs with version + (Option 2) is recommended. - ```bash - mkdocs build - # or - mkdocs serve - ``` + ```bash + mkdocs build + # or + mkdocs serve + ``` ##### Option 2 (Preferred): Build multi-version doc with `mike` @@ -113,34 +106,36 @@ We use `mkdocs` together with `mike` ([for versioning](https://github.com/jimpor On docs.hopsworks.ai we implement the following versioning scheme: -- current master branches (e.g. of hopsworks corresponding to master of Hopsworks): rendered as current Hopsworks snapshot version, e.g. **3.1.0-SNAPSHOT [dev]**, where `dev` is an alias to indicate that this is an unstable version. -- the latest release: rendered with full current version, e.g. **3.0.1 [latest]** with `latest` alias to indicate that this is the latest stable release. -- previous stable releases: rendered without alias, e.g. **3.0.0**. +- current master branches (e.g. of hopsworks corresponding to master of Hopsworks): rendered as current Hopsworks snapshot version, e.g. **4.0.0-SNAPSHOT [dev]**, where `dev` is an alias to indicate that this is an unstable version. +- the latest release: rendered with full current version, e.g. **3.8.0 [latest]** with `latest` alias to indicate that this is the latest stable release. +- previous stable releases: rendered without alias, e.g. **3.4.4**. ###### Build Instructions -4. For this you can either checkout and make a local copy of the `upstream/gh-pages` branch, where -`mike` maintains the current state of docs.hopsworks.ai, or just build documentation for the branch you are updating: +4. For this you can either checkout and make a local copy of the `upstream/gh-pages` branch, where `mike` maintains the current state of docs.hopsworks.ai, or just build documentation for the branch you are updating: Building *one* branch: Checkout your dev branch with modified docs: + ```bash git checkout [dev-branch] ``` Generate API docs if necessary: + ```bash python auto_doc.py ``` Build docs with a version and alias + ```bash mike deploy [version] [alias] --update-alias # for example, if you are updating documentation to be merged to master, # which will become the new SNAPSHOT version: - mike deploy 3.1.0-SNAPSHOT dev --update-alias + mike deploy 4.0.0-SNAPSHOT dev --update-alias # if you are updating docs of the latest stable release branch mike deploy [version] latest --update-alias @@ -158,17 +153,20 @@ On docs.hopsworks.ai we implement the following versioning scheme: ``` You can now checkout the gh-pages branch and serve: + ```bash git checkout gh-pages mike serve ``` You can also list all available versions/aliases: + ```bash mike list ``` Delete and reset your local gh-pages branch: + ```bash mike delete --all @@ -194,7 +192,7 @@ PAGES = { Now you can add a template markdown file to the `docs/templates` directory with the name you specified in the auto-doc script. The `new_template.md` file should contain a tag to identify the place at which the API documentation should be inserted: -``` +```` ## The XYZ package {{module}} @@ -207,7 +205,7 @@ Some extra content here. ``` {{xyz.asd}} -``` +```` Finally, run the `auto_doc.py` script, as decribed above, to update the documentation. 
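For context, the content that `keras-autodoc` inserts at a `{{tag}}` is rendered from the docstring of the referenced object, written in the google style described earlier in this guide. A minimal illustrative sketch is shown below; the method body, its argument, and the `RestAPIError` name are placeholders rather than an exact Hopsworks signature, but the `get_jobs_api`/`get_job` paths mirror the entries already used in `auto_doc.py`:

````python
def get_job(self, name: str):
    """Get a job from the project.

    !!! example
        ```python
        import hopsworks

        project = hopsworks.login()
        jobs_api = project.get_jobs_api()
        job = jobs_api.get_job("my_job")
        ```

    # Arguments
        name: str. Name of the job.

    # Returns
        `Job`. The job metadata object.

    # Raises
        `RestAPIError`: If unable to fetch the job from the backend.
    """
    ...
````

Referencing `hopsworks.core.job_api.JobsApi.get_job` from a PAGES entry and adding the matching `{{job_get}}` tag to the template is then enough for `auto_doc.py` to render this docstring into the generated page.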
diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..38d9025c5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:22.04 + +RUN apt-get update && \ + apt-get install -y python3-pip git && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN pip3 install twine build virtualenv \ + mkdocs==1.5.3 \ + mkdocs-material==9.5.17 \ + mike==2.0.0 \ + git+https://github.com/logicalclocks/keras-autodoc + +RUN mkdir -p /.local && chmod -R 777 /.local diff --git a/README.md b/README.md index 162c95f97..e523c059d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,10 @@ src="https://img.shields.io/pypi/v/hopsworks?color=blue" alt="PyPiStatus" /> + Scala/Java Artifacts Downloads *hopsworks* is the python API for interacting with a Hopsworks cluster. Don't have a Hopsworks cluster just yet? Register an account on [Hopsworks Serverless](https://app.hopsworks.ai/) and get started for free. Once connected to your project, you can: - - Insert dataframes into the online or offline Store, create training datasets or *serve real-time* feature vectors in the Feature Store via the [Feature Store API](https://github.com/logicalclocks/feature-store-api). Already have data somewhere you want to import, checkout our [Storage Connectors](https://docs.hopsworks.ai/latest/user_guides/fs/storage_connector/) documentation. - - register ML models in the model registry and *deploy* them via model serving via the [Machine Learning API](https://gitub.com/logicalclocks/machine-learning-api). - - manage environments, executions, kafka topics and more once you deploy your own Hopsworks cluster, either on-prem or in the cloud. Hopsworks is open-source and has its own [Community Edition](https://github.com/logicalclocks/hopsworks). + +- Insert dataframes into the online or offline Store, create training datasets or *serve real-time* feature vectors in the Feature Store via the Feature Store API. Already have data somewhere you want to import? Check out our [Storage Connectors](https://docs.hopsworks.ai/latest/user_guides/fs/storage_connector/) documentation. +- register ML models in the model registry and *deploy* them via model serving via the Machine Learning API. +- manage environments, executions, kafka topics and more once you deploy your own Hopsworks cluster, either on-prem or in the cloud. Hopsworks is open-source and has its own [Community Edition](https://github.com/logicalclocks/hopsworks). Our [tutorials](https://github.com/logicalclocks/hopsworks-tutorials) cover a wide range of use cases and examples of what *you* can build using Hopsworks. @@ -43,16 +48,19 @@ Our [tutorials](https://github.com/logicalclocks/hopsworks-tutorials) cover a wi Once you created a project on [Hopsworks Serverless](https://app.hopsworks.ai) and created a new [Api Key](https://docs.hopsworks.ai/latest/user_guides/projects/api_key/create_api_key/), just use your favourite virtualenv and package manager to install the library: ```bash -pip install hopsworks +pip install "hopsworks[python]" ``` Fire up a notebook and connect to your project, you will be prompted to enter your newly created API key: + ```python import hopsworks project = hopsworks.login() ``` +### Feature Store API + Access the Feature Store of your project to use as a central repository for your feature data. Use *your* favourite data engineering library (pandas, polars, Spark, etc...) to insert data into the Feature Store, create training datasets or serve real-time feature vectors. Want to predict likelihood of e-scooter accidents in real-time? 
Here's how you can do it: ```python @@ -60,9 +68,9 @@ fs = project.get_feature_store() # Write to Feature Groups bike_ride_fg = fs.get_or_create_feature_group( - name="bike_rides", - version=1, - primary_key=["ride_id"], + name="bike_rides", + version=1, + primary_key=["ride_id"], event_time="activation_time", online_enabled=True, ) @@ -73,13 +81,13 @@ fg.insert(bike_rides_df) profile_fg = fs.get_feature_group("user_profile", version=1) bike_ride_fv = fs.get_or_create_feature_view( - name="bike_rides_view", - version=1, + name="bike_rides_view", + version=1, query=bike_ride_fg.select_except(["ride_id"]).join(profile_fg.select(["age", "has_license"]), on="user_id") ) bike_rides_Q1_2021_df = bike_ride_fv.get_batch_data( - start_date="2021-01-01", + start_date="2021-01-01", end_date="2021-01-31" ) @@ -97,22 +105,68 @@ bike_ride_fv.init_serving() while True: new_ride_vector = poll_ride_queue() feature_vector = bike_ride_fv.get_online_feature_vector( - {"user_id": new_ride_vector["user_id"]}, + {"user_id": new_ride_vector["user_id"]}, passed_features=new_ride_vector ) accident_probability = model.predict(feature_vector) ``` -Or you can use the Machine Learning API to register models and deploy them for serving: +The API enables interaction with the Hopsworks Feature Store. It makes creating new features, feature groups and training datasets easy. + +The API is environment independent and can be used in two modes: + +- Spark mode: For data engineering jobs that create and write features into the feature store or generate training datasets. It requires a Spark environment such as the one provided in the Hopsworks platform or Databricks. In Spark mode, HSFS provides bindings both for Python and JVM languages. + +- Python mode: For data science jobs to explore the features available in the feature store, generate training datasets and feed them in a training pipeline. Python mode requires just a Python interpreter and can be used both in Hopsworks from Python Jobs/Jupyter Kernels, Amazon SageMaker or KubeFlow. + +Scala API is also available, here is a short sample of it: + +```scala +import com.logicalclocks.hsfs._ +val connection = HopsworksConnection.builder().build() +val fs = connection.getFeatureStore(); +val attendances_features_fg = fs.getFeatureGroup("games_features", 1); +attendances_features_fg.show(1) +``` + +### Machine Learning API + +Or you can use the Machine Learning API to interact with the Hopsworks Model Registry and Model Serving. The API makes it easy to export, manage and deploy models. For example, to register models and deploy them for serving you can do: + ```python mr = project.get_model_registry() # or -ms = project.get_model_serving() +ms = connection.get_model_serving() + +# Create a new model: +model = mr.tensorflow.create_model(name="mnist", + version=1, + metrics={"accuracy": 0.94}, + description="mnist model description") +model.save("/tmp/model_directory") # or /tmp/model_file + +# Download a model: +model = mr.get_model("mnist", version=1) +model_path = model.download() + +# Delete the model: +model.delete() + +# Get the best-performing model +best_model = mr.get_best_model('mnist', 'accuracy', 'max') + +# Deploy the model: +deployment = model.deploy() +deployment.start() + +# Make predictions with a deployed model +data = { "instances": [ model.input_example ] } +predictions = deployment.predict(data) ``` ## Tutorials -Need more inspiration or want to learn more about the Hopsworks platform? 
Check out our [tutorials](https://github.com/logicalclocks/hopsworks-tutorials). +Need more inspiration or want to learn more about the Hopsworks platform? Check out our [tutorials](https://github.com/logicalclocks/hopsworks-tutorials). ## Documentation @@ -124,7 +178,17 @@ For general questions about the usage of Hopsworks and the Feature Store please Please report any issue using [Github issue tracking](https://github.com/logicalclocks/hopsworks-api/issues). +### Related to Feautre Store API + +Please attach the client environment from the output below to your issue, if it is related to Feature Store API: + +```python +import hopsworks +import hsfs +hopsworks.login().get_feature_store() +print(hsfs.get_env()) +``` + ## Contributing If you would like to contribute to this library, please see the [Contribution Guidelines](CONTRIBUTING.md). - diff --git a/auto_doc.py b/auto_doc.py deleted file mode 100644 index 1fd5b40f8..000000000 --- a/auto_doc.py +++ /dev/null @@ -1,215 +0,0 @@ -# -# Copyright 2022 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import os -import pathlib -import shutil - -import keras_autodoc - -PAGES = { - "api/login.md": { - "login": ["hopsworks.login"], - "get_current_project": ["hopsworks.get_current_project"], - "fs_api": ["hopsworks.project.Project.get_feature_store"], - "mr_api": ["hopsworks.project.Project.get_model_registry"], - "ms_api": ["hopsworks.project.Project.get_model_serving"], - }, - "api/udf.md": { - "udf": ["hopsworks.udf"], - }, - "api/connection.md": { - "connection_create": ["hopsworks.connection.Connection.connection"], - "connection_properties": keras_autodoc.get_properties( - "hopsworks.connection.Connection" - ), - "connection_methods": keras_autodoc.get_methods( - "hopsworks.connection.Connection", exclude=["from_response_json", "json"] - ), - }, - "api/projects.md": { - "project_create": ["hopsworks.create_project"], - "project_properties": keras_autodoc.get_properties("hopsworks.project.Project"), - "project_methods": keras_autodoc.get_methods( - "hopsworks.project.Project", exclude=["from_response_json", "json"] - ), - }, - "api/jobs.md": { - "job_api_handle": ["hopsworks.project.Project.get_jobs_api"], - "job_create": ["hopsworks.core.job_api.JobsApi.create_job"], - "job_get": ["hopsworks.core.job_api.JobsApi.get_job"], - "job_get_all": ["hopsworks.core.job_api.JobsApi.get_jobs"], - "job_properties": keras_autodoc.get_properties("hopsworks.job.Job"), - "job_config": ["hopsworks.core.job_api.JobsApi.get_configuration"], - "job_methods": keras_autodoc.get_methods( - "hopsworks.job.Job", exclude=["from_response_json", "json"] - ), - }, - "api/executions.md": { - "execution_create": ["hopsworks.job.Job.run"], - "execution_get": ["hopsworks.job.Job.get_executions"], - "execution_properties": keras_autodoc.get_properties( - "hopsworks.execution.Execution" - ), - "execution_methods": keras_autodoc.get_methods( - "hopsworks.execution.Execution", - exclude=["from_response_json", "json", "update_from_response_json"], - ), - }, - 
"api/flink_cluster.md": { - "flink_api_handle": ["hopsworks.project.Project.get_flink_cluster_api"], - "setup_cluster": [ - "hopsworks.core.flink_cluster_api.FlinkClusterApi.setup_cluster" - ], - "get_cluster": ["hopsworks.core.flink_cluster_api.FlinkClusterApi.get_cluster"], - "start_cluster": ["hopsworks.flink_cluster.FlinkCluster.start"], - "submit_job_to_cluster": ["hopsworks.flink_cluster.FlinkCluster.submit_job"], - "flink_cluster_properties": keras_autodoc.get_properties( - "hopsworks.flink_cluster.FlinkCluster" - ), - "flink_cluster_methods": keras_autodoc.get_methods( - "hopsworks.flink_cluster.FlinkCluster", - exclude=["from_response_json", "json"], - ), - }, - "api/environment.md": { - "env_api_handle": ["hopsworks.project.Project.get_environment_api"], - "env_create": [ - "hopsworks.core.environment_api.EnvironmentApi.create_environment" - ], - "env_get": ["hopsworks.core.environment_api.EnvironmentApi.get_environment"], - "env_methods": keras_autodoc.get_methods( - "hopsworks.environment.Environment", exclude=["from_response_json", "json"] - ), - }, - "api/git_repo.md": { - "git_api_handle": ["hopsworks.project.Project.get_git_api"], - "git_repo_clone": ["hopsworks.core.git_api.GitApi.clone"], - "git_repo_get": ["hopsworks.core.git_api.GitApi.get_repo"], - "git_repo_get_all": ["hopsworks.core.git_api.GitApi.get_repos"], - "git_repo_properties": keras_autodoc.get_properties( - "hopsworks.git_repo.GitRepo" - ), - "git_repo_methods": keras_autodoc.get_methods( - "hopsworks.git_repo.GitRepo", exclude=["from_response_json", "json"] - ), - }, - "api/git_provider.md": { - "git_api_handle": ["hopsworks.project.Project.get_git_api"], - "git_provider_create": ["hopsworks.core.git_api.GitApi.set_provider"], - "git_provider_get": ["hopsworks.core.git_api.GitApi.get_provider"], - "git_provider_get_all": ["hopsworks.core.git_api.GitApi.get_providers"], - "git_provider_properties": keras_autodoc.get_properties( - "hopsworks.git_provider.GitProvider" - ), - "git_provider_methods": keras_autodoc.get_methods( - "hopsworks.git_provider.GitProvider", exclude=["from_response_json", "json"] - ), - }, - "api/git_remote.md": { - "git_api_handle": ["hopsworks.project.Project.get_git_api"], - "git_remote_create": ["hopsworks.git_repo.GitRepo.add_remote"], - "git_remote_get": ["hopsworks.git_repo.GitRepo.get_remote"], - "git_remote_get_all": ["hopsworks.git_repo.GitRepo.get_remotes"], - "git_remote_properties": keras_autodoc.get_properties( - "hopsworks.git_remote.GitRemote" - ), - "git_remote_methods": keras_autodoc.get_methods( - "hopsworks.git_remote.GitRemote", exclude=["from_response_json", "json"] - ), - }, - "api/datasets.md": { - "dataset_api_handle": ["hopsworks.project.Project.get_dataset_api"], - "dataset_methods": keras_autodoc.get_methods( - "hopsworks.core.dataset_api.DatasetApi" - ), - }, - "api/kafka_topic.md": { - "kafka_api_handle": ["hopsworks.project.Project.get_kafka_api"], - "kafka_config": ["hopsworks.core.kafka_api.KafkaApi.get_default_config"], - "kafka_topic_create": ["hopsworks.core.kafka_api.KafkaApi.create_topic"], - "kafka_topic_get": ["hopsworks.core.kafka_api.KafkaApi.get_topic"], - "kafka_topic_get_all": ["hopsworks.core.kafka_api.KafkaApi.get_topics"], - "kafka_topic_properties": keras_autodoc.get_properties( - "hopsworks.kafka_topic.KafkaTopic" - ), - "kafka_topic_methods": keras_autodoc.get_methods( - "hopsworks.kafka_topic.KafkaTopic", - exclude=["from_response_json", "json", "update_from_response_json"], - ), - }, - "api/kafka_schema.md": { - "kafka_api_handle": 
["hopsworks.project.Project.get_kafka_api"], - "kafka_schema_create": ["hopsworks.core.kafka_api.KafkaApi.create_schema"], - "kafka_schema_get": ["hopsworks.core.kafka_api.KafkaApi.get_schema"], - "kafka_schema_get_all": ["hopsworks.core.kafka_api.KafkaApi.get_schemas"], - "kafka_schema_get_subjects": ["hopsworks.core.kafka_api.KafkaApi.get_subjects"], - "kafka_schema_properties": keras_autodoc.get_properties( - "hopsworks.kafka_schema.KafkaSchema" - ), - "kafka_schema_methods": keras_autodoc.get_methods( - "hopsworks.kafka_schema.KafkaSchema", - exclude=["from_response_json", "json", "update_from_response_json"], - ), - }, - "api/secrets.md": { - "secret_api_handle": ["hopsworks.get_secrets_api"], - "secret_create": ["hopsworks.core.secret_api.SecretsApi.create_secret"], - "secret_get": ["hopsworks.core.secret_api.SecretsApi.get_secret"], - "secret_get_simplified": ["hopsworks.core.secret_api.SecretsApi.get"], - "secret_get_all": ["hopsworks.core.secret_api.SecretsApi.get_secrets"], - "secret_properties": keras_autodoc.get_properties("hopsworks.secret.Secret"), - "secret_methods": keras_autodoc.get_methods( - "hopsworks.secret.Secret", exclude=["from_response_json", "json"] - ), - }, - "api/opensearch.md": { - "opensearch_api_handle": ["hopsworks.project.Project.get_opensearch_api"], - "opensearch_methods": keras_autodoc.get_methods( - "hopsworks.core.opensearch_api.OpenSearchApi" - ), - }, -} - -hw_dir = pathlib.Path(__file__).resolve().parents[0] -if "GITHUB_SHA" in os.environ: - commit_sha = os.environ["GITHUB_SHA"] - project_url = ( - f"https://github.com/logicalclocks/feature-store-api/tree/{commit_sha}/python" - ) -else: - branch_name = os.environ.get("GITHUB_BASE_REF", "master") - project_url = ( - f"https://github.com/logicalclocks/feature-store-api/blob/{branch_name}/python" - ) - - -def generate(dest_dir): - doc_generator = keras_autodoc.DocumentationGenerator( - PAGES, - project_url=project_url, - template_dir="./docs/templates", - titles_size="###", - extra_aliases={}, - max_signature_line_length=100, - ) - shutil.copyfile(hw_dir / "CONTRIBUTING.md", dest_dir / "CONTRIBUTING.md") - shutil.copyfile(hw_dir / "README.md", dest_dir / "index.md") - - doc_generator.generate(dest_dir / "generated") - - -if __name__ == "__main__": - generate(hw_dir / "docs") diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md deleted file mode 100644 index b97326e6f..000000000 --- a/docs/CONTRIBUTING.md +++ /dev/null @@ -1,215 +0,0 @@ -## Python development setup ---- - -- Fork and clone the repository - -- Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda - -- Install repository in editable mode with development dependencies: - - ```bash - cd python - pip install -e ".[dev]" - ``` - -- Install [pre-commit](https://pre-commit.com/) and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. The Feature Store uses pre-commit to ensure code-style and code formatting through [ruff](https://docs.astral.sh/ruff/). Run the following commands from the `python` directory: - - ```bash - cd python - pip install --user pre-commit - pre-commit install - ``` - - Afterwards, pre-commit will run whenever you commit. 
- -- To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use `ruff`, or run it via the command line: - - ```bash - # linting - ruff check python --fix - # formatting - ruff format python - ``` - -### Python documentation - -We follow a few best practices for writing the Python documentation: - -1. Use the google docstring style: - - ```python - """[One Line Summary] - - [Extended Summary] - - [!!! example - import xyz - ] - - # Arguments - arg1: Type[, optional]. Description[, defaults to `default`] - arg2: Type[, optional]. Description[, defaults to `default`] - - # Returns - Type. Description. - - # Raises - Exception. Description. - """ - ``` - - If Python 3 type annotations are used, they are inserted automatically. - - -2. Hopsworks entity engine methods (e.g. ExecutionEngine etc.) only require a single line docstring. -3. Private REST Api implementations (e.g. GitRemoteApi etc.) should be fully documented with docstrings without defaults. -4. Public Api such as metadata objects and public REST Api implementations should be fully documented with defaults. - -#### Setup and Build Documentation - -We use `mkdocs` together with `mike` ([for versioning](https://github.com/jimporter/mike/)) to build the documentation and a plugin called `keras-autodoc` to auto generate Python API documentation from docstrings. - -**Background about `mike`:** - `mike` builds the documentation and commits it as a new directory to the gh-pages branch. Each directory corresponds to one version of the documentation. Additionally, `mike` maintains a json in the root of gh-pages with the mappings of versions/aliases for each of the directories available. With aliases you can define extra names like `dev` or `latest`, to indicate stable and unstable releases. - -1. Currently we are using our own version of `keras-autodoc` - - ```bash - pip install git+https://github.com/logicalclocks/keras-autodoc - ``` - -2. Install HOPSWORKS with `docs` extras: - - ```bash - pip install -e .[dev,docs] - ``` - -3. To build the docs, first run the auto doc script: - - ```bash - cd .. - python auto_doc.py - ``` - -##### Option 1: Build only current version of docs - -4. Either build the docs, or serve them dynamically: - - Note: Links and pictures might not resolve properly later on when checking with this build. - The reason for that is that the docs are deployed with versioning on docs.hopsworks.ai and - therefore another level is added to all paths, e.g. `docs.hopsworks.ai/[version-or-alias]`. - Using relative links should not be affected by this, however, building the docs with version - (Option 2) is recommended. - - ```bash - mkdocs build - # or - mkdocs serve - ``` - -##### Option 2 (Preferred): Build multi-version doc with `mike` - -###### Versioning on docs.hopsworks.ai - -On docs.hopsworks.ai we implement the following versioning scheme: - -- current master branches (e.g. of hopsworks corresponding to master of Hopsworks): rendered as current Hopsworks snapshot version, e.g. **3.1.0-SNAPSHOT [dev]**, where `dev` is an alias to indicate that this is an unstable version. -- the latest release: rendered with full current version, e.g. **3.0.1 [latest]** with `latest` alias to indicate that this is the latest stable release. -- previous stable releases: rendered without alias, e.g. **3.0.0**. - -###### Build Instructions - -4. 
For this you can either checkout and make a local copy of the `upstream/gh-pages` branch, where -`mike` maintains the current state of docs.hopsworks.ai, or just build documentation for the branch you are updating: - - Building *one* branch: - - Checkout your dev branch with modified docs: - ```bash - git checkout [dev-branch] - ``` - - Generate API docs if necessary: - ```bash - python auto_doc.py - ``` - - Build docs with a version and alias - ```bash - mike deploy [version] [alias] --update-alias - - # for example, if you are updating documentation to be merged to master, - # which will become the new SNAPSHOT version: - mike deploy 3.1.0-SNAPSHOT dev --update-alias - - # if you are updating docs of the latest stable release branch - mike deploy [version] latest --update-alias - - # if you are updating docs of a previous stable release branch - mike deploy [version] - ``` - - If no gh-pages branch existed in your local repository, this will have created it. - - **Important**: If no previous docs were built, you will have to choose a version as default to be loaded as index, as follows - - ```bash - mike set-default [version-or-alias] - ``` - - You can now checkout the gh-pages branch and serve: - ```bash - git checkout gh-pages - mike serve - ``` - - You can also list all available versions/aliases: - ```bash - mike list - ``` - - Delete and reset your local gh-pages branch: - ```bash - mike delete --all - - # or delete single version - mike delete [version-or-alias] - ``` - -#### Adding new API documentation - -To add new documentation for APIs, you need to add information about the method/class to document to the `auto_doc.py` script: - -```python -PAGES = { - "connection.md": [ - "hopsworks.connection.Connection.connection" - ] - "new_template.md": [ - "module", - "xyz.asd" - ] -} -``` - -Now you can add a template markdown file to the `docs/templates` directory with the name you specified in the auto-doc script. The `new_template.md` file should contain a tag to identify the place at which the API documentation should be inserted: - -``` -## The XYZ package - -{{module}} - -Some extra content here. - -!!! example - ```python - import xyz - ``` - -{{xyz.asd}} -``` - -Finally, run the `auto_doc.py` script, as decribed above, to update the documentation. - -For information about Markdown syntax and possible Admonitions/Highlighting etc. see -the [Material for Mkdocs themes reference documentation](https://squidfunk.github.io/mkdocs-material/reference/abbreviations/). 
diff --git a/docs/assets/images/hopsworks-logo.png b/docs/assets/images/hopsworks-logo.png new file mode 100644 index 000000000..36f20bb12 Binary files /dev/null and b/docs/assets/images/hopsworks-logo.png differ diff --git a/docs/templates/api/connection_api.md b/docs/templates/api/connection_api.md new file mode 100644 index 000000000..19e13f3eb --- /dev/null +++ b/docs/templates/api/connection_api.md @@ -0,0 +1,11 @@ +# Connection + +{{connection}} + +## Properties + +{{connection_properties}} + +## Methods + +{{connection_methods}} diff --git a/docs/templates/api/embedding_feature_api.md b/docs/templates/api/embedding_feature_api.md new file mode 100644 index 000000000..c054672d0 --- /dev/null +++ b/docs/templates/api/embedding_feature_api.md @@ -0,0 +1,7 @@ +# EmbeddingFeature + +{{embedding_feature}} + +## Properties + +{{embedding_feature_properties}} diff --git a/docs/templates/api/embedding_index_api.md b/docs/templates/api/embedding_index_api.md new file mode 100644 index 000000000..d336e0ddb --- /dev/null +++ b/docs/templates/api/embedding_index_api.md @@ -0,0 +1,12 @@ +# EmbeddingIndex + +{{embedding_index}} + +## Properties + +{{embedding_index_properties}} + +## Methods + +{{embedding_index_methods}} + diff --git a/docs/templates/api/expectation_api.md b/docs/templates/api/expectation_api.md new file mode 100644 index 000000000..7ba4110c1 --- /dev/null +++ b/docs/templates/api/expectation_api.md @@ -0,0 +1,20 @@ +# Expectation + +{{expectation}} + +## Properties + +{{expectation_properties}} + +## Methods + +{{expectation_methods}} + +## Creation +{{expectation_create}} + +## Retrieval + +{{expectation_getall}} + +{{expectation_get}} diff --git a/docs/templates/api/expectation_suite_api.md b/docs/templates/api/expectation_suite_api.md new file mode 100644 index 000000000..a07ac5f8a --- /dev/null +++ b/docs/templates/api/expectation_suite_api.md @@ -0,0 +1,41 @@ +# Expectation Suite + +{{expectation_suite}} + +## Creation with Great Expectations + +```python3 +import great_expectations as ge + +expectation_suite = ge.core.ExpectationSuite( + "new_expectation_suite", + expectations=[ + ge.core.ExpectationConfiguration( + expectation_type="expect_column_max_to_be_between", + kwargs={ + "column": "feature", + "min_value": -1, + "max_value": 1 + } + ) + ] +) +``` + +## Attach to Feature Group + +{{expectation_suite_attach}} + +## Single Expectation API + +An API to edit the expectation list based on Great Expectations API. 
+ +{{single_expectation_api}} + +## Properties + +{{expectation_suite_properties}} + +## Methods + +{{expectation_suite_methods}} diff --git a/docs/templates/api/external_feature_group_api.md b/docs/templates/api/external_feature_group_api.md new file mode 100644 index 000000000..a982a39e8 --- /dev/null +++ b/docs/templates/api/external_feature_group_api.md @@ -0,0 +1,19 @@ +# ExternalFeatureGroup + +{{fg}} + +## Creation + +{{fg_create}} + +## Retrieval + +{{fg_get}} + +## Properties + +{{fg_properties}} + +## Methods + +{{fg_methods}} diff --git a/docs/templates/api/feature_api.md b/docs/templates/api/feature_api.md new file mode 100644 index 000000000..8dca5ef54 --- /dev/null +++ b/docs/templates/api/feature_api.md @@ -0,0 +1,11 @@ +# Feature + +{{feature}} + +## Properties + +{{feature_properties}} + +## Methods + +{{feature_methods}} diff --git a/docs/templates/api/feature_descriptive_statistics_api.md b/docs/templates/api/feature_descriptive_statistics_api.md new file mode 100644 index 000000000..3be8cccd3 --- /dev/null +++ b/docs/templates/api/feature_descriptive_statistics_api.md @@ -0,0 +1,7 @@ +# Feature Descriptive Statistics + +{{feature_descriptive_statistics}} + +## Properties + +{{feature_descriptive_statistics_properties}} diff --git a/docs/templates/api/feature_group_api.md b/docs/templates/api/feature_group_api.md new file mode 100644 index 000000000..372865c4b --- /dev/null +++ b/docs/templates/api/feature_group_api.md @@ -0,0 +1,19 @@ +# FeatureGroup + +{{fg}} + +## Creation + +{{fg_create}} + +## Retrieval + +{{fg_get}} + +## Properties + +{{fg_properties}} + +## Methods + +{{fg_methods}} diff --git a/docs/templates/api/feature_monitoring_config_api.md b/docs/templates/api/feature_monitoring_config_api.md new file mode 100644 index 000000000..7ca9b46ff --- /dev/null +++ b/docs/templates/api/feature_monitoring_config_api.md @@ -0,0 +1,27 @@ +# Feature Monitoring Configuration + +{{feature_monitoring_config}} + +## Creation from Feature Group + +{{feature_monitoring_config_creation_fg}} + +## Creation from Feature View + +{{feature_monitoring_config_creation_fv}} + +## Retrieval from Feature Group + +{{feature_monitoring_config_retrieval_fg}} + +## Retrieval from Feature View + +{{feature_monitoring_config_retrieval_fv}} + +## Properties + +{{feature_monitoring_config_properties}} + +## Methods + +{{feature_monitoring_config_methods}} diff --git a/docs/templates/api/feature_monitoring_result_api.md b/docs/templates/api/feature_monitoring_result_api.md new file mode 100644 index 000000000..5bfca1165 --- /dev/null +++ b/docs/templates/api/feature_monitoring_result_api.md @@ -0,0 +1,11 @@ +# Feature Monitoring Result + +{{feature_monitoring_result}} + +## Retrieval + +{{feature_monitoring_result_retrieval}} + +## Properties + +{{feature_monitoring_result_properties}} diff --git a/docs/templates/api/feature_monitoring_window_config_api.md b/docs/templates/api/feature_monitoring_window_config_api.md new file mode 100644 index 000000000..53ef23ea2 --- /dev/null +++ b/docs/templates/api/feature_monitoring_window_config_api.md @@ -0,0 +1,7 @@ +# Feature Monitoring Window Configuration + +{{feature_monitoring_window_config}} + +## Properties + +{{feature_monitoring_window_config_properties}} diff --git a/docs/templates/api/feature_store_api.md b/docs/templates/api/feature_store_api.md new file mode 100644 index 000000000..f859336f6 --- /dev/null +++ b/docs/templates/api/feature_store_api.md @@ -0,0 +1,15 @@ +# Feature Store + +{{fs}} + +## Retrieval + +{{fs_get}} + +## Properties 
+ +{{fs_properties}} + +## Methods + +{{fs_methods}} diff --git a/docs/templates/api/feature_view_api.md b/docs/templates/api/feature_view_api.md new file mode 100644 index 000000000..c0f7df954 --- /dev/null +++ b/docs/templates/api/feature_view_api.md @@ -0,0 +1,21 @@ +# Feature View + +{{fv}} + +## Creation + +{{fv_create}} + +## Retrieval + +{{fv_get}} + +{{fvs_get}} + +## Properties + +{{fv_properties}} + +## Methods + +{{fv_methods}} diff --git a/docs/templates/api/job.md b/docs/templates/api/job.md new file mode 100644 index 000000000..9ad68d976 --- /dev/null +++ b/docs/templates/api/job.md @@ -0,0 +1,11 @@ +# Job + +{{job}} + +## Methods + +{{job_methods}} + +## Job Configuration + +{{job_configuration}} diff --git a/docs/templates/api/links.md b/docs/templates/api/links.md new file mode 100644 index 000000000..62cdc7001 --- /dev/null +++ b/docs/templates/api/links.md @@ -0,0 +1,14 @@ +# Provenance Links + +Provenance Links are objects returned by methods such as [get_feature_groups_provenance](../storage_connector_api/#get_feature_groups_provenance), [get_storage_connector_provenance](../feature_group_api/#get_storage_connector_provenance), [get_parent_feature_group](../feature_group_api/#get_parent_feature_groups), [get_generated_feature_groups](../feature_group_api/#get_generated_feature_groups), [get_generated_feature_views](../feature_group_api/#get_generated_feature_views), and [get_models_provenance](../feature_view_api/#get_models_provenance). They represent sections of the provenance graph, depending on the method invoked. + +## Properties + +{{links_properties}} + +# Artifact + +Artifact objects are part of the provenance graph and contain a minimal set of information regarding the entities (feature groups, feature views) they represent. +The provenance graph contains Artifact objects when the underlying entities have been deleted, are corrupted, or are not accessible to the user. + +{{artifact_properties}} diff --git a/docs/templates/api/query_api.md b/docs/templates/api/query_api.md new file mode 100644 index 000000000..7cc664d96 --- /dev/null +++ b/docs/templates/api/query_api.md @@ -0,0 +1,13 @@ +# Query + +Query objects are strictly generated by HSFS APIs called on [Feature Group objects](feature_group_api.md). +Users never construct a Query object directly using the class constructor. +For this reason we do not provide the full documentation of the class here. 
+ +## Methods + +{{query_methods}} + +## Properties + +{{query_properties}} diff --git a/docs/templates/api/rule_api.md b/docs/templates/api/rule_api.md new file mode 100644 index 000000000..0801e3954 --- /dev/null +++ b/docs/templates/api/rule_api.md @@ -0,0 +1,7 @@ +# Rule + +{{rule}} + +## Properties + +{{rule_properties}} diff --git a/docs/templates/api/rule_definition_api.md b/docs/templates/api/rule_definition_api.md new file mode 100644 index 000000000..326b66db0 --- /dev/null +++ b/docs/templates/api/rule_definition_api.md @@ -0,0 +1,13 @@ +# Rule Definition + +{{ruledefinition}} + +## Properties + +{{ruledefinition_properties}} + +## Retrieval + +{{ruledefinition_getall}} + +{{ruledefinition_get}} diff --git a/docs/templates/api/similarity_function_type_api.md b/docs/templates/api/similarity_function_type_api.md new file mode 100644 index 000000000..bdfbc51c2 --- /dev/null +++ b/docs/templates/api/similarity_function_type_api.md @@ -0,0 +1,3 @@ +# SimilarityFunctionType + +{{similarity_function_type}} diff --git a/docs/templates/api/spine_group_api.md b/docs/templates/api/spine_group_api.md new file mode 100644 index 000000000..a2bdf119c --- /dev/null +++ b/docs/templates/api/spine_group_api.md @@ -0,0 +1,19 @@ +# SpineGroup + +{{fg}} + +## Creation + +{{fg_create}} + +## Retrieval + +{{fg_get}} + +## Properties + +{{fg_properties}} + +## Methods + +{{fg_methods}} diff --git a/docs/templates/api/split_statistics_api.md b/docs/templates/api/split_statistics_api.md new file mode 100644 index 000000000..09053ac5d --- /dev/null +++ b/docs/templates/api/split_statistics_api.md @@ -0,0 +1,7 @@ +# Split Statistics + +{{split_statistics}} + +## Properties + +{{split_statistics_properties}} diff --git a/docs/templates/api/statistics_api.md b/docs/templates/api/statistics_api.md new file mode 100644 index 000000000..27ed90c9d --- /dev/null +++ b/docs/templates/api/statistics_api.md @@ -0,0 +1,7 @@ +# Statistics + +{{statistics}} + +## Properties + +{{statistics_properties}} diff --git a/docs/templates/api/statistics_config_api.md b/docs/templates/api/statistics_config_api.md new file mode 100644 index 000000000..a907d1d32 --- /dev/null +++ b/docs/templates/api/statistics_config_api.md @@ -0,0 +1,7 @@ +# StatisticsConfig + +{{statistics_config}} + +## Properties + +{{statistics_config_properties}} diff --git a/docs/templates/api/storage_connector_api.md b/docs/templates/api/storage_connector_api.md new file mode 100644 index 000000000..1b390e72a --- /dev/null +++ b/docs/templates/api/storage_connector_api.md @@ -0,0 +1,119 @@ +# Storage Connector + +## Retrieval + +{{sc_get}} + +## HopsFS + +### Properties + +{{hopsfs_properties}} + +### Methods + +{{hopsfs_methods}} + +## JDBC + +### Properties + +{{jdbc_properties}} + +### Methods + +{{jdbc_methods}} + +## S3 + +### Properties + +{{s3_properties}} + +### Methods + +{{s3_methods}} + +## Redshift + +### Properties + +{{redshift_properties}} + +### Methods + +{{redshift_methods}} + +## Azure Data Lake Storage + +### Properties + +{{adls_properties}} + +### Methods + +{{adls_methods}} + +## Snowflake + +### Properties + +{{snowflake_properties}} + +### Methods + +{{snowflake_methods}} + +## Google Cloud Storage +This storage connector provides integration to Google Cloud Storage (GCS). +Once you create a connector in FeatureStore, you can transact data from a GCS bucket into a spark dataframe +by calling the `read` API. + +Authentication to GCP is handled by uploading the `JSON keyfile for service account` to the Hopsworks Project. 
For more information +on service accounts and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account +'creating service account keyfile') + +The connector also supports the optional encryption method `Customer Supplied Encryption Key` by Google. +The encryption details are stored as `Secrets` in the FeatureStore for keeping it secure. +Read more about encryption on [Google Documentation.](https://cloud.google.com/storage/docs/encryption#customer-supplied_encryption_keys) + +The storage connector uses the Google `gcs-connector-hadoop` behind the scenes. For more information, check out [Google Cloud Storage Connector for Spark and Hadoop]( +https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop') + +### Properties + +{{gcs_properties}} + +### Methods + +{{gcs_methods}} + +## BigQuery +The BigQuery storage connector provides integration to Google Cloud BigQuery. +You can use it to run bigquery on your GCP cluster and load results into spark dataframe by calling the `read` API. + +Authentication to GCP is handled by uploading the `JSON keyfile for service account` to the Hopsworks Project. For more information +on service accounts and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account +'creating service account keyfile') + +The storage connector uses the Google `spark-bigquery-connector` behind the scenes. +To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage +'github.com/GoogleCloudDataproc/spark-bigquery-connector') + +### Properties + +{{bigquery_properties}} + +### Methods + +{{bigquery_methods}} + +## Kafka + +### Properties + +{{kafka_properties}} + +### Methods + +{{kafka_methods}} diff --git a/docs/templates/api/training_dataset_api.md b/docs/templates/api/training_dataset_api.md new file mode 100644 index 000000000..a53696465 --- /dev/null +++ b/docs/templates/api/training_dataset_api.md @@ -0,0 +1,19 @@ +# Training Dataset + +{{td}} + +## Creation + +{{td_create}} + +## Retrieval + +{{td_get}} + +## Properties + +{{td_properties}} + +## Methods + +{{td_methods}} diff --git a/docs/templates/api/transformation_functions_api.md b/docs/templates/api/transformation_functions_api.md new file mode 100644 index 000000000..249262a45 --- /dev/null +++ b/docs/templates/api/transformation_functions_api.md @@ -0,0 +1,20 @@ +# Transformation Function + +{{transformation_function}} + +## Properties + +{{transformation_function_properties}} + +## Methods + +{{transformation_function_methods}} + +## Creation +{{create_transformation_function}} + +## Retrieval + +{{get_transformation_function}} + +{{get_transformation_functions}} diff --git a/docs/templates/api/validation_api.md b/docs/templates/api/validation_api.md new file mode 100644 index 000000000..8e1512f34 --- /dev/null +++ b/docs/templates/api/validation_api.md @@ -0,0 +1,18 @@ +# Validation + +{{validation_result}} + +## Properties + +{{validation_result_properties}} + +## Methods + +{{expectation_methods}} + +## Validate a dataframe +{{validate}} + +## Retrieval + +{{validation_result_get}} diff --git a/docs/templates/api/validation_report_api.md b/docs/templates/api/validation_report_api.md new file mode 
100644 index 000000000..435a87a03 --- /dev/null +++ b/docs/templates/api/validation_report_api.md @@ -0,0 +1,19 @@ +# Validation Report + +{{validation_report}} + +## Creation + +{{validation_report_validate}} + +## Retrieval + +{{validation_report_get}} + +## Properties + +{{validation_report_properties}} + +## Methods + +{{validation_report_methods}} diff --git a/docs/templates/connection_api.md b/docs/templates/connection_api.md new file mode 100644 index 000000000..19e13f3eb --- /dev/null +++ b/docs/templates/connection_api.md @@ -0,0 +1,11 @@ +# Connection + +{{connection}} + +## Properties + +{{connection_properties}} + +## Methods + +{{connection_methods}} diff --git a/docs/templates/model-registry/links.md b/docs/templates/model-registry/links.md new file mode 100644 index 000000000..07abe3177 --- /dev/null +++ b/docs/templates/model-registry/links.md @@ -0,0 +1,15 @@ +# Provenance Links + +Provenance Links are objects returned by methods such as [get_feature_view_provenance](../model_api/#get_feature_view_provenance) and [get_training_dataset_provenance](../model_api/#get_training_dataset_provenance). These methods use the provenance graph to return the parent feature view or training dataset of a model. They return the actual feature view or training dataset instance if it is available. If the instance was deleted, or if it belongs to a feature store that the current project no longer has access to, an Artifact object is returned instead. + +There is an additional method that uses the provenance graph: [get_feature_view](../model_api/#get_feature_view). It wraps `get_feature_view_provenance` and always returns a correct, usable Feature View object, or throws an exception if the returned object is an Artifact. In other words, an exception is thrown if the feature view was deleted or the feature store it belongs to was unshared. A short usage sketch follows the Artifact description below. +## Properties + +{{links_properties}} + +# Artifact + +Artifact objects are part of the provenance graph and contain a minimal set of information about the entities (feature views, training datasets) they represent. +The provenance graph contains Artifact objects when the underlying entities have been deleted, are corrupted, or are no longer accessible by the current project.
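+
+As a minimal, hypothetical sketch of the behaviour described above (the model handle `model` and the `accessible` property are assumptions, not defined in this PR):
+
+``` python
+# the provenance link object separates usable instances from Artifact stubs
+links = model.get_feature_view_provenance()
+for fv in links.accessible:
+    print(fv.name, fv.version)
+
+# get_feature_view() wraps the call above and raises instead of returning an Artifact
+feature_view = model.get_feature_view()
+```
+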
+ +{{artifact_properties}} diff --git a/docs/templates/model-registry/model_api.md b/docs/templates/model-registry/model_api.md new file mode 100644 index 000000000..edb2e5ade --- /dev/null +++ b/docs/templates/model-registry/model_api.md @@ -0,0 +1,29 @@ +# Model + +## Creation of a TensorFlow model + +{{ml_create_tf}} + +## Creation of a Torch model + +{{ml_create_th}} + +## Creation of a scikit-learn model + +{{ml_create_sl}} + +## Creation of a generic model + +{{ml_create_py}} + +## Retrieval + +{{ml_get}} + +## Properties + +{{ml_properties}} + +## Methods + +{{ml_methods}} diff --git a/docs/templates/model-registry/model_registry_api.md b/docs/templates/model-registry/model_registry_api.md new file mode 100644 index 000000000..d577e91e3 --- /dev/null +++ b/docs/templates/model-registry/model_registry_api.md @@ -0,0 +1,17 @@ +# Model Registry + +## Retrieval + +{{mr_get}} + +## Modules + +{{mr_modules}} + +## Properties + +{{mr_properties}} + +## Methods + +{{mr_methods}} diff --git a/docs/templates/model-registry/model_schema_api.md b/docs/templates/model-registry/model_schema_api.md new file mode 100644 index 000000000..28170a419 --- /dev/null +++ b/docs/templates/model-registry/model_schema_api.md @@ -0,0 +1,36 @@ +# Model Schema + +## Creation + +To create a ModelSchema, the schema of the Model inputs and/or Model outputs has to be defined beforehand. + +{{schema}} + +After defining the Model input and/or output schemas, a ModelSchema can be created using its class constructor. + +{{model_schema}}
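+
+A minimal sketch of the two steps above, assuming training data is available as `X_train` and `y_train` (the import paths follow the library layout assumed here and may differ):
+
+``` python
+from hsml.schema import Schema
+from hsml.model_schema import ModelSchema
+
+# define the input and output schemas from sample data
+input_schema = Schema(X_train)
+output_schema = Schema(y_train)
+
+# combine them into a model schema using the class constructor
+model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)
+```
+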
+ +## Retrieval + +### Model Schema + +Model schemas can be accessed from the model metadata objects. + +``` python +model.model_schema +``` + +### Model Input & Output Schemas + +The schemas of the Model inputs and outputs can be accessed from the ModelSchema metadata objects. + +``` python +model_schema.input_schema +model_schema.output_schema +``` + +## Methods + +{{schema_dict}} + +{{model_schema_dict}} diff --git a/docs/templates/model-serving/deployment_api.md b/docs/templates/model-serving/deployment_api.md new file mode 100644 index 000000000..aebccca55 --- /dev/null +++ b/docs/templates/model-serving/deployment_api.md @@ -0,0 +1,25 @@ +# Deployment + +## Handle + +{{ms_get_model_serving}} + +## Creation + +{{ms_create_deployment}} + +{{m_deploy}} + +{{p_deploy}} + +## Retrieval + +{{ms_get_deployments}} + +## Properties + +{{dep_properties}} + +## Methods + +{{dep_methods}} diff --git a/docs/templates/model-serving/inference_batcher_api.md b/docs/templates/model-serving/inference_batcher_api.md new file mode 100644 index 000000000..3a2609962 --- /dev/null +++ b/docs/templates/model-serving/inference_batcher_api.md @@ -0,0 +1,25 @@ +# Inference batcher + +## Creation + +{{ib}} + +## Retrieval + +### predictor.inference_batcher + +Inference batchers can be accessed from the predictor metadata objects. + +``` python +predictor.inference_batcher +``` + +Predictors can be found in the deployment metadata objects (see [Predictor Reference](../predictor_api/#retrieval)). To retrieve a deployment, see the [Deployment Reference](../deployment_api/#retrieval). + +## Properties + +{{ib_properties}} + +## Methods + +{{ib_methods}} diff --git a/docs/templates/model-serving/inference_logger_api.md b/docs/templates/model-serving/inference_logger_api.md new file mode 100644 index 000000000..2cf68d652 --- /dev/null +++ b/docs/templates/model-serving/inference_logger_api.md @@ -0,0 +1,25 @@ +# Inference logger + +## Creation + +{{il}} + +## Retrieval + +### predictor.inference_logger + +Inference loggers can be accessed from the predictor metadata objects. + +``` python +predictor.inference_logger +``` + +Predictors can be found in the deployment metadata objects (see [Predictor Reference](../predictor_api/#retrieval)). To retrieve a deployment, see the [Deployment Reference](../deployment_api/#retrieval). + +## Properties + +{{il_properties}} + +## Methods + +{{il_methods}} diff --git a/docs/templates/model-serving/model_serving_api.md b/docs/templates/model-serving/model_serving_api.md new file mode 100644 index 000000000..0eb557213 --- /dev/null +++ b/docs/templates/model-serving/model_serving_api.md @@ -0,0 +1,13 @@ +# Model Serving + +## Retrieval + +{{ms_get}} + +## Properties + +{{ms_properties}} + +## Methods + +{{ms_methods}} diff --git a/docs/templates/model-serving/predictor_api.md b/docs/templates/model-serving/predictor_api.md new file mode 100644 index 000000000..3dd9df195 --- /dev/null +++ b/docs/templates/model-serving/predictor_api.md @@ -0,0 +1,29 @@ +# Predictor + +## Handle + +{{ms_get_model_serving}} + +## Creation + +{{ms_create_predictor}} + +## Retrieval + +### deployment.predictor + +Predictors can be accessed from the deployment metadata objects. + +``` python +deployment.predictor +``` + +To retrieve a deployment, see the [Deployment Reference](../deployment_api/#retrieval). + +## Properties + +{{pred_properties}} + +## Methods + +{{pred_methods}} diff --git a/docs/templates/model-serving/predictor_state_api.md b/docs/templates/model-serving/predictor_state_api.md new file mode 100644 index 000000000..2640b9b48 --- /dev/null +++ b/docs/templates/model-serving/predictor_state_api.md @@ -0,0 +1,18 @@ +# Deployment state + +The state of a deployment corresponds to the state of the predictor configured in it. + +!!! note +    Currently, only one predictor is supported per deployment. Support for multiple predictors (inference graphs) is coming soon. + +## Retrieval + +{{ps_get}} + +## Properties + +{{ps_properties}} + +## Methods + +{{ps_methods}} diff --git a/docs/templates/model-serving/predictor_state_condition_api.md b/docs/templates/model-serving/predictor_state_condition_api.md new file mode 100644 index 000000000..e1566d2b1 --- /dev/null +++ b/docs/templates/model-serving/predictor_state_condition_api.md @@ -0,0 +1,15 @@ +# Deployment state condition + +The state condition of a deployment is a more detailed representation of the deployment state. + +## Retrieval + +{{psc_get}} + +## Properties + +{{psc_properties}} + +## Methods + +{{psc_methods}} diff --git a/docs/templates/model-serving/resources_api.md b/docs/templates/model-serving/resources_api.md new file mode 100644 index 000000000..addc7f51e --- /dev/null +++ b/docs/templates/model-serving/resources_api.md @@ -0,0 +1,35 @@ +# Resources + +## Creation + +{{res}} + +## Retrieval + +### predictor.resources + +Resources allocated for a predictor can be accessed from the predictor metadata object. + +``` python +predictor.resources +``` + +Predictors can be found in the deployment metadata objects (see [Predictor Reference](../predictor_api/#retrieval)).
To retrieve a deployment, see the [Deployment Reference](../deployment_api/#retrieval). + +### transformer.resources + +Resources allocated for a transformer can be accessed from the transformer metadata object. + +``` python +transformer.resources +``` + +Transformer can be found in the predictor metadata objects (see [Predictor Reference](../predictor_api/#retrieval)). + +## Properties + +{{res_properties}} + +## Methods + +{{res_methods}} diff --git a/docs/templates/model-serving/transformer_api.md b/docs/templates/model-serving/transformer_api.md new file mode 100644 index 000000000..ae81e84ef --- /dev/null +++ b/docs/templates/model-serving/transformer_api.md @@ -0,0 +1,29 @@ +# Transformer + +## Handle + +{{ms_get_model_serving}} + +## Creation + +{{ms_create_transformer}} + +## Retrieval + +### predictor.transformer + +Transformers can be accessed from the predictor metadata objects. + +``` python +predictor.transformer +``` + +Predictors can be found in the deployment metadata objects (see [Predictor Reference](../predictor_api/#retrieval)). To retrieve a deployment, see the [Deployment Reference](../deployment_api/#retrieval). + +## Properties + +{{trans_properties}} + +## Methods + +{{trans_methods}} diff --git a/java/beam/pom.xml b/java/beam/pom.xml new file mode 100644 index 000000000..3b3f902ca --- /dev/null +++ b/java/beam/pom.xml @@ -0,0 +1,55 @@ + + + + hsfs-parent + com.logicalclocks + 4.0.0-SNAPSHOT + + 4.0.0 + + hsfs-beam + + + 2.48.0 + 3.4.0 + + + + + com.logicalclocks + hsfs + ${project.version} + compile + + + javax.xml.bind + jaxb-api + + + + + + + org.apache.beam + beam-sdks-java-core + ${beam.version} + + + + + org.apache.beam + beam-sdks-java-io-kafka + ${beam.version} + + + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + + diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java new file mode 100644 index 000000000..fd93052a3 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.FeatureStoreBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.TimeTravelFormat; +import com.logicalclocks.hsfs.TrainingDatasetBase; +import com.logicalclocks.hsfs.beam.constructor.Query; +import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.beam.engine.FeatureViewEngine; +import com.logicalclocks.hsfs.metadata.StorageConnectorApi; +import lombok.NonNull; + +import java.io.IOException; +import java.util.List; + +public class FeatureStore extends FeatureStoreBase { + + private FeatureGroupEngine featureGroupEngine; + private FeatureViewEngine featureViewEngine; + + public FeatureStore() { + storageConnectorApi = new StorageConnectorApi(); + featureGroupEngine = new FeatureGroupEngine(); + } + + @Override + public Object createFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, + boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, String description, List primaryKeys, + List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, String topicName, String notificationTopicName, String eventTime) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + /** + * Get a stream feature group object from the feature store. + * + *

Getting a stream feature group metadata handle enables you to interact with the feature group, + * such as reading the data or using the `Query`-API to perform joins between feature groups and create feature + * views. + * + *

+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices", 1);
+   * }
+   * 
+ * + * @param name the name of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public Object getStreamFeatureGroup(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature group `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getStreamFeatureGroup(name, DEFAULT_VERSION); + } + + /** + * Get a stream feature group object from the feature store. + * + *

Getting a stream feature group metadata handle enables you to interact with the feature group, + * such as reading the data or using the `Query`-API to perform joins between feature groups and create feature + * views. + * + *

+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices", 1);
+   * }
+   * 
+ * + * @param name the name of the feature group + * @param version the version of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public StreamFeatureGroup getStreamFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureGroupEngine.getStreamFeatureGroup(this, name, version); + } + + @Override + public Object createStreamFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, + List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, + StatisticsConfig statisticsConfig, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object createExternalFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object createFeatureView() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name", 1);
+   * }
+   * 
+ * + * @param name Name of the feature view. + * @param version Version to get. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureViewEngine.get(this, name, version); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name", 1);
+   * }
+   * 
+ * + * @param name Name of the feature view. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature view `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getFeatureView(name, DEFAULT_VERSION); + } + + @Override + public Object getOrCreateFeatureView(String name, Query query, Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureView(String name, Query query, Integer version, String description, + List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getExternalFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getExternalFeatureGroup(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StorageConnector getStorageConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getHopsFsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getExternalFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object sql(String name) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getJdbcConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getS3Connector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getRedshiftConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getSnowflakeConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getAdlsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for beam"); + } + + @Override + public Object getKafkaConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getBigqueryConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOnlineStorageConnector() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object 
getGcsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainingDatasets(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureView.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureView.java new file mode 100644 index 000000000..48c54f127 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureView.java @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.FeatureViewBase; +import com.logicalclocks.hsfs.beam.constructor.Query; +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.text.ParseException; +import java.util.Map; + +public class FeatureView extends FeatureViewBase> { + @Override + public void addTag(String name, Object value) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map getTags() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTag(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteTag(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void addTrainingDatasetTag(Integer version, String name, Object value) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map getTrainingDatasetTags(Integer version) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainingDatasetTag(Integer version, String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteTrainingDatasetTag(Integer version, String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + 
public void delete() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void clean(FeatureStore featureStore, String featureViewName, Integer featureViewVersion) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public FeatureView update(FeatureView other) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String getBatchQuery() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String getBatchQuery(String startTime, String endTime) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData(String startTime, String endTime) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData(String startTime, String endTime, Map readOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainingData(Integer version, Map readOptions) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainTestSplit(Integer version, Map readOptions) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainValidationTestSplit(Integer version, Map readOptions) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void purgeTrainingData(Integer version) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void purgeAllTrainingData() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteTrainingDataset(Integer version) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteAllTrainingDatasets() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/HopsworksConnection.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/HopsworksConnection.java new file mode 100644 index 000000000..8b19103f5 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/HopsworksConnection.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HopsworksConnectionBase; +import com.logicalclocks.hsfs.SecretStore; +import com.logicalclocks.hsfs.metadata.Credentials; +import com.logicalclocks.hsfs.metadata.HopsworksClient; +import com.logicalclocks.hsfs.metadata.HopsworksHttpClient; + +import lombok.Builder; + +import software.amazon.awssdk.regions.Region; + +import java.io.IOException; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; + +public class HopsworksConnection extends HopsworksConnectionBase { + + @Builder + public HopsworksConnection(String host, int port, String project, Region region, SecretStore secretStore, + boolean hostnameVerification, String trustStorePath, String certPath, String apiKeyFilePath, + String apiKeyValue) throws IOException, FeatureStoreException, CertificateException, KeyStoreException, + NoSuchAlgorithmException { + this.host = host; + this.port = port; + this.project = getProjectName(project); + this.region = region; + this.secretStore = secretStore; + this.hostnameVerification = hostnameVerification; + this.trustStorePath = trustStorePath; + this.certPath = certPath; + this.apiKeyFilePath = apiKeyFilePath; + this.apiKeyValue = apiKeyValue; + + HopsworksClient.setupHopsworksClient(host, port, region, secretStore, + hostnameVerification, trustStorePath, this.apiKeyFilePath, this.apiKeyValue); + this.projectObj = getProject(); + HopsworksClient.getInstance().setProject(this.projectObj); + Credentials credentials = HopsworksClient.getInstance().getCredentials(); + HopsworksHttpClient hopsworksHttpClient = HopsworksClient.getInstance().getHopsworksHttpClient(); + hopsworksHttpClient.setTrustStorePath(credentials.gettStore()); + hopsworksHttpClient.setKeyStorePath(credentials.getkStore()); + hopsworksHttpClient.setCertKey(credentials.getPassword()); + HopsworksClient.getInstance().setHopsworksHttpClient(hopsworksHttpClient); + } + + /** + * Retrieve the project feature store. + * + * @return FeatureStore object. + * @throws IOException Generic IO exception. + * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore() throws IOException, FeatureStoreException { + return getFeatureStore(rewriteFeatureStoreName(project)); + } + + /** + * Retrieve a feature store based on name. The feature store needs to be shared with + * the connection's project. The name is the project name of the feature store. + * + * @param name the name of the feature store to get the handle for + * @return FeatureStore object. + * @throws IOException Generic IO exception. 
+ * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore(String name) throws IOException, FeatureStoreException { + return featureStoreApi.get(rewriteFeatureStoreName(name), FeatureStore.class); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java new file mode 100644 index 000000000..312890dff --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.Feature; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HudiOperationType; +import com.logicalclocks.hsfs.JobConfiguration; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.beam.engine.BeamProducer; +import com.logicalclocks.hsfs.constructor.QueryBase; +import com.logicalclocks.hsfs.metadata.Statistics; +import lombok.Builder; +import lombok.NonNull; +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class StreamFeatureGroup extends FeatureGroupBase> { + + + protected FeatureGroupEngine featureGroupEngine = new FeatureGroupEngine(); + + @Builder + public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, + List primaryKeys, List partitionKeys, String hudiPrecombineKey, + boolean onlineEnabled, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String eventTime) { + this(); + this.featureStore = featureStore; + this.name = name; + this.version = version; + this.description = description; + this.primaryKeys = primaryKeys != null + ? primaryKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.partitionKeys = partitionKeys != null + ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; + this.onlineEnabled = onlineEnabled; + this.features = features; + this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); + this.onlineTopicName = onlineTopicName; + this.eventTime = eventTime; + } + + public StreamFeatureGroup() { + this.type = "streamFeatureGroupDTO"; + } + + // used for updates + public StreamFeatureGroup(Integer id, String description, List features) { + this(); + this.id = id; + this.description = description; + this.features = features; + } + + public StreamFeatureGroup(FeatureStore featureStore, int id) { + this(); + this.featureStore = featureStore; + this.id = id; + } + + @Override + public PCollection read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(boolean online, Map readOptions) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(String wallclockTime, Map readOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase asOf(String wallclockTime) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase asOf(String wallclockTime, String excludeUntil) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(int numRows, boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData) throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Storage storage) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Storage storage, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, boolean online, Map writeOptions) + throws FeatureStoreException, IOException, 
ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, HudiOperationType hudiOperationType) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Storage storage, boolean online, + HudiOperationType hudiOperationType, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, JobConfiguration jobConfiguration) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, boolean online, Map writeOptions, + JobConfiguration jobConfiguration) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void commitDeleteRecord(PCollection featureData) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void commitDeleteRecord(PCollection featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails() throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails(Integer integer) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails(String limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails(String wallclockTime, Integer limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase select(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectAll() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectExceptFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectExcept(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + /** + * Ingest a feature data to the online feature store using Beam Pipeline object. Currently, + * only org.apache.beam.sdk.values.Row types as feature data type are supported. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *
+   *        // get feature group handle
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("taxi_ride", 1);
+   *
+   *        // create Beam pipeline
+   *        Pipeline pipeline = Pipeline.create();
+   *        pipeline
+   *         .apply("read stream from the source", PubsubIO.readStrings().fromTopic(options.getInputTopic()))
+   *         .apply("Parse JSON to Beam Rows", JsonToRow.withSchema(schema))
+   *         .apply("insert streaming feature data", fg.insertStream());
+   * }
+   * 
+ * + * @return BeamProducer object, that can be wrapped inside Beam Pipeline `apply` method. + */ + public BeamProducer insertStream() throws Exception { + return featureGroupEngine.insertStream(this, null); + } + + public BeamProducer insertStream(Map writeOptions) throws Exception { + return featureGroupEngine.insertStream(this, writeOptions); + } + + @Override + public Object insertStream(PCollection featureData) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, Map writeOptions) throws Exception { + return null; + } + + @Override + public Object insertStream(PCollection featureData, String queryName, Map writeOptions) + throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData,String queryName, String outputMode, + boolean awaitTermination, Long timeout) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation, Map writeOptions) + throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation, Map writeOptions, + JobConfiguration jobConfiguration) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void updateFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void updateFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void appendFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void appendFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for 
Beam"); + } + + @Override + public Statistics computeStatistics() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Statistics computeStatistics(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Statistics getStatistics() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/constructor/Query.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/constructor/Query.java new file mode 100644 index 000000000..e3abe7df7 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/constructor/Query.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam.constructor; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.constructor.QueryBase; + +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.util.Map; + +public class Query extends QueryBase> { + @Override + public String sql() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String sql(Storage storage) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object read(boolean online, Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(boolean online, int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamEngine.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamEngine.java new file mode 100644 index 000000000..13ff573a1 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamEngine.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.google.common.base.Strings; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.metadata.DatasetApi; +import com.logicalclocks.hsfs.engine.EngineBase; +import com.logicalclocks.hsfs.engine.FeatureGroupUtils; +import com.logicalclocks.hsfs.metadata.HopsworksInternalClient; +import org.apache.avro.Schema; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class BeamEngine extends EngineBase { + private static BeamEngine INSTANCE = null; + private FeatureGroupUtils featureGroupUtils = new FeatureGroupUtils(); + + public static synchronized BeamEngine getInstance() throws FeatureStoreException { + if (INSTANCE == null) { + INSTANCE = new BeamEngine(); + } + return INSTANCE; + } + + private BeamEngine() throws FeatureStoreException { + } + + public BeamProducer insertStream(StreamFeatureGroup streamFeatureGroup, Map writeOptions) + throws FeatureStoreException, IOException { + Map complexFeatureSchemas = new HashMap<>(); + for (String featureName: streamFeatureGroup.getComplexFeatures()) { + complexFeatureSchemas.put(featureName, + new Schema.Parser().parse(streamFeatureGroup.getFeatureAvroSchema(featureName))); + } + Schema deserializedEncodedSchema = new Schema.Parser().parse(streamFeatureGroup.getEncodedAvroSchema()); + + return new BeamProducer(streamFeatureGroup.getOnlineTopicName(), + getKafkaConfig(streamFeatureGroup, writeOptions), + streamFeatureGroup.getDeserializedAvroSchema(), deserializedEncodedSchema, complexFeatureSchemas, + streamFeatureGroup.getPrimaryKeys(), streamFeatureGroup); + } + + @Override + public String addFile(String filePath) throws IOException, FeatureStoreException { + if (Strings.isNullOrEmpty(filePath)) { + return filePath; + } + // this is used for unit testing + if (!filePath.startsWith("file://")) { + filePath = "hdfs://" + filePath; + } + String targetPath = System.getProperty("java.io.tmpdir") + filePath.substring(filePath.lastIndexOf("/")); + try (FileOutputStream outputStream = new FileOutputStream(targetPath)) { + outputStream.write(DatasetApi.readContent(filePath, featureGroupUtils.getDatasetType(filePath))); + } + return targetPath; + } + + @Override + public Map getKafkaConfig(FeatureGroupBase featureGroup, Map writeOptions) + throws FeatureStoreException, IOException { + boolean external = !(System.getProperties().containsKey(HopsworksInternalClient.REST_ENDPOINT_SYS) + || (writeOptions != null + && Boolean.parseBoolean(writeOptions.getOrDefault("internal_kafka", "false")))); + + StorageConnector.KafkaConnector storageConnector = + storageConnectorApi.getKafkaStorageConnector(featureGroup.getFeatureStore(), external); + storageConnector.setSslTruststoreLocation(addFile(storageConnector.getSslTruststoreLocation())); + storageConnector.setSslKeystoreLocation(addFile(storageConnector.getSslKeystoreLocation())); 
+ + Map config = storageConnector.kafkaOptions(); + + if (writeOptions != null) { + config.putAll(writeOptions); + } + return config; + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamKafkaProducer.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamKafkaProducer.java new file mode 100644 index 000000000..c1bbd2748 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamKafkaProducer.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import lombok.Setter; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.producer.Callback; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.clients.producer.RecordMetadata; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Future; + +public class BeamKafkaProducer extends KafkaProducer { + @Setter + private Map headerMap = new HashMap<>(); + + public BeamKafkaProducer(Map configs) { + super(configs); + } + + public Future send(ProducerRecord record) { + addHeaders(record); + return super.send(record); + } + + public Future send(ProducerRecord record, Callback callback) { + addHeaders(record); + return super.send(record, callback); + } + + private void addHeaders(ProducerRecord record) { + for (Map.Entry entry: headerMap.entrySet()) { + record.headers().add(entry.getKey(), entry.getValue()); + } + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamProducer.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamProducer.java new file mode 100644 index 000000000..e2b13e074 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamProducer.java @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import lombok.NonNull; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.EncoderFactory; + +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; +import org.apache.beam.sdk.io.kafka.KafkaIO; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.apache.beam.sdk.values.Row; + +import org.apache.kafka.common.config.SslConfigs; +import org.apache.kafka.common.serialization.StringSerializer; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class BeamProducer extends PTransform<@NonNull PCollection, @NonNull PDone> { + private String topic; + private Map properties; + private transient Schema schema; + private transient Schema encodedSchema; + private Map deserializedComplexFeatureSchemas; + private List primaryKeys; + private final Map headerMap = new HashMap<>(); + + public BeamProducer(String topic, Map properties, Schema schema, Schema encodedSchema, + Map deserializedComplexFeatureSchemas, List primaryKeys, + StreamFeatureGroup streamFeatureGroup) throws FeatureStoreException, IOException { + this.schema = schema; + this.encodedSchema = encodedSchema; + this.topic = topic; + this.properties = properties; + this.deserializedComplexFeatureSchemas = deserializedComplexFeatureSchemas; + this.primaryKeys = primaryKeys; + + headerMap.put("projectId", + String.valueOf(streamFeatureGroup.getFeatureStore().getProjectId()).getBytes(StandardCharsets.UTF_8)); + headerMap.put("featureGroupId", String.valueOf(streamFeatureGroup.getId()).getBytes(StandardCharsets.UTF_8)); + headerMap.put("subjectId", + String.valueOf(streamFeatureGroup.getSubject().getId()).getBytes(StandardCharsets.UTF_8)); + } + + @Override + public PDone expand(PCollection input) { + + PCollection featureGroupAvroRecord = input + .apply("Convert to avro generic record", ParDo.of(new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) { + GenericRecord genericRecord = AvroUtils.toGenericRecord(c.element(), schema); + c.output(genericRecord); + } + })).setCoder(AvroCoder.of(GenericRecord.class, schema)); + + if (!deserializedComplexFeatureSchemas.keySet().isEmpty()) { + featureGroupAvroRecord = featureGroupAvroRecord + .apply("Serialize complex features", ParDo.of(new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) throws IOException { + GenericRecord encodedRecord = new GenericData.Record(encodedSchema); + for (Schema.Field field: c.element().getSchema().getFields()) { + if (deserializedComplexFeatureSchemas.containsKey(field.name())) { + 
GenericDatumWriter complexFeatureDatumWriter = + new GenericDatumWriter<>(deserializedComplexFeatureSchemas.get(field.name())); + ByteArrayOutputStream complexFeatureByteArrayOutputStream = new ByteArrayOutputStream(); + complexFeatureByteArrayOutputStream.reset(); + BinaryEncoder complexFeatureBinaryEncoder = + new EncoderFactory().binaryEncoder(complexFeatureByteArrayOutputStream, null); + complexFeatureDatumWriter.write(field.name(), complexFeatureBinaryEncoder); + complexFeatureBinaryEncoder.flush(); + encodedRecord.put(field.name(), ByteBuffer.wrap(complexFeatureByteArrayOutputStream.toByteArray())); + } + } + c.output(encodedRecord); + } + })); + } + + return featureGroupAvroRecord.apply("Convert To KV of primaryKey:GenericRecord", + ParDo.of(new DoFn>() { + @ProcessElement + public void processElement(ProcessContext c) { + List primaryKeyValues = new ArrayList<>(); + for (String primaryKey: primaryKeys) { + primaryKeyValues.add(c.element().get(primaryKey).toString()); + } + c.output(KV.of(String.join(";", primaryKeyValues), c.element())); + } + }) + ) + .apply("Sync to online feature group kafka topic", KafkaIO.write() + .withBootstrapServers(properties.get("bootstrap.servers").toString()) + .withTopic(topic) + //.withProducerConfigUpdates(properties) + .withKeySerializer(StringSerializer.class) + .withValueSerializer(GenericAvroSerializer.class) + .withInputTimestamp() + .withProducerFactoryFn(props -> { + // copy jks files from resources to dataflow workers + try { + Path keyStorePath = Paths.get(properties.get(SslConfigs.SSL_KEYSTORE_LOCATION_CONFIG)); + InputStream keyStoreStream = Objects.requireNonNull(BeamProducer.class.getClassLoader() + .getResourceAsStream(keyStorePath.getFileName().toString())); + if (!Files.exists(keyStorePath)) { + Files.copy(keyStoreStream, keyStorePath, StandardCopyOption.REPLACE_EXISTING); + } + Path trustStorePath = Paths.get(properties.get(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG)); + InputStream trustStoreStream = Objects.requireNonNull(BeamProducer.class.getClassLoader() + .getResourceAsStream(trustStorePath.getFileName().toString())); + if (!Files.exists(trustStorePath)) { + Files.copy(trustStoreStream, trustStorePath, StandardCopyOption.REPLACE_EXISTING); + } + } catch (IOException e) { + e.printStackTrace(); + } + props.putAll(properties); + BeamKafkaProducer producer = new BeamKafkaProducer(props); + producer.setHeaderMap(headerMap); + return producer; + }) + ); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureGroupEngine.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureGroupEngine.java new file mode 100644 index 000000000..c56fb2673 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureGroupEngine.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.beam.FeatureStore; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.engine.FeatureGroupEngineBase; +import lombok.SneakyThrows; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class FeatureGroupEngine extends FeatureGroupEngineBase { + + @SneakyThrows + public BeamProducer insertStream(StreamFeatureGroup streamFeatureGroup, Map writeOptions) { + return BeamEngine.getInstance().insertStream(streamFeatureGroup, writeOptions); + } + + public StreamFeatureGroup getStreamFeatureGroup(FeatureStore featureStore, String fgName, Integer fgVersion) + throws IOException, FeatureStoreException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, fgVersion, StreamFeatureGroup[].class); + + // There can be only one single feature group with a specific name and version in a feature store + // There has to be one otherwise an exception would have been thrown. + StreamFeatureGroup resultFg = streamFeatureGroups[0]; + resultFg.setFeatureStore(featureStore); + return resultFg; + } + + public List getStreamFeatureGroups(FeatureStore featureStore, String fgName) + throws FeatureStoreException, IOException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, null, StreamFeatureGroup[].class); + + return Arrays.asList(streamFeatureGroups); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureViewEngine.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureViewEngine.java new file mode 100644 index 000000000..0402d4c9b --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureViewEngine.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.beam.FeatureStore; +import com.logicalclocks.hsfs.beam.FeatureView; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.beam.constructor.Query; +import com.logicalclocks.hsfs.engine.FeatureViewEngineBase; +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.util.Date; +import java.util.List; +import java.util.Map; + +public class FeatureViewEngine extends FeatureViewEngineBase> { + @Override + public FeatureView update(FeatureView featureView) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public FeatureView get(FeatureStore featureStore, String name, Integer version) + throws FeatureStoreException, IOException { + FeatureView featureView = get(featureStore, name, version, FeatureView.class); + featureView.setFeatureStore(featureStore); + return featureView; + } + + @Override + public Query getBatchQuery(FeatureView featureView, Date startTime, Date endTime, Boolean withLabels, + Integer trainingDataVersion) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String getBatchQueryString(FeatureView featureView, Date startTime, Date endTime, Integer trainingDataVersion) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public FeatureView getOrCreateFeatureView(FeatureStore featureStore, String name, Integer version, Query query, + String description, List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData(FeatureView featureView, Date startTime, Date endTime, + Map readOptions, Integer trainingDataVersion) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/GenericAvroSerializer.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/GenericAvroSerializer.java new file mode 100644 index 000000000..556426a47 --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/GenericAvroSerializer.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.DatumWriter; +import org.apache.avro.io.EncoderFactory; +import org.apache.avro.reflect.ReflectDatumWriter; +import org.apache.kafka.common.serialization.Serializer; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class GenericAvroSerializer implements Serializer { + + @Override + public byte[] serialize(String topic, GenericRecord genericRecord) { + DatumWriter datumWriter = new ReflectDatumWriter<>(genericRecord.getSchema()); + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + byteArrayOutputStream.reset(); + + List records = new ArrayList<>(); + records.add(genericRecord); + + BinaryEncoder binaryEncoder = new EncoderFactory().binaryEncoder(byteArrayOutputStream, null); + for (GenericRecord segment: records) { + try { + datumWriter.write(segment, binaryEncoder); + } catch (IOException e) { + e.printStackTrace(); + } + } + try { + binaryEncoder.flush(); + } catch (IOException e) { + e.printStackTrace(); + } + return byteArrayOutputStream.toByteArray(); + } + +} diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/KeySerializer.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/KeySerializer.java new file mode 100644 index 000000000..d0af9a7eb --- /dev/null +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/KeySerializer.java @@ -0,0 +1,26 @@ +package com.logicalclocks.hsfs.beam.engine; + +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.common.serialization.Serializer; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class KeySerializer implements Serializer { + + List primaryKeys; + + public KeySerializer(List primaryKeys) { + this.primaryKeys = primaryKeys; + } + + @Override + public byte[] serialize(String topic, GenericRecord record) { + List primaryKeyValues = new ArrayList<>(); + for (String primaryKey: this.primaryKeys) { + primaryKeyValues.add(record.get(primaryKey).toString()); + } + return String.join(";", primaryKeyValues).getBytes(StandardCharsets.UTF_8); + } +} diff --git a/java/flink/pom.xml b/java/flink/pom.xml new file mode 100644 index 000000000..d2d7b87e0 --- /dev/null +++ b/java/flink/pom.xml @@ -0,0 +1,92 @@ + + + + hsfs-parent + com.logicalclocks + 4.0.0-SNAPSHOT + + 4.0.0 + + hsfs-flink + + + 1.17.1.0 + 2.13.4.2 + + + + + com.logicalclocks + hsfs + ${project.version} + compile + + + com.fasterxml.jackson.core + * + + + javax.xml.bind + jaxb-api + + + com.databricks + * + + + org.scala-lang + * + + + + + + org.apache.flink + flink-streaming-java + ${flink.version} + provided + + + org.apache.flink + flink-shaded-hadoop2 + + + + + + + org.apache.flink + flink-connector-kafka + ${flink.version} + provided + + + org.apache.flink + flink-shaded-hadoop2 + + + + + + + org.apache.flink + flink-avro + ${flink.version} + + + org.apache.flink + flink-shaded-hadoop2 + + + + + + com.fasterxml.jackson.core + jackson-databind + ${fasterxml.version} + + + + diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java new file mode 100644 index 000000000..3ab0dfe24 --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java @@ -0,0 +1,329 @@ +/* + 
* Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.flink; + +import com.logicalclocks.hsfs.FeatureStoreBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.TimeTravelFormat; +import com.logicalclocks.hsfs.TrainingDatasetBase; +import com.logicalclocks.hsfs.flink.constructor.Query; +import com.logicalclocks.hsfs.flink.engine.FeatureViewEngine; +import com.logicalclocks.hsfs.metadata.StorageConnectorApi; + +import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; + +import lombok.NonNull; + +import java.io.IOException; +import java.util.List; + +public class FeatureStore extends FeatureStoreBase { + + private FeatureGroupEngine featureGroupEngine; + private FeatureViewEngine featureViewEngine; + + public FeatureStore() { + storageConnectorApi = new StorageConnectorApi(); + featureViewEngine = new FeatureViewEngine(); + featureGroupEngine = new FeatureGroupEngine(); + } + + @Override + public Object createFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + + @Override + public Object getOrCreateFeatureGroup(String name, Integer integer, List primaryKeys, + boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, String description, List primaryKeys, + List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, String topicName, String notificationTopicName, String eventTime) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + /** + * Get a stream feature group object from the feature store. + * + *
+   * <p>Getting a stream feature group metadata handle enables you to interact with the feature group,
+   * such as read the data or use the `Query`-API to perform joins between feature groups and create feature
+   * views.
+   *
+   * <pre>
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices", 1);
+   * }
+   * </pre>
+ * + * @param name the name of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public StreamFeatureGroup getStreamFeatureGroup(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature group `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getStreamFeatureGroup(name, DEFAULT_VERSION); + } + + /** + * Get a stream feature group object from the feature store. + * + *
+   * <p>Getting a stream feature group metadata handle enables you to interact with the feature group,
+   * such as read the data or use the `Query`-API to perform joins between feature groups and create feature
+   * views.
+   *
+   * <pre>
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices", 1);
+   * }
+   * </pre>
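+   *
+   * <p>As a minimal follow-up sketch (the source, watermark strategy and POJO type names are
+   * illustrative only and not part of this API), the returned handle can then be used to start a
+   * streaming insert into the feature group's online topic:
+   * <pre>
+   * {@code
+   *        // hypothetical DataStream of POJOs whose fields match the feature group schema
+   *        DataStream<Transaction> transactionStream =
+   *          env.fromSource(transactionSource, watermarkStrategy, "Transaction Source");
+   *
+   *        // write the stream to the online feature store
+   *        fg.insertStream(transactionStream);
+   * }
+   * </pre>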
+ * + * @param name the name of the feature group + * @param version the version of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public StreamFeatureGroup getStreamFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureGroupEngine.getStreamFeatureGroup(this, name, version); + } + + @Override + public StreamFeatureGroup.StreamFeatureGroupBuilder createStreamFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + boolean onlineEnabled, String eventTime) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, + String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, + List primaryKeys, List partitionKeys, + String hudiPrecombineKey, boolean onlineEnabled, + StatisticsConfig statisticsConfig, + String eventTime) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object createExternalFeatureGroup() { + return null; + } + + @Override + public Object createFeatureView() { + return null; + } + + @Override + public StorageConnector getStorageConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getHopsFsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getJdbcConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOnlineStorageConnector() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getGcsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getS3Connector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getRedshiftConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getSnowflakeConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object 
getAdlsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getKafkaConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getBigqueryConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getExternalFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object sql(String query) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainingDatasets(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView getOrCreateFeatureView(String name, Query query, Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView getOrCreateFeatureView(String name, Query query, Integer version, String description, + List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * <pre>
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name", 1);
+   * }
+   * </pre>
+ * + * @param name Name of the feature view. + * @param version Version to get. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureViewEngine.get(this, name, version); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * <pre>
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name", 1);
+   * }
+   * </pre>
+ * + * @param name Name of the feature view. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature view `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getFeatureView(name, DEFAULT_VERSION); + } + + @Override + public Object getExternalFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getExternalFeatureGroup(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureView.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureView.java new file mode 100644 index 000000000..e95578acd --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureView.java @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.FeatureViewBase; +import com.logicalclocks.hsfs.flink.constructor.Query; + +import org.apache.flink.streaming.api.datastream.DataStream; + +import lombok.NoArgsConstructor; + +import java.io.IOException; +import java.text.ParseException; + +import java.util.Map; + +@NoArgsConstructor +public class FeatureView extends FeatureViewBase> { + + @Override + public void addTag(String s, Object o) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map getTags() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTag(String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteTag(String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void addTrainingDatasetTag(Integer integer, String s, Object o) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map getTrainingDatasetTags(Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainingDatasetTag(Integer integer, String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteTrainingDatasetTag(Integer integer, String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void delete() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void clean(FeatureStore featureStore, String s, Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView update(FeatureView featureView) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String getBatchQuery() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String getBatchQuery(String s, String s1) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData(String s, String s1) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData(String s, String s1, Map map) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainingData(Integer integer, Map map) + throws IOException, 
FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainTestSplit(Integer integer, Map map) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainValidationTestSplit(Integer integer, Map map) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void purgeTrainingData(Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void purgeAllTrainingData() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteTrainingDataset(Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteAllTrainingDatasets() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/HopsworksConnection.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/HopsworksConnection.java new file mode 100644 index 000000000..3d8d71d0f --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/HopsworksConnection.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HopsworksConnectionBase; +import com.logicalclocks.hsfs.SecretStore; +import com.logicalclocks.hsfs.flink.engine.FlinkEngine; +import com.logicalclocks.hsfs.metadata.HopsworksClient; + +import com.logicalclocks.hsfs.metadata.HopsworksHttpClient; +import com.logicalclocks.hsfs.metadata.HopsworksInternalClient; +import lombok.Builder; + +import software.amazon.awssdk.regions.Region; + +import java.io.IOException; + +public class HopsworksConnection extends HopsworksConnectionBase { + + @Builder + public HopsworksConnection(String host, int port, String project, Region region, SecretStore secretStore, + boolean hostnameVerification, String trustStorePath, + String certPath, String apiKeyFilePath, String apiKeyValue) + throws IOException, FeatureStoreException { + this.host = host; + this.port = port; + this.project = getProjectName(project); + this.region = region; + this.secretStore = secretStore; + this.hostnameVerification = hostnameVerification; + this.trustStorePath = trustStorePath; + this.certPath = certPath; + this.apiKeyFilePath = apiKeyFilePath; + this.apiKeyValue = apiKeyValue; + + HopsworksClient.setupHopsworksClient(host, port, region, secretStore, + hostnameVerification, trustStorePath, this.apiKeyFilePath, this.apiKeyValue); + this.projectObj = getProject(); + HopsworksClient.getInstance().setProject(this.projectObj); + if (!System.getProperties().containsKey(HopsworksInternalClient.REST_ENDPOINT_SYS)) { + HopsworksHttpClient hopsworksHttpClient = HopsworksClient.getInstance().getHopsworksHttpClient(); + hopsworksHttpClient.setTrustStorePath(FlinkEngine.getInstance().getTrustStorePath()); + hopsworksHttpClient.setKeyStorePath(FlinkEngine.getInstance().getKeyStorePath()); + hopsworksHttpClient.setCertKey(HopsworksHttpClient.readCertKey(FlinkEngine.getInstance().getCertKey())); + HopsworksClient.getInstance().setHopsworksHttpClient(hopsworksHttpClient); + } + } + + /** + * Retrieve the project feature store. + * + * @return FeatureStore object. + * @throws IOException Generic IO exception. + * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore() throws IOException, FeatureStoreException { + return getFeatureStore(rewriteFeatureStoreName(project)); + } + + /** + * Retrieve a feature store based on name. The feature store needs to be shared with + * the connection's project. The name is the project name of the feature store. + * + * @param name the name of the feature store to get the handle for + * @return FeatureStore object. + * @throws IOException Generic IO exception. + * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore(String name) throws IOException, FeatureStoreException { + return featureStoreApi.get(rewriteFeatureStoreName(name), FeatureStore.class); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java new file mode 100644 index 000000000..41b121d8b --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.flink; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.logicalclocks.hsfs.Feature; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HudiOperationType; +import com.logicalclocks.hsfs.JobConfiguration; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.constructor.QueryBase; + +import com.logicalclocks.hsfs.metadata.Statistics; + +import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.NonNull; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; + +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +@AllArgsConstructor +@JsonIgnoreProperties(ignoreUnknown = true) +public class StreamFeatureGroup extends FeatureGroupBase> { + + protected FeatureGroupEngine featureGroupEngine = new FeatureGroupEngine(); + + @Builder + public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, + List primaryKeys, List partitionKeys, String hudiPrecombineKey, + boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, + String onlineTopicName, String topicName, String notificationTopicName, String eventTime) { + this(); + this.featureStore = featureStore; + this.name = name; + this.version = version; + this.description = description; + this.primaryKeys = primaryKeys != null + ? primaryKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.partitionKeys = partitionKeys != null + ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; + this.onlineEnabled = onlineEnabled; + this.features = features; + this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); + this.onlineTopicName = onlineTopicName; + this.topicName = topicName; + this.notificationTopicName = notificationTopicName; + this.eventTime = eventTime; + } + + public StreamFeatureGroup() { + this.type = "streamFeatureGroupDTO"; + } + + // used for updates + public StreamFeatureGroup(Integer id, String description, List features) { + this(); + this.id = id; + this.description = description; + this.features = features; + } + + public StreamFeatureGroup(FeatureStore featureStore, int id) { + this(); + this.featureStore = featureStore; + this.id = id; + } + + @Override + public DataStream read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(boolean online, Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(String wallclockTime, Map readOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase asOf(String wallclockTime) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase asOf(String wallclockTime, String excludeUntil) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(int numRows, boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData) throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Storage storage) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Storage storage, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream 
featureData, boolean online, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, HudiOperationType hudiOperationType) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Storage storage, boolean online, HudiOperationType hudiOperationType, + Map writeOptions) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, JobConfiguration jobConfiguration) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, boolean online, Map writeOptions, + JobConfiguration jobConfiguration) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void commitDeleteRecord(DataStream featureData) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void commitDeleteRecord(DataStream featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails() throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails(Integer integer) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails(String limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails(String wallclockTime, Integer limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase select(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectAll() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectExceptFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectExcept(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + /** + * Ingest a feature data to the online feature store using Flink DataStream API. Currently, only POJO + * types as feature data type are supported. + * + *
+   * <pre>
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *
+   *        // get feature group handle
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("card_transactions", 1);
+   *
+   *        // read stream from the source and aggregate stream
+   *        DataStream aggregationStream =
+   *          env.fromSource(transactionSource, customWatermark, "Transaction Kafka Source")
+   *          .keyBy(r -> r.getCcNum())
+   *          .window(SlidingEventTimeWindows.of(Time.minutes(windowLength), Time.minutes(1)))
+   *          .aggregate(new TransactionCountAggregate());
+   *
+   *        // insert streaming feature data
+   *        fg.insertStream(aggregationStream);
+   * }
+   * </pre>
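+   *
+   * <p>A minimal sketch of the kind of POJO such a stream is expected to carry (class and field names
+   * are illustrative only; the fields should mirror the feature group schema):
+   * <pre>
+   * {@code
+   *        public class TransactionCountAggregate {
+   *          private String ccNum;     // primary key feature
+   *          private Long txCount;     // aggregated feature value
+   *          private Long windowEnd;   // event time feature
+   *          // public no-argument constructor and getters/setters omitted for brevity
+   *        }
+   * }
+   * </pre>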
+ * + * @param featureData Features in Streaming Dataframe to be saved. + * @return DataStreamSink object. + */ + @Override + public DataStreamSink insertStream(DataStream featureData) throws Exception { + return featureGroupEngine.insertStream(this, featureData, null); + } + + @Override + public DataStreamSink insertStream(DataStream featureData, Map writeOptions) throws Exception { + return featureGroupEngine.insertStream(this, featureData, writeOptions); + } + + @Override + public Object insertStream(DataStream featureData, String queryName) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, Map writeOptions) + throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, + String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData,String queryName, String outputMode, + boolean awaitTermination, Long timeout) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation) + throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation, + Map writeOptions) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, boolean awaitTermination, + String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, boolean awaitTermination, + Long timeout, String checkpointLocation, Map writeOptions, + JobConfiguration jobConfiguration) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void updateFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void updateFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void appendFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void appendFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Statistics computeStatistics() throws FeatureStoreException, IOException, ParseException { + throw new 
UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Statistics computeStatistics(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Statistics getStatistics() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/FsQuery.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/FsQuery.java new file mode 100644 index 000000000..c9527119d --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/FsQuery.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.flink.constructor; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.logicalclocks.hsfs.constructor.FsQueryBase; +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import lombok.AllArgsConstructor; + +import java.util.Map; + +@JsonIgnoreProperties(ignoreUnknown = true) +@AllArgsConstructor +public class FsQuery extends FsQueryBase { + @Override + public void registerOnDemandFeatureGroups() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void registerHudiFeatureGroups(Map readOptions) { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/Query.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/Query.java new file mode 100644 index 000000000..4d1c85359 --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/Query.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.constructor; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.constructor.QueryBase; + +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import lombok.NoArgsConstructor; +import org.apache.flink.streaming.api.datastream.DataStream; + +import java.io.IOException; +import java.util.Map; + +@NoArgsConstructor +public class Query extends QueryBase> { + + @Override + public String sql() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String sql(Storage storage) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object read(boolean online, Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(int i) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(boolean online, int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureGroupEngine.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureGroupEngine.java new file mode 100644 index 000000000..7fe3d0b82 --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureGroupEngine.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.engine.FeatureGroupEngineBase; + +import com.logicalclocks.hsfs.flink.FeatureStore; +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import lombok.SneakyThrows; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class FeatureGroupEngine extends FeatureGroupEngineBase { + + @SneakyThrows + public DataStreamSink insertStream(StreamFeatureGroup streamFeatureGroup, DataStream featureData, + Map writeOptions) { + return FlinkEngine.getInstance().writeDataStream(streamFeatureGroup, featureData, writeOptions); + } + + public StreamFeatureGroup getStreamFeatureGroup(FeatureStore featureStore, String fgName, Integer fgVersion) + throws IOException, FeatureStoreException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, fgVersion, StreamFeatureGroup[].class); + + // There can be only one single feature group with a specific name and version in a feature store + // There has to be one otherwise an exception would have been thrown. + StreamFeatureGroup resultFg = streamFeatureGroups[0]; + resultFg.setFeatureStore(featureStore); + return resultFg; + } + + public List getStreamFeatureGroups(FeatureStore featureStore, String fgName) + throws FeatureStoreException, IOException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, null, StreamFeatureGroup[].class); + + return Arrays.asList(streamFeatureGroups); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureViewEngine.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureViewEngine.java new file mode 100644 index 000000000..6562b4ae4 --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureViewEngine.java @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.engine.FeatureViewEngineBase; + +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import com.logicalclocks.hsfs.flink.constructor.Query; +import com.logicalclocks.hsfs.flink.FeatureView; +import com.logicalclocks.hsfs.flink.FeatureStore; + +import org.apache.flink.streaming.api.datastream.DataStream; + +import java.io.IOException; +import java.util.Date; +import java.util.List; +import java.util.Map; + +public class FeatureViewEngine extends FeatureViewEngineBase> { + + @Override + public FeatureView update(FeatureView featureView) throws FeatureStoreException, IOException { + featureViewApi.update(featureView, FeatureView.class); + return featureView; + } + + @Override + public FeatureView get(FeatureStore featureStore, String name, Integer version) + throws FeatureStoreException, IOException { + FeatureView featureView = get(featureStore, name, version, FeatureView.class); + featureView.setFeatureStore(featureStore); + return featureView; + } + + @Override + public Query getBatchQuery(FeatureView featureView, Date date, Date date1, Boolean withLabels, Integer integer) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String getBatchQueryString(FeatureView featureView, Date startTime, Date endTime, Integer trainingDataVersion) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView getOrCreateFeatureView(FeatureStore featureStore, String name, Integer version, Query query, + String description, List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData(FeatureView featureView, Date startTime, Date endTime, Map readOptions, + Integer trainingDataVersion) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FlinkEngine.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FlinkEngine.java new file mode 100644 index 000000000..9e0645e96 --- /dev/null +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FlinkEngine.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ *
+ */
+
+package com.logicalclocks.hsfs.flink.engine;
+
+import com.google.common.base.Strings;
+import com.logicalclocks.hsfs.FeatureGroupBase;
+import com.logicalclocks.hsfs.FeatureStoreException;
+import com.logicalclocks.hsfs.StorageConnector;
+import com.logicalclocks.hsfs.engine.EngineBase;
+import com.logicalclocks.hsfs.flink.StreamFeatureGroup;
+
+import com.logicalclocks.hsfs.metadata.HopsworksInternalClient;
+import lombok.Getter;
+
+import org.apache.avro.generic.GenericRecord;
+import org.apache.flink.configuration.ConfigOption;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.GlobalConfiguration;
+import org.apache.flink.connector.base.DeliveryGuarantee;
+import org.apache.flink.connector.kafka.sink.KafkaSink;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.DataStreamSink;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.util.FileUtils;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+import static org.apache.flink.configuration.ConfigOptions.key;
+
+public class FlinkEngine extends EngineBase {
+  private static FlinkEngine INSTANCE = null;
+
+  public static synchronized FlinkEngine getInstance() throws FeatureStoreException {
+    if (INSTANCE == null) {
+      INSTANCE = new FlinkEngine();
+    }
+    return INSTANCE;
+  }
+
+  @Getter
+  private StreamExecutionEnvironment streamExecutionEnvironment;
+
+  private final Configuration flinkConfig = GlobalConfiguration.loadConfiguration();
+  private final ConfigOption<String> keyStorePath =
+      key("flink.hadoop.hops.ssl.keystore.name")
+          .stringType()
+          .defaultValue("trustStore.jks")
+          .withDescription("path to keyStore.jks");
+  private final ConfigOption<String> trustStorePath =
+      key("flink.hadoop.hops.ssl.truststore.name")
+          .stringType()
+          .defaultValue("trustStore.jks")
+          .withDescription("path to trustStore.jks");
+  private final ConfigOption<String> materialPasswdPath =
+      key("flink.hadoop.hops.ssl.keystores.passwd.name")
+          .stringType()
+          .defaultValue("material_passwd")
+          .withDescription("path to material_passwd");
+
+  private FlinkEngine() throws FeatureStoreException {
+    streamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
+    // Configure the streamExecutionEnvironment
+    streamExecutionEnvironment.getConfig().enableObjectReuse();
+  }
+
+  public DataStreamSink<?> writeDataStream(StreamFeatureGroup streamFeatureGroup, DataStream<?> dataStream,
+                                           Map<String, String> writeOptions)
+      throws FeatureStoreException, IOException {
+
+    DataStream<Object> genericDataStream = (DataStream<Object>) dataStream;
+    Properties properties = new Properties();
+    properties.putAll(getKafkaConfig(streamFeatureGroup, writeOptions));
+
+    KafkaSink<GenericRecord> sink = KafkaSink.<GenericRecord>builder()
+        .setBootstrapServers(properties.getProperty("bootstrap.servers"))
+        .setKafkaProducerConfig(properties)
+        .setRecordSerializer(new KafkaRecordSerializer(streamFeatureGroup))
+        .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
+        .build();
+    Map<String, String> complexFeatureSchemas = new HashMap<>();
+    for (String featureName: streamFeatureGroup.getComplexFeatures()) {
+      complexFeatureSchemas.put(featureName, streamFeatureGroup.getFeatureAvroSchema(featureName));
+    }
+
+    DataStream<GenericRecord> avroRecordDataStream =
+        genericDataStream.map(new PojoToAvroRecord<>(
+            streamFeatureGroup.getDeserializedAvroSchema(),
+            streamFeatureGroup.getDeserializedEncodedAvroSchema(),
+            complexFeatureSchemas))
+        .returns(
+            new GenericRecordAvroTypeInfo(streamFeatureGroup.getDeserializedEncodedAvroSchema())
+        );
+
+    return avroRecordDataStream.sinkTo(sink);
+  }
+
+  @Override
+  public String addFile(String filePath) throws IOException {
+    if (Strings.isNullOrEmpty(filePath)) {
+      return filePath;
+    }
+    // this is used for unit testing
+    if (!filePath.startsWith("file://")) {
+      filePath = "hdfs://" + filePath;
+    }
+    String targetPath = FileUtils.getCurrentWorkingDirectory().toString()
+        + filePath.substring(filePath.lastIndexOf("/"));
+    FileUtils.copy(new Path(filePath), new Path(targetPath), false);
+    return targetPath;
+  }
+
+  @Override
+  public Map<String, String> getKafkaConfig(FeatureGroupBase featureGroup, Map<String, String> writeOptions)
+      throws FeatureStoreException, IOException {
+    boolean external = !(System.getProperties().containsKey(HopsworksInternalClient.REST_ENDPOINT_SYS)
+        || (writeOptions != null
+        && Boolean.parseBoolean(writeOptions.getOrDefault("internal_kafka", "false"))));
+
+    StorageConnector.KafkaConnector storageConnector =
+        storageConnectorApi.getKafkaStorageConnector(featureGroup.getFeatureStore(), external);
+    storageConnector.setSslTruststoreLocation(addFile(storageConnector.getSslTruststoreLocation()));
+    storageConnector.setSslKeystoreLocation(addFile(storageConnector.getSslKeystoreLocation()));
+
+    Map<String, String> config = storageConnector.kafkaOptions();
+
+    if (writeOptions != null) {
+      config.putAll(writeOptions);
+    }
+    config.put("enable.idempotence", "false");
+    return config;
+  }
+
+  public String getTrustStorePath() {
+    return flinkConfig.getString(trustStorePath);
+  }
+
+  public String getKeyStorePath() {
+    return flinkConfig.getString(keyStorePath);
+  }
+
+  public String getCertKey() {
+    return flinkConfig.getString(materialPasswdPath);
+  }
+}
diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/KafkaRecordSerializer.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/KafkaRecordSerializer.java
new file mode 100644
index 000000000..b1729f75d
--- /dev/null
+++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/KafkaRecordSerializer.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023. Hopsworks AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and limitations under the License.
+ *
+ */
+
+package com.logicalclocks.hsfs.flink.engine;
+
+import com.logicalclocks.hsfs.FeatureStoreException;
+import com.logicalclocks.hsfs.flink.StreamFeatureGroup;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.reflect.ReflectDatumWriter;
+import org.apache.flink.api.common.serialization.SerializationSchema;
+import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
+import org.apache.kafka.clients.producer.ProducerRecord;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class KafkaRecordSerializer implements KafkaRecordSerializationSchema<GenericRecord> {
+
+  private final String topic;
+  private final List<String> primaryKeys;
+  private final Map<String, byte[]> headerMap = new HashMap<>();
+
+  KafkaRecordSerializer(StreamFeatureGroup streamFeatureGroup) throws FeatureStoreException, IOException {
+    this.topic = streamFeatureGroup.getOnlineTopicName();
+    this.primaryKeys = streamFeatureGroup.getPrimaryKeys();
+
+    headerMap.put("projectId",
+        String.valueOf(streamFeatureGroup.getFeatureStore().getProjectId()).getBytes(StandardCharsets.UTF_8));
+    headerMap.put("featureGroupId", String.valueOf(streamFeatureGroup.getId()).getBytes(StandardCharsets.UTF_8));
+    headerMap.put("subjectId",
+        String.valueOf(streamFeatureGroup.getSubject().getId()).getBytes(StandardCharsets.UTF_8));
+  }
+
+  @Override
+  public void open(SerializationSchema.InitializationContext context,
+                   KafkaRecordSerializationSchema.KafkaSinkContext sinkContext) {
+    // TODO not needed
+  }
+
+  @Override
+  public ProducerRecord<byte[], byte[]> serialize(GenericRecord genericRecord,
+                                                  KafkaRecordSerializationSchema.KafkaSinkContext context,
+                                                  Long timestamp) {
+    byte[] key = this.serializeKey(genericRecord);
+    byte[] value = this.serializeValue(genericRecord);
+    ProducerRecord<byte[], byte[]> producerRecord = new ProducerRecord<>(topic, null, timestamp, key, value);
+    for (Map.Entry<String, byte[]> entry: headerMap.entrySet()) {
+      producerRecord.headers().add(entry.getKey(), entry.getValue());
+    }
+    return producerRecord;
+  }
+
+  public byte[] serializeKey(GenericRecord genericRecord) {
+    List<String> primaryKeyValues = new ArrayList<>();
+    for (String primaryKey: primaryKeys) {
+      primaryKeyValues.add(genericRecord.get(primaryKey).toString());
+    }
+    return String.join(";", primaryKeyValues).getBytes(StandardCharsets.UTF_8);
+  }
+
+  public byte[] serializeValue(GenericRecord genericRecord) {
+    DatumWriter<GenericRecord> datumWriter = new ReflectDatumWriter<>(genericRecord.getSchema());
+    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+    byteArrayOutputStream.reset();
+
+    BinaryEncoder binaryEncoder = new EncoderFactory().binaryEncoder(byteArrayOutputStream, null);
+    try {
+      datumWriter.write(genericRecord, binaryEncoder);
+      binaryEncoder.flush();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    return byteArrayOutputStream.toByteArray();
+  }
+}
\ No newline at end of file
diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/PojoToAvroRecord.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/PojoToAvroRecord.java
new file mode 100644
index 000000000..d2a37c26e
--- /dev/null
+++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/PojoToAvroRecord.java
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2023. Hopsworks AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and limitations under the License.
+ *
+ */
+
+package com.logicalclocks.hsfs.flink.engine;
+
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaValidationException;
+import org.apache.avro.SchemaValidatorBuilder;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.flink.api.common.functions.RichMapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class PojoToAvroRecord<T> extends RichMapFunction<T, GenericRecord> implements
+    ResultTypeQueryable<GenericRecord> {
+
+  private final String schema;
+  private final String encodedSchema;
+  private final Map<String, String> complexFeatureSchemas;
+
+  // org.apache.avro.Schema$Field is not serializable. Create in open() and reused later on
+  private transient Schema deserializedSchema;
+  private transient Schema deserializedEncodedSchema;
+  private transient Map<String, Schema> deserializedComplexFeatureSchemas;
+  private transient GenericRecordAvroTypeInfo producedType;
+
+  public PojoToAvroRecord(Schema schema, Schema encodedSchema, Map<String, String> complexFeatureSchemas) {
+    this.schema = schema.toString();
+    this.encodedSchema = encodedSchema.toString();
+    this.complexFeatureSchemas = complexFeatureSchemas;
+  }
+
+  @Override
+  public GenericRecord map(T input) throws Exception {
+
+    // validate
+    validatePojoAgainstSchema(input, this.deserializedSchema);
+
+    // Create a new Avro record based on the given schema
+    GenericRecord record = new GenericData.Record(this.deserializedEncodedSchema);
+    // Get the fields of the POJO class and populate the fields of the Avro record
+    List<Field> fields =
+        Arrays.stream(input.getClass().getDeclaredFields())
+            .filter(f -> f.getName().equals("SCHEMA$"))
+            .collect(Collectors.toList());
+    if (!fields.isEmpty()) {
+      // it means POJO was generated from avro schema
+      Field schemaField = input.getClass().getDeclaredField("SCHEMA$");
+      schemaField.setAccessible(true);
+      Schema fieldSchema = (Schema) schemaField.get(null);
+      for (Schema.Field field : fieldSchema.getFields()) {
+        String fieldName = field.name();
+        Field pojoField = input.getClass().getDeclaredField(fieldName);
+        pojoField.setAccessible(true);
+        Object fieldValue = pojoField.get(input);
+        populateAvroRecord(record, fieldName, fieldValue);
+      }
+    } else {
+      // plain POJO: copy every declared field into the Avro record
+      for (Field field : input.getClass().getDeclaredFields()) {
+        field.setAccessible(true);
+        String fieldName = field.getName();
+        Object fieldValue = field.get(input);
+        populateAvroRecord(record, fieldName, fieldValue);
+      }
+    }
+    return record;
+  }
+
+  @Override
+  public void open(Configuration configuration) throws Exception {
+    super.open(configuration);
+    this.deserializedSchema = new Schema.Parser().parse(this.schema);
+    this.deserializedEncodedSchema = new Schema.Parser().parse(this.encodedSchema);
+    this.deserializedComplexFeatureSchemas = new HashMap<>();
+    for (String featureName: this.complexFeatureSchemas.keySet()) {
+      deserializedComplexFeatureSchemas.put(featureName,
+          new Schema.Parser().parse(this.complexFeatureSchemas.get(featureName)));
+    }
+    this.producedType = new GenericRecordAvroTypeInfo(deserializedEncodedSchema);
+  }
+
+  @Override
+  public TypeInformation<GenericRecord> getProducedType() {
+    return producedType;
+  }
+
+  private void populateAvroRecord(GenericRecord record, String fieldName, Object fieldValue) throws IOException {
+    if (this.deserializedComplexFeatureSchemas.containsKey(fieldName)) {
+      GenericDatumWriter<Object> complexFeatureDatumWriter =
+          new GenericDatumWriter<>(this.deserializedComplexFeatureSchemas.get(fieldName));
+      ByteArrayOutputStream complexFeatureByteArrayOutputStream = new ByteArrayOutputStream();
+      complexFeatureByteArrayOutputStream.reset();
+      BinaryEncoder complexFeatureBinaryEncoder =
+          new EncoderFactory().binaryEncoder(complexFeatureByteArrayOutputStream, null);
+      complexFeatureDatumWriter.write(fieldValue, complexFeatureBinaryEncoder);
+      complexFeatureBinaryEncoder.flush();
+      record.put(fieldName, ByteBuffer.wrap(complexFeatureByteArrayOutputStream.toByteArray()));
+      complexFeatureByteArrayOutputStream.flush();
+      complexFeatureByteArrayOutputStream.close();
+    } else {
+      record.put(fieldName, fieldValue);
+    }
+  }
+
+  private void validatePojoAgainstSchema(Object pojo, Schema avroSchema) throws SchemaValidationException {
+    Schema pojoSchema = ReflectData.get().getSchema(pojo.getClass());
SchemaValidatorBuilder builder = new SchemaValidatorBuilder(); + builder.canReadStrategy().validateAll().validate(avroSchema, Collections.singletonList(pojoSchema)); + } +} diff --git a/java/hsfs/pom.xml b/java/hsfs/pom.xml new file mode 100644 index 000000000..56847be5d --- /dev/null +++ b/java/hsfs/pom.xml @@ -0,0 +1,100 @@ + + + + hsfs-parent + com.logicalclocks + 4.0.0-SNAPSHOT + + 4.0.0 + + hsfs + + + + 2.2.11 + + + + + + com.fasterxml.jackson.core + jackson-databind + ${fasterxml.jackson.databind.version} + provided + + + + + org.apache.avro + avro + ${avro.version} + + + + + javax.xml.bind + jaxb-api + ${javax.version} + provided + + + + software.amazon.awssdk + ssm + ${awssdk.version} + + + org.apache.httpcomponents + * + + + + + + software.amazon.awssdk + sts + ${awssdk.version} + + + org.apache.httpcomponents + * + + + + + + software.amazon.awssdk + secretsmanager + ${awssdk.version} + + + org.apache.httpcomponents + * + + + + + + com.databricks + dbutils-api_${scala-short.version} + ${dbutils.version} + provided + + + + + org.scala-lang + scala-library + ${scala.version} + + + + + commons-io + commons-io + 2.11.0 + + + diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/DataFormat.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/DataFormat.java new file mode 100644 index 000000000..89aab31c9 --- /dev/null +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/DataFormat.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020-2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public enum DataFormat { + @JsonProperty("csv") + CSV, + @JsonProperty("tsv") + TSV, + @JsonProperty("parquet") + PARQUET, + @JsonProperty("avro") + AVRO, + @JsonProperty("image") + IMAGE, + @JsonProperty("orc") + ORC, + @JsonProperty("tfrecords") + TFRECORDS, + @JsonProperty("tfrecord") + TFRECORD +} diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/DeltaStreamerJobConf.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/DeltaStreamerJobConf.java new file mode 100644 index 000000000..7990f2e42 --- /dev/null +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/DeltaStreamerJobConf.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022-2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs; + +import com.logicalclocks.hsfs.metadata.Option; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; + +import java.util.List; + +@NoArgsConstructor +@AllArgsConstructor +public class DeltaStreamerJobConf { + + @Getter + @Setter + private List