diff --git a/auto_doc.py b/auto_doc.py index 9b2a6ad00..1fd5b40f8 100644 --- a/auto_doc.py +++ b/auto_doc.py @@ -27,6 +27,9 @@ "mr_api": ["hopsworks.project.Project.get_model_registry"], "ms_api": ["hopsworks.project.Project.get_model_serving"], }, + "api/udf.md": { + "udf": ["hopsworks.udf"], + }, "api/connection.md": { "connection_create": ["hopsworks.connection.Connection.connection"], "connection_properties": keras_autodoc.get_properties( diff --git a/docs/templates/api/udf.md b/docs/templates/api/udf.md new file mode 100644 index 000000000..47106cb1f --- /dev/null +++ b/docs/templates/api/udf.md @@ -0,0 +1,3 @@ +# UDF API + +{{udf}} \ No newline at end of file diff --git a/hsfs/.github/pull_request_template.md b/hsfs/.github/pull_request_template.md new file mode 100644 index 000000000..2a5a1e5ec --- /dev/null +++ b/hsfs/.github/pull_request_template.md @@ -0,0 +1,32 @@ +This PR adds/fixes/changes... +- please summarize your changes to the code +- and make sure to include all changes to user-facing APIs + +JIRA Issue: - + +Priority for Review: - + +Related PRs: - + +**How Has This Been Tested?** + +- [ ] Unit Tests +- [ ] Integration Tests +- [ ] Manual Tests on VM + + +**Checklist For The Assigned Reviewer:** + +``` +- [ ] Checked if merge conflicts with master exist +- [ ] Checked if stylechecks for Java and Python pass +- [ ] Checked if all docstrings were added and/or updated appropriately +- [ ] Ran spellcheck on docstring +- [ ] Checked if guides & concepts need to be updated +- [ ] Checked if naming conventions for parameters and variables were followed +- [ ] Checked if private methods are properly declared and used +- [ ] Checked if hard-to-understand areas of code are commented +- [ ] Checked if tests are effective +- [ ] Built and deployed changes on dev VM and tested manually +- [x] (Checked if all type annotations were added and/or updated appropriately) +``` diff --git a/hsfs/.github/workflows/java-ut.yml b/hsfs/.github/workflows/java-ut.yml new file mode 100644 index 000000000..f83f62caf --- /dev/null +++ b/hsfs/.github/workflows/java-ut.yml @@ -0,0 +1,62 @@ +name: java + +on: pull_request + +jobs: + unit_tests_utc: + name: Java Unit Tests + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'adopt' + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Test + working-directory: ./java + run: mvn clean test + + unit_tests_local: + name: Java Unit Tests (Local TZ) + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone Europe/Amsterdam + + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'adopt' + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Test + working-directory: ./java + run: mvn clean test diff --git a/hsfs/.github/workflows/mkdocs-master.yml b/hsfs/.github/workflows/mkdocs-master.yml new file mode 100644 index 000000000..1c904ad28 --- /dev/null +++ b/hsfs/.github/workflows/mkdocs-master.yml @@ -0,0 +1,53 @@ 
+name: mkdocs-master + +on: pull_request + +jobs: + publish-master: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: set dev version + working-directory: ./java + run: echo "DEV_VERSION=$(mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -Ev 'Download|INFO|WARNING')" >> $GITHUB_ENV + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: install deps + working-directory: ./python + run: cp ../README.md . && pip3 install -r ../requirements-docs.txt && pip3 install -e .[python,dev] + + - name: generate autodoc + run: python3 auto_doc.py + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: "8" + distribution: "adopt" + + - name: Build java doc documentation + working-directory: ./java + run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../docs/javadoc + + - name: setup git + run: | + git config --global user.name Mike + git config --global user.email mike@docs.hopsworks.ai + + - name: mike deploy docs + run: mike deploy ${{ env.DEV_VERSION }} dev -u diff --git a/hsfs/.github/workflows/mkdocs-release.yml b/hsfs/.github/workflows/mkdocs-release.yml new file mode 100644 index 000000000..66ca638ae --- /dev/null +++ b/hsfs/.github/workflows/mkdocs-release.yml @@ -0,0 +1,59 @@ +name: mkdocs-release + +on: + push: + branches: [branch-*] + +jobs: + publish-release: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: set major/minor/bugfix release version + working-directory: ./java + run: echo "RELEASE_VERSION=$(mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -Ev 'Download|INFO|WARNING')" >> $GITHUB_ENV + + - name: set major/minor release version + run: echo "MAJOR_VERSION=$(echo $RELEASE_VERSION | sed 's/^\([0-9]*\.[0-9]*\).*$/\1/')" >> $GITHUB_ENV + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: install deps + working-directory: ./python + run: cp ../README.md . 
&& pip3 install -r ../requirements-docs.txt && pip3 install -e .[python,dev] + + - name: generate autodoc + run: python3 auto_doc.py + + - name: Cache local Maven repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: "8" + distribution: "adopt" + + - name: Build java doc documentation + working-directory: ./java + run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../docs/javadoc + + - name: setup git + run: | + git config --global user.name Mike + git config --global user.email mike@docs.hopsworks.ai + - name: mike deploy docs + run: | + mike deploy ${{ env.RELEASE_VERSION }} ${{ env.MAJOR_VERSION }} -u --push + mike alias ${{ env.RELEASE_VERSION }} latest -u --push diff --git a/hsfs/.github/workflows/optional-dependency.yml b/hsfs/.github/workflows/optional-dependency.yml new file mode 100644 index 000000000..547b02029 --- /dev/null +++ b/hsfs/.github/workflows/optional-dependency.yml @@ -0,0 +1,29 @@ +name: optional-dependency + +on: pull_request + +jobs: + unit_tests_no_great_expectations: + name: Unit Testing (No Great Expectations) + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev-no-opt] + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + run: pytest python/tests \ No newline at end of file diff --git a/hsfs/.github/workflows/python-lint.yml b/hsfs/.github/workflows/python-lint.yml new file mode 100644 index 000000000..f638b0128 --- /dev/null +++ b/hsfs/.github/workflows/python-lint.yml @@ -0,0 +1,222 @@ +name: python + +on: pull_request + +jobs: + lint_stylecheck: + name: Lint and Stylecheck + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Get all changed files + id: get-changed-files + uses: tj-actions/changed-files@v44 + with: + files_yaml: | + src: + - 'python/**/*.py' + - '!python/tests/**/*.py' + test: + - 'python/tests/**/*.py' + + - name: install deps + run: pip install ruff==0.4.2 + + - name: ruff on python files + if: steps.get-changed-files.outputs.src_any_changed == 'true' + env: + SRC_ALL_CHANGED_FILES: ${{ steps.get-changed-files.outputs.src_all_changed_files }} + run: ruff check --output-format=github $SRC_ALL_CHANGED_FILES + + - name: ruff on test files + if: steps.get-changed-files.outputs.test_any_changed == 'true' + env: + TEST_ALL_CHANGED_FILES: ${{ steps.get-changed-files.outputs.test_all_changed_files }} + run: ruff check --output-format=github $TEST_ALL_CHANGED_FILES + + - name: ruff format --check $ALL_CHANGED_FILES + env: + ALL_CHANGED_FILES: ${{ steps.get-changed-files.outputs.all_changed_files }} + run: ruff format $ALL_CHANGED_FILES + + unit_tests_ubuntu_utc: + name: Unit Testing (Ubuntu) + needs: lint_stylecheck + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: 
actions/setup-python@v5 + name: Setup Python + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev] + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + run: pytest python/tests + + unit_tests_ubuntu_pandas: + name: Unit Testing (Ubuntu) (Pandas 1.x) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone UTC + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v4 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev-pandas1] + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + run: pytest python/tests + + unit_tests_ubuntu_local: + name: Unit Testing (Ubuntu) (Local TZ) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - name: Set Timezone + run: sudo timedatectl set-timezone Europe/Amsterdam + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e 'python[python,dev]' + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + run: pytest python/tests + + unit_tests_ubuntu_typechecked: + name: Typechecked Unit Testing (Ubuntu) + needs: lint_stylecheck + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v4 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev,docs] + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + HOPSWORKS_RUN_WITH_TYPECHECK: "true" + run: pytest python/tests + continue-on-error: true + + unit_tests_windows_utc: + name: Unit Testing (Windows) + needs: lint_stylecheck + runs-on: windows-latest + + steps: + - name: Set Timezone + run: tzutil /s "UTC" + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev] + + - name: Display Python version + run: python --version + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + run: pytest python/tests + + unit_tests_windows_local: + name: Unit Testing (Windows) (Local TZ) + needs: lint_stylecheck + runs-on: windows-latest + + steps: + - name: Set Timezone + run: tzutil /s "W. 
Europe Standard Time" + + - uses: actions/checkout@v4 + - name: Copy README + run: cp README.md python/ + + - uses: actions/setup-python@v5 + name: Setup Python + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "python/setup.py" + - run: pip install -e python[python,dev] + + - name: Display Python version + run: python --version + + - name: Display pip freeze + run: pip freeze + + - name: Run Pytest suite + env: + ENABLE_HOPSWORKS_USAGE: "false" + run: pytest python/tests diff --git a/hsfs/.gitignore b/hsfs/.gitignore new file mode 100644 index 000000000..a8b4c5683 --- /dev/null +++ b/hsfs/.gitignore @@ -0,0 +1,145 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +python/README.md +python/LICENSE + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ +.ruff_cache/ +bigquery.json +metastore_db/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# Mike Javadoc +docs/javadoc + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Java +.idea +.vscode +*.iml +target/ + +# Mac +.DS_Store + +# mkdocs intemediate files +docs/generated + +# Test artifacts +keyFile.json + +# delombok dir +delombok + +# dev scripts dir +dev_scripts/ +dev_tools/ diff --git a/hsfs/CONTRIBUTING.md b/hsfs/CONTRIBUTING.md new file mode 100644 index 000000000..0df3de08e --- /dev/null +++ b/hsfs/CONTRIBUTING.md @@ -0,0 +1,220 @@ +## Python development setup + +--- + +- Fork and clone the repository + +- Create a new Python environment with your favourite environment manager (e.g. virtualenv or conda) and Python 3.9 (newer versions will return a library conflict in `auto_doc.py`) + +- Install repository in editable mode with development dependencies: + + ```bash + cd python + pip install -e ".[python,dev]" + ``` + +- Install [pre-commit](https://pre-commit.com/) and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. The Feature Store uses pre-commit to ensure code-style and code formatting through [ruff](https://docs.astral.sh/ruff/). Run the following commands from the `python` directory: + + ```bash + cd python + pip install --user pre-commit + pre-commit install + ``` + + Afterwards, pre-commit will run whenever you commit. 
+ +- To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use `ruff`, or run it via the command line: + + ```bash + # linting + ruff check python --fix + # formatting + ruff format python + ``` + +### Python documentation + +We follow a few best practices for writing the Python documentation: + +1. Use the google docstring style: + + ```python + """[One Line Summary] + + [Extended Summary] + + [!!! example + import xyz + ] + + # Arguments + arg1: Type[, optional]. Description[, defaults to `default`] + arg2: Type[, optional]. Description[, defaults to `default`] + + # Returns + Type. Description. + + # Raises + Exception. Description. + """ + ``` + + If Python 3 type annotations are used, they are inserted automatically. + +2. Feature store entity engine methods (e.g. FeatureGroupEngine etc.) only require a single line docstring. +3. REST Api implementations (e.g. FeatureGroupApi etc.) should be fully documented with docstrings without defaults. +4. Public Api such as metadata objects should be fully documented with defaults. + +#### Setup and Build Documentation + +We use `mkdocs` together with `mike` ([for versioning](https://github.com/jimporter/mike/)) to build the documentation and a plugin called `keras-autodoc` to auto generate Python API documentation from docstrings. + +**Background about `mike`:** +`mike` builds the documentation and commits it as a new directory to the gh-pages branch. Each directory corresponds to one version of the documentation. Additionally, `mike` maintains a json in the root of gh-pages with the mappings of versions/aliases for each of the directories available. With aliases you can define extra names like `dev` or `latest`, to indicate stable and unstable releases. + +1. Currently we are using our own version of `keras-autodoc` + + ```bash + pip install git+https://github.com/logicalclocks/keras-autodoc + ``` + +2. Install HSFS with `docs` extras: + + ```bash + pip install -e ".[python,dev]" && pip install -r ../requirements-docs.txt + ``` + +3. To build the docs, first run the auto doc script: + + ```bash + cd .. + python auto_doc.py + ``` + +##### Option 1: Build only current version of docs + +4. Either build the docs, or serve them dynamically: + + Note: Links and pictures might not resolve properly later on when checking with this build. + The reason for that is that the docs are deployed with versioning on docs.hopsworks.ai and + therefore another level is added to all paths, e.g. `docs.hopsworks.ai/[version-or-alias]`. + Using relative links should not be affected by this, however, building the docs with version + (Option 2) is recommended. + + ```bash + mkdocs build + # or + mkdocs serve + ``` + +##### Option 2 (Preferred): Build multi-version doc with `mike` + +###### Versioning on docs.hopsworks.ai + +On docs.hopsworks.ai we implement the following versioning scheme: + +- current master branches (e.g. of hsfs corresponding to master of Hopsworks): rendered as current Hopsworks snapshot version, e.g. **2.2.0-SNAPSHOT [dev]**, where `dev` is an alias to indicate that this is an unstable version. +- the latest release: rendered with full current version, e.g. **2.1.5 [latest]** with `latest` alias to indicate that this is the latest stable release. +- previous stable releases: rendered without alias, e.g. **2.1.4**. + +###### Build Instructions + +4. 
For this you can either check out and make a local copy of the `upstream/gh-pages` branch, where `mike` maintains the current state of docs.hopsworks.ai, or just build documentation for the branch you are updating:
+
+   Building _one_ branch:
+
+   Check out your dev branch with modified docs:
+
+   ```bash
+   git checkout [dev-branch]
+   ```
+
+   Generate API docs if necessary:
+
+   ```bash
+   python auto_doc.py
+   ```
+
+   Build docs with a version and alias:
+
+   ```bash
+   mike deploy [version] [alias] --update-alias
+
+   # for example, if you are updating documentation to be merged to master,
+   # which will become the new SNAPSHOT version:
+   mike deploy 2.2.0-SNAPSHOT dev --update-alias
+
+   # if you are updating docs of the latest stable release branch
+   mike deploy [version] latest --update-alias
+
+   # if you are updating docs of a previous stable release branch
+   mike deploy [version]
+   ```
+
+   If no gh-pages branch existed in your local repository, this will have created it.
+
+   **Important**: If no previous docs were built, you will have to choose a version as the default to be loaded as the index, as follows:
+
+   ```bash
+   mike set-default [version-or-alias]
+   ```
+
+   You can now check out the gh-pages branch and serve:
+
+   ```bash
+   git checkout gh-pages
+   mike serve
+   ```
+
+   You can also list all available versions/aliases:
+
+   ```bash
+   mike list
+   ```
+
+   Delete and reset your local gh-pages branch:
+
+   ```bash
+   mike delete --all
+
+   # or delete a single version
+   mike delete [version-or-alias]
+   ```
+
+#### Adding new API documentation
+
+To add new documentation for APIs, you need to add information about the method/class to document to the `auto_doc.py` script:
+
+```python
+PAGES = {
+    "connection.md": [
+        "hsfs.connection.Connection.connection"
+    ],
+    "new_template.md": [
+        "module",
+        "xyz.asd"
+    ]
+}
+```
+
+Now you can add a template markdown file to the `docs/templates` directory with the name you specified in the auto-doc script. The `new_template.md` file should contain a tag to identify the place at which the API documentation should be inserted:
+
+````
+## The XYZ package
+
+{{module}}
+
+Some extra content here.
+
+!!! example
+    ```python
+    import xyz
+    ```
+
+{{xyz.asd}}
+````
+
+Finally, run the `auto_doc.py` script, as described above, to update the documentation.
+
+For information about Markdown syntax and possible Admonitions/Highlighting etc. see
+the [Material for Mkdocs themes reference documentation](https://squidfunk.github.io/mkdocs-material/reference/abbreviations/).
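+
+Pages can also pull in every public property and method of a class via the `keras_autodoc` helpers, as `auto_doc.py` already does for the existing API pages. Building on the `PAGES` example above, a sketch of such an entry (`xyz.Asd` is only a placeholder class here, so this exact entry is illustrative rather than runnable as-is):
+
+```python
+import keras_autodoc
+
+PAGES = {
+    "new_template.md": {
+        # plain list: document these objects one by one
+        "module": ["xyz"],
+        # helpers: collect all public properties/methods of a class
+        "asd_properties": keras_autodoc.get_properties("xyz.Asd"),
+        "asd_methods": keras_autodoc.get_methods(
+            "xyz.Asd", exclude=["from_response_json", "json", "to_dict"]
+        ),
+    },
+}
+```
+
+The corresponding template then uses `{{asd_properties}}` and `{{asd_methods}}` tags in the same way as `{{module}}` above.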
diff --git a/hsfs/Dockerfile b/hsfs/Dockerfile new file mode 100644 index 000000000..38d9025c5 --- /dev/null +++ b/hsfs/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:22.04 + +RUN apt-get update && \ + apt-get install -y python3-pip git && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN pip3 install twine build virtualenv \ + mkdocs==1.5.3 \ + mkdocs-material==9.5.17 \ + mike==2.0.0 \ + git+https://github.com/logicalclocks/keras-autodoc + +RUN mkdir -p /.local && chmod -R 777 /.local diff --git a/hsfs/Jenkinsfile b/hsfs/Jenkinsfile new file mode 100644 index 000000000..d2014d5cb --- /dev/null +++ b/hsfs/Jenkinsfile @@ -0,0 +1,23 @@ +pipeline { + agent { + docker { + label "local" + image "docker.hops.works/hopsworks_twine:0.0.1" + } + } + stages { + stage("publish") { + environment { + PYPI = credentials('977daeb0-e1c8-43a0-b35a-fc37bb9eee9b') + } + steps { + dir("python") { + sh "rm -f LICENSE README.md" + sh "cp -f ../LICENSE ../README.md ./" + sh "python3 -m build" + sh "twine upload -u $PYPI_USR -p $PYPI_PSW --skip-existing dist/*" + } + } + } + } +} diff --git a/hsfs/LICENSE b/hsfs/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/hsfs/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/hsfs/README.md b/hsfs/README.md new file mode 100644 index 000000000..a13ea2ce5 --- /dev/null +++ b/hsfs/README.md @@ -0,0 +1,201 @@ +# Hopsworks Feature Store + +

+<!-- badges: Hopsworks Community | Hopsworks Feature Store Documentation | python | PyPiStatus | Scala/Java Artifacts | Downloads | Ruff | License -->

+
+HSFS is the library to interact with the Hopsworks Feature Store. The library makes creating new features, feature groups and training datasets easy.
+
+The library is environment independent and can be used in two modes:
+
+- Spark mode: For data engineering jobs that create and write features into the feature store or generate training datasets. It requires a Spark environment such as the one provided in the Hopsworks platform or Databricks. In Spark mode, HSFS provides bindings both for Python and JVM languages.
+
+- Python mode: For data science jobs to explore the features available in the feature store, generate training datasets and feed them into a training pipeline. Python mode requires just a Python interpreter and can be used both in Hopsworks from Python Jobs/Jupyter Kernels, Amazon SageMaker or KubeFlow.
+
+The library automatically configures itself based on the environment it is run in.
+However, to connect from an external environment such as Databricks or AWS Sagemaker,
+additional connection information, such as host and port, is required. For more information check out the [Hopsworks documentation](https://docs.hopsworks.ai/latest/).
+
+## Getting Started On Hopsworks
+
+Get started easily by registering an account on [Hopsworks Serverless](https://app.hopsworks.ai/). Create your project and a [new Api key](https://docs.hopsworks.ai/latest/user_guides/projects/api_key/create_api_key/). In a new python environment with Python 3.8 or higher, install the [client library](https://docs.hopsworks.ai/latest/user_guides/client_installation/) using pip:
+
+```bash
+# Get all Hopsworks SDKs: Feature Store, Model Serving and Platform SDK
+pip install hopsworks
+# or minimum install with the Feature Store SDK
+pip install hsfs[python]
+# if using zsh don't forget the quotes
+pip install 'hsfs[python]'
+```
+
+You can start a notebook, instantiate a connection and get the project feature store handle.
+
+```python
+import hopsworks
+
+project = hopsworks.login()  # you will be prompted for your api key
+fs = project.get_feature_store()
+```
+
+or using `hsfs` directly:
+
+```python
+import hsfs
+
+connection = hsfs.connection(
+    host="c.app.hopsworks.ai",  #
+    project="your-project",
+    api_key_value="your-api-key",
+)
+fs = connection.get_feature_store()
+```
+
+Create a new feature group to start inserting feature values.
+```python
+fg = fs.create_feature_group(
+    "rain",
+    version=1,
+    description="Rain features",
+    primary_key=['date', 'location_id'],
+    online_enabled=True,
+)
+
+fg.save(dataframe)
+```
+
+Upsert new data into the feature group with `time_travel_format="HUDI"`.
+```python
+fg.insert(upsert_df)
+```
+
+Retrieve the commit timeline metadata of the feature group with `time_travel_format="HUDI"`.
+```python
+fg.commit_details()
+```
+
+Read the feature group as of a specific point in time.
+```python
+fg = fs.get_feature_group("rain", 1)
+fg.read("2020-10-20 07:34:11").show()
+```
+
+Read updates that occurred between specified points in time.
+```python
+fg = fs.get_feature_group("rain", 1)
+fg.read_changes("2020-10-20 07:31:38", "2020-10-20 07:34:11").show()
+```
+
+Join features together:
+```python
+feature_join = (
+    rain_fg.select_all()
+    .join(temperature_fg.select_all(), on=["date", "location_id"])
+    .join(location_fg.select_all())
+)
+feature_join.show(5)
+```
+
+Join feature groups as of a specific point in time:
+```python
+feature_join = (
+    rain_fg.select_all()
+    .join(temperature_fg.select_all(), on=["date", "location_id"])
+    .join(location_fg.select_all())
+    .as_of("2020-10-31")
+)
+feature_join.show(5)
+```
+
+Join feature groups as of different points in time:
+```python
+rain_fg_q = rain_fg.select_all().as_of("2020-10-20 07:41:43")
+temperature_fg_q = temperature_fg.select_all().as_of("2020-10-20 07:32:33")
+location_fg_q = location_fg.select_all().as_of("2020-10-20 07:33:08")
+joined_features_q = rain_fg_q.join(temperature_fg_q).join(location_fg_q)
+```
+
+Use the query object to create a training dataset:
+```python
+td = fs.create_training_dataset(
+    "rain_dataset",
+    version=1,
+    data_format="tfrecords",
+    description="A test training dataset saved in TfRecords format",
+    splits={'train': 0.7, 'test': 0.2, 'validate': 0.1},
+)
+
+td.save(feature_join)
+```
+
+A short introduction to the Scala API:
+```scala
+import com.logicalclocks.hsfs._
+val connection = HopsworksConnection.builder().build()
+val fs = connection.getFeatureStore();
+val attendances_features_fg = fs.getFeatureGroup("games_features", 1);
+attendances_features_fg.show(1)
+```
+
+You can find more examples on how to use the library in our [hops-examples](https://github.com/logicalclocks/hops-examples) repository.
+
+## Usage
+
+Usage data is collected to improve the quality of the library. It is turned on by default if the backend
+is "c.app.hopsworks.ai". To turn it off, use one of the following ways:
+```python
+# use environment variable
+import os
+os.environ["ENABLE_HOPSWORKS_USAGE"] = "false"
+
+# use `disable_usage_logging`
+import hsfs
+hsfs.disable_usage_logging()
+```
+
+The source code can be found in `python/hsfs/usage.py`.
+
+## Documentation
+
+Documentation is available at [Hopsworks Feature Store Documentation](https://docs.hopsworks.ai/).
+
+## Issues
+
+For general questions about the usage of Hopsworks and the Feature Store, please open a topic on [Hopsworks Community](https://community.hopsworks.ai/).
+
+Please report any issue using [GitHub issue tracking](https://github.com/logicalclocks/feature-store-api/issues).
+
+Please attach the client environment from the output below to the issue:
+```python
+import hopsworks
+import hsfs
+hopsworks.login().get_feature_store()
+print(hsfs.get_env())
+```
+
+## Contributing
+
+If you would like to contribute to this library, please see the [Contribution Guidelines](CONTRIBUTING.md).
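+
+## Putting It Together
+
+The joined features from the examples above can also be read directly into a DataFrame instead of being materialized as a training dataset. A minimal sketch, assuming the `rain` and `temperature` feature groups created earlier exist in your project (in Python mode, `Query.read()` returns a Pandas DataFrame):
+
+```python
+import hsfs
+
+connection = hsfs.connection(
+    host="c.app.hopsworks.ai",
+    project="your-project",
+    api_key_value="your-api-key",
+)
+fs = connection.get_feature_store()
+
+rain_fg = fs.get_feature_group("rain", 1)
+temperature_fg = fs.get_feature_group("temperature", 1)
+
+# build the join lazily, then materialize it
+query = rain_fg.select_all().join(
+    temperature_fg.select_all(), on=["date", "location_id"]
+)
+df = query.read()  # Pandas DataFrame in Python mode, Spark DataFrame in Spark mode
+print(df.head())
+```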
diff --git a/hsfs/auto_doc.py b/hsfs/auto_doc.py new file mode 100644 index 000000000..a98af258b --- /dev/null +++ b/hsfs/auto_doc.py @@ -0,0 +1,384 @@ +import os +import pathlib +import shutil + +import keras_autodoc + +PAGES = { + "api/connection_api.md": { + "connection": ["hsfs.connection.Connection"], + "connection_properties": keras_autodoc.get_properties( + "hsfs.connection.Connection" + ), + "connection_methods": keras_autodoc.get_methods("hsfs.connection.Connection"), + }, + "api/spine_group_api.md": { + "fg": ["hsfs.feature_group.SpineGroup"], + "fg_create": ["hsfs.feature_store.FeatureStore.get_or_create_spine_group"], + "fg_get": ["hsfs.feature_store.FeatureStore.get_or_create_spine_group"], + "fg_properties": keras_autodoc.get_properties( + "hsfs.feature_group.SpineGroup", + exclude=[ + "expectation_suite", + "location", + "online_enabled", + "statistics", + "statistics_config", + "subject", + ], + ), + "fg_methods": keras_autodoc.get_methods( + "hsfs.feature_group.SpineGroup", + exclude=[ + "append_features", + "compute_statistics", + "delete_expectation_suite", + "from_response_json", + "get_all_validation_reports", + "get_expectation_suite", + "get_latest_validation_report", + "get_statistics", + "get_validation_history", + "save_expectation_suite", + "save_validation_report", + "update_from_response_json", + "update_statistics_config", + "validate", + ], + ), + }, + "api/training_dataset_api.md": { + "td": ["hsfs.training_dataset.TrainingDataset"], + "td_create": ["hsfs.feature_store.FeatureStore.create_training_dataset"], + "td_get": ["hsfs.feature_store.FeatureStore.get_training_dataset"], + "td_properties": keras_autodoc.get_properties( + "hsfs.training_dataset.TrainingDataset" + ), + "td_methods": keras_autodoc.get_methods( + "hsfs.training_dataset.TrainingDataset" + ), + }, + "api/feature_view_api.md": { + "fv": ["hsfs.feature_view.FeatureView"], + "fv_create": ["hsfs.feature_store.FeatureStore.create_feature_view"], + "fv_get": ["hsfs.feature_store.FeatureStore.get_feature_view"], + "fvs_get": ["hsfs.feature_store.FeatureStore.get_feature_views"], + "fv_properties": keras_autodoc.get_properties("hsfs.feature_view.FeatureView"), + "fv_methods": keras_autodoc.get_methods("hsfs.feature_view.FeatureView"), + }, + "api/feature_api.md": { + "feature": ["hsfs.feature.Feature"], + "feature_properties": keras_autodoc.get_properties("hsfs.feature.Feature"), + "feature_methods": keras_autodoc.get_methods("hsfs.feature.Feature"), + }, + "api/expectation_suite_api.md": { + "expectation_suite": ["hsfs.expectation_suite.ExpectationSuite"], + "expectation_suite_attach": [ + "hsfs.feature_group.FeatureGroup.save_expectation_suite" + ], + "single_expectation_api": [ + "hsfs.expectation_suite.ExpectationSuite.add_expectation", + "hsfs.expectation_suite.ExpectationSuite.replace_expectation", + "hsfs.expectation_suite.ExpectationSuite.remove_expectation", + ], + "expectation_suite_properties": keras_autodoc.get_properties( + "hsfs.expectation_suite.ExpectationSuite" + ), + "expectation_suite_methods": keras_autodoc.get_methods( + "hsfs.expectation_suite.ExpectationSuite" + ), + }, + "api/feature_store_api.md": { + "fs": ["hsfs.feature_store.FeatureStore"], + "fs_get": ["hsfs.connection.Connection.get_feature_store"], + "fs_properties": keras_autodoc.get_properties( + "hsfs.feature_store.FeatureStore" + ), + "fs_methods": keras_autodoc.get_methods("hsfs.feature_store.FeatureStore"), + }, + "api/feature_group_api.md": { + "fg": ["hsfs.feature_group.FeatureGroup"], + "fg_create": [ + 
"hsfs.feature_store.FeatureStore.create_feature_group", + "hsfs.feature_store.FeatureStore.get_or_create_feature_group", + ], + "fg_get": ["hsfs.feature_store.FeatureStore.get_feature_group"], + "fg_properties": keras_autodoc.get_properties( + "hsfs.feature_group.FeatureGroup" + ), + "fg_methods": keras_autodoc.get_methods("hsfs.feature_group.FeatureGroup"), + }, + "api/external_feature_group_api.md": { + "fg": ["hsfs.feature_group.ExternalFeatureGroup"], + "fg_create": ["hsfs.feature_store.FeatureStore.create_external_feature_group"], + "fg_get": ["hsfs.feature_store.FeatureStore.get_external_feature_group"], + "fg_properties": keras_autodoc.get_properties( + "hsfs.feature_group.ExternalFeatureGroup" + ), + "fg_methods": keras_autodoc.get_methods( + "hsfs.feature_group.ExternalFeatureGroup" + ), + }, + "api/storage_connector_api.md": { + "sc_get": [ + "hsfs.feature_store.FeatureStore.get_storage_connector", + "hsfs.feature_store.FeatureStore.get_online_storage_connector", + ], + "hopsfs_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.HopsFSConnector", exclude=["from_response_json"] + ), + "hopsfs_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.HopsFSConnector" + ), + "s3_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.S3Connector", exclude=["from_response_json"] + ), + "s3_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.S3Connector" + ), + "redshift_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.RedshiftConnector", exclude=["from_response_json"] + ), + "redshift_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.RedshiftConnector" + ), + "adls_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.AdlsConnector", exclude=["from_response_json"] + ), + "adls_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.AdlsConnector" + ), + "snowflake_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.SnowflakeConnector", exclude=["from_response_json"] + ), + "snowflake_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.SnowflakeConnector" + ), + "jdbc_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.JdbcConnector", exclude=["from_response_json"] + ), + "jdbc_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.JdbcConnector" + ), + "gcs_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.GcsConnector", exclude=["from_response_json"] + ), + "gcs_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.GcsConnector" + ), + "bigquery_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.BigQueryConnector", exclude=["from_response_json"] + ), + "bigquery_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.BigQueryConnector" + ), + "kafka_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.KafkaConnector", exclude=["from_response_json"] + ), + "kafka_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.KafkaConnector" + ), + }, + "api/statistics_config_api.md": { + "statistics_config": ["hsfs.statistics_config.StatisticsConfig"], + "statistics_config_properties": keras_autodoc.get_properties( + "hsfs.statistics_config.StatisticsConfig" + ), + }, + "api/transformation_functions_api.md": { + "transformation_function": [ + "hsfs.transformation_function.TransformationFunction" + ], + "transformation_function_properties": keras_autodoc.get_properties( + "hsfs.transformation_function.TransformationFunction" + ), + "transformation_function_methods": 
keras_autodoc.get_methods( + "hsfs.transformation_function.TransformationFunction", + exclude=[ + "from_response_json", + "update_from_response_json", + "json", + "to_dict", + ], + ), + "create_transformation_function": [ + "hsfs.feature_store.FeatureStore.create_transformation_function" + ], + "get_transformation_function": [ + "hsfs.feature_store.FeatureStore.get_transformation_function" + ], + "get_transformation_functions": [ + "hsfs.feature_store.FeatureStore.get_transformation_functions" + ], + }, + "api/validation_report_api.md": { + "validation_report": ["hsfs.validation_report.ValidationReport"], + "validation_report_validate": [ + "hsfs.feature_group.FeatureGroup.validate", + "hsfs.feature_group.FeatureGroup.insert", + ], + "validation_report_get": [ + "hsfs.feature_group.FeatureGroup.get_latest_validation_report", + "hsfs.feature_group.FeatureGroup.get_all_validation_reports", + ], + "validation_report_properties": keras_autodoc.get_properties( + "hsfs.validation_report.ValidationReport" + ), + "validation_report_methods": keras_autodoc.get_methods( + "hsfs.validation_report.ValidationReport" + ), + }, + "api/job.md": { + "job_configuration": ["hsfs.core.job_configuration.JobConfiguration"], + "job": ["hsfs.core.job.Job"], + "job_methods": [ + "hsfs.core.job.Job.get_state", + "hsfs.core.job.Job.get_final_state", + ], + }, + "api/query_api.md": { + "query_methods": keras_autodoc.get_methods( + "hsfs.constructor.query.Query", + exclude=["json", "to_dict"], + ), + "query_properties": keras_autodoc.get_properties( + "hsfs.constructor.query.Query" + ), + }, + "api/links.md": { + "links_properties": keras_autodoc.get_properties( + "hsfs.core.explicit_provenance.Links" + ), + "artifact_properties": keras_autodoc.get_properties( + "hsfs.core.explicit_provenance.Artifact" + ), + }, + "api/statistics_api.md": { + "statistics": ["hsfs.statistics.Statistics"], + "statistics_properties": keras_autodoc.get_properties( + "hsfs.statistics.Statistics" + ), + }, + "api/split_statistics_api.md": { + "split_statistics": ["hsfs.split_statistics.SplitStatistics"], + "split_statistics_properties": keras_autodoc.get_properties( + "hsfs.split_statistics.SplitStatistics" + ), + }, + "api/feature_descriptive_statistics_api.md": { + "feature_descriptive_statistics": [ + "hsfs.core.feature_descriptive_statistics.FeatureDescriptiveStatistics" + ], + "feature_descriptive_statistics_properties": keras_autodoc.get_properties( + "hsfs.core.feature_descriptive_statistics.FeatureDescriptiveStatistics" + ), + }, + "api/feature_monitoring_config_api.md": { + "feature_monitoring_config": [ + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig" + ], + "feature_monitoring_config_properties": keras_autodoc.get_properties( + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig" + ), + "feature_monitoring_config_methods": keras_autodoc.get_methods( + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig", + exclude=[ + "from_response_json", + "update_from_response_json", + "json", + "to_dict", + ], + ), + # from feature group + "feature_monitoring_config_creation_fg": [ + "hsfs.feature_group.FeatureGroup.create_statistics_monitoring", + "hsfs.feature_group.FeatureGroup.create_feature_monitoring", + ], + # from feature view + "feature_monitoring_config_creation_fv": [ + "hsfs.feature_view.FeatureView.create_statistics_monitoring", + "hsfs.feature_view.FeatureView.create_feature_monitoring", + ], + # retrieval + "feature_monitoring_config_retrieval_fg": [ + 
"hsfs.feature_group.FeatureGroup.get_feature_monitoring_configs", + ], + "feature_monitoring_config_retrieval_fv": [ + "hsfs.feature_view.FeatureView.get_feature_monitoring_configs", + ], + }, + "api/feature_monitoring_result_api.md": { + "feature_monitoring_result": [ + "hsfs.core.feature_monitoring_result.FeatureMonitoringResult" + ], + "feature_monitoring_result_retrieval": [ + "hsfs.core.feature_monitoring_config.FeatureMonitoringConfig.get_history" + ], + "feature_monitoring_result_properties": keras_autodoc.get_properties( + "hsfs.core.feature_monitoring_result.FeatureMonitoringResult" + ), + }, + "api/feature_monitoring_window_config_api.md": { + "feature_monitoring_window_config": [ + "hsfs.core.monitoring_window_config.MonitoringWindowConfig" + ], + "feature_monitoring_window_config_properties": keras_autodoc.get_properties( + "hsfs.core.monitoring_window_config.MonitoringWindowConfig" + ), + }, + "api/embedding_index_api.md": { + "embedding_index": ["hsfs.embedding.EmbeddingIndex"], + "embedding_index_properties": keras_autodoc.get_properties( + "hsfs.embedding.EmbeddingIndex" + ), + "embedding_index_methods": keras_autodoc.get_methods( + "hsfs.embedding.EmbeddingIndex", exclude=["from_response_json"] + ), + }, + "api/embedding_feature_api.md": { + "embedding_feature": ["hsfs.embedding.EmbeddingFeature"], + "embedding_feature_properties": keras_autodoc.get_properties( + "hsfs.embedding.EmbeddingFeature" + ), + }, + "api/similarity_function_type_api.md": { + "similarity_function_type": ["hsfs.embedding.SimilarityFunctionType"], + }, +} + +hsfs_dir = pathlib.Path(__file__).resolve().parents[0] +if "GITHUB_SHA" in os.environ: + commit_sha = os.environ["GITHUB_SHA"] + project_url = ( + f"https://github.com/logicalclocks/feature-store-api/tree/{commit_sha}/python" + ) +else: + branch_name = os.environ.get("GITHUB_BASE_REF", "master") + project_url = ( + f"https://github.com/logicalclocks/feature-store-api/blob/{branch_name}/python" + ) + + +def generate(dest_dir): + doc_generator = keras_autodoc.DocumentationGenerator( + PAGES, + project_url=project_url, + template_dir="./docs/templates", + titles_size="###", + extra_aliases={ + "hsfs.core.query.Query": "hsfs.Query", + "hsfs.storage_connector.StorageConnector": "hsfs.StorageConnector", + "hsfs.statistics_config.StatisticsConfig": "hsfs.StatisticsConfig", + "hsfs.training_dataset_feature.TrainingDatasetFeature": "hsfs.TrainingDatasetFeature", + "pandas.core.frame.DataFrame": "pandas.DataFrame", + }, + max_signature_line_length=100, + ) + shutil.copyfile(hsfs_dir / "CONTRIBUTING.md", dest_dir / "CONTRIBUTING.md") + shutil.copyfile(hsfs_dir / "README.md", dest_dir / "index.md") + + doc_generator.generate(dest_dir / "generated") + + +if __name__ == "__main__": + generate(hsfs_dir / "docs") diff --git a/hsfs/docs/CONTRIBUTING.md b/hsfs/docs/CONTRIBUTING.md new file mode 100644 index 000000000..0df3de08e --- /dev/null +++ b/hsfs/docs/CONTRIBUTING.md @@ -0,0 +1,220 @@ +## Python development setup + +--- + +- Fork and clone the repository + +- Create a new Python environment with your favourite environment manager (e.g. virtualenv or conda) and Python 3.9 (newer versions will return a library conflict in `auto_doc.py`) + +- Install repository in editable mode with development dependencies: + + ```bash + cd python + pip install -e ".[python,dev]" + ``` + +- Install [pre-commit](https://pre-commit.com/) and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. 
The Feature Store uses pre-commit to ensure code-style and code formatting through [ruff](https://docs.astral.sh/ruff/). Run the following commands from the `python` directory: + + ```bash + cd python + pip install --user pre-commit + pre-commit install + ``` + + Afterwards, pre-commit will run whenever you commit. + +- To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use `ruff`, or run it via the command line: + + ```bash + # linting + ruff check python --fix + # formatting + ruff format python + ``` + +### Python documentation + +We follow a few best practices for writing the Python documentation: + +1. Use the google docstring style: + + ```python + """[One Line Summary] + + [Extended Summary] + + [!!! example + import xyz + ] + + # Arguments + arg1: Type[, optional]. Description[, defaults to `default`] + arg2: Type[, optional]. Description[, defaults to `default`] + + # Returns + Type. Description. + + # Raises + Exception. Description. + """ + ``` + + If Python 3 type annotations are used, they are inserted automatically. + +2. Feature store entity engine methods (e.g. FeatureGroupEngine etc.) only require a single line docstring. +3. REST Api implementations (e.g. FeatureGroupApi etc.) should be fully documented with docstrings without defaults. +4. Public Api such as metadata objects should be fully documented with defaults. + +#### Setup and Build Documentation + +We use `mkdocs` together with `mike` ([for versioning](https://github.com/jimporter/mike/)) to build the documentation and a plugin called `keras-autodoc` to auto generate Python API documentation from docstrings. + +**Background about `mike`:** +`mike` builds the documentation and commits it as a new directory to the gh-pages branch. Each directory corresponds to one version of the documentation. Additionally, `mike` maintains a json in the root of gh-pages with the mappings of versions/aliases for each of the directories available. With aliases you can define extra names like `dev` or `latest`, to indicate stable and unstable releases. + +1. Currently we are using our own version of `keras-autodoc` + + ```bash + pip install git+https://github.com/logicalclocks/keras-autodoc + ``` + +2. Install HSFS with `docs` extras: + + ```bash + pip install -e ".[python,dev]" && pip install -r ../requirements-docs.txt + ``` + +3. To build the docs, first run the auto doc script: + + ```bash + cd .. + python auto_doc.py + ``` + +##### Option 1: Build only current version of docs + +4. Either build the docs, or serve them dynamically: + + Note: Links and pictures might not resolve properly later on when checking with this build. + The reason for that is that the docs are deployed with versioning on docs.hopsworks.ai and + therefore another level is added to all paths, e.g. `docs.hopsworks.ai/[version-or-alias]`. + Using relative links should not be affected by this, however, building the docs with version + (Option 2) is recommended. + + ```bash + mkdocs build + # or + mkdocs serve + ``` + +##### Option 2 (Preferred): Build multi-version doc with `mike` + +###### Versioning on docs.hopsworks.ai + +On docs.hopsworks.ai we implement the following versioning scheme: + +- current master branches (e.g. of hsfs corresponding to master of Hopsworks): rendered as current Hopsworks snapshot version, e.g. **2.2.0-SNAPSHOT [dev]**, where `dev` is an alias to indicate that this is an unstable version. +- the latest release: rendered with full current version, e.g. 
**2.1.5 [latest]** with `latest` alias to indicate that this is the latest stable release.
+- previous stable releases: rendered without alias, e.g. **2.1.4**.
+
+###### Build Instructions
+
+4. For this you can either check out and make a local copy of the `upstream/gh-pages` branch, where `mike` maintains the current state of docs.hopsworks.ai, or just build documentation for the branch you are updating:
+
+   Building _one_ branch:
+
+   Check out your dev branch with modified docs:
+
+   ```bash
+   git checkout [dev-branch]
+   ```
+
+   Generate API docs if necessary:
+
+   ```bash
+   python auto_doc.py
+   ```
+
+   Build the docs with a version and alias:
+
+   ```bash
+   mike deploy [version] [alias] --update-alias
+
+   # for example, if you are updating documentation to be merged to master,
+   # which will become the new SNAPSHOT version:
+   mike deploy 2.2.0-SNAPSHOT dev --update-alias
+
+   # if you are updating docs of the latest stable release branch
+   mike deploy [version] latest --update-alias
+
+   # if you are updating docs of a previous stable release branch
+   mike deploy [version]
+   ```
+
+   If no gh-pages branch existed in your local repository, this will have created it.
+
+   **Important**: If no previous docs were built, you will have to choose a version to be loaded as the default index, as follows:
+
+   ```bash
+   mike set-default [version-or-alias]
+   ```
+
+   You can now check out the gh-pages branch and serve:
+
+   ```bash
+   git checkout gh-pages
+   mike serve
+   ```
+
+   You can also list all available versions/aliases:
+
+   ```bash
+   mike list
+   ```
+
+   Delete and reset your local gh-pages branch:
+
+   ```bash
+   mike delete --all
+
+   # or delete single version
+   mike delete [version-or-alias]
+   ```
+
+#### Adding new API documentation
+
+To add new documentation for APIs, you need to add the method/class to be documented to the `PAGES` dictionary in the `auto_doc.py` script:
+
+```python
+PAGES = {
+    "connection.md": [
+        "hsfs.connection.Connection.connection"
+    ],
+    "new_template.md": [
+        "module",
+        "xyz.asd"
+    ]
+}
+```
+
+Now you can add a template markdown file to the `docs/templates` directory with the name you specified in the auto-doc script. The `new_template.md` file should contain a tag to identify the place at which the API documentation should be inserted:
+
+````
+## The XYZ package
+
+{{module}}
+
+Some extra content here.
+
+!!! example
+    ```python
+    import xyz
+    ```
+
+{{xyz.asd}}
+````
+
+Finally, run the `auto_doc.py` script, as described above, to update the documentation.
+
+For information about Markdown syntax and possible admonitions, highlighting etc., see
+the [Material for MkDocs reference documentation](https://squidfunk.github.io/mkdocs-material/reference/abbreviations/).
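+
+As a final note on the `PAGES` mapping above: pages that should also render properties and methods can use the `keras_autodoc` helpers, as the existing entries in `auto_doc.py` already do. A minimal sketch, assuming a hypothetical `hsfs.xyz.Xyz` class:
+
+```python
+import keras_autodoc
+
+PAGES = {
+    "api/new_template.md": {
+        # document the class itself
+        "xyz": ["hsfs.xyz.Xyz"],
+        # collect all public properties of the class
+        "xyz_properties": keras_autodoc.get_properties("hsfs.xyz.Xyz"),
+        # collect all public methods, excluding internal helpers
+        "xyz_methods": keras_autodoc.get_methods("hsfs.xyz.Xyz", exclude=["from_response_json"]),
+    },
+}
+```
+
+The corresponding `{{xyz_properties}}` and `{{xyz_methods}}` tags can then be placed in the template file as described above.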
diff --git a/hsfs/docs/assets/images/favicon.ico b/hsfs/docs/assets/images/favicon.ico new file mode 100644 index 000000000..ab7573067 Binary files /dev/null and b/hsfs/docs/assets/images/favicon.ico differ diff --git a/hsfs/docs/assets/images/hops-logo.png b/hsfs/docs/assets/images/hops-logo.png new file mode 100644 index 000000000..d3625ae07 Binary files /dev/null and b/hsfs/docs/assets/images/hops-logo.png differ diff --git a/hsfs/docs/assets/images/hopsworks-logo.png b/hsfs/docs/assets/images/hopsworks-logo.png new file mode 100644 index 000000000..36f20bb12 Binary files /dev/null and b/hsfs/docs/assets/images/hopsworks-logo.png differ diff --git a/hsfs/docs/css/custom.css b/hsfs/docs/css/custom.css new file mode 100644 index 000000000..45f87459a --- /dev/null +++ b/hsfs/docs/css/custom.css @@ -0,0 +1,114 @@ +[data-md-color-scheme="hopsworks"] { + --md-primary-fg-color: #1EB382; + --md-secondary-fg-color: #188a64; + --md-tertiary-fg-color: #0d493550; + --md-quaternary-fg-color: #fdfdfd; + --border-radius-variable: 5px; +} + +.md-footer__inner:not([hidden]) { + display: none +} + +/* Lex did stuff here */ +.svg_topnav{ + width: 12px; + filter: invert(100); +} +.svg_topnav:hover{ + width: 12px; + filter: invert(10); +} + +.md-header[data-md-state=shadow] { + box-shadow: 0 0 0 0; +} + +.md-tabs__item { + min-width: 2.25rem; + min-height: 1.5rem; +} + +.md-tabs__item:hover { + background-color: var(--md-tertiary-fg-color); + transition: background-color 450ms; +} + +/* +.md-sidebar__scrollwrap{ + background-color: var(--md-quaternary-fg-color); + padding: 15px 5px 5px 5px; + border-radius: var(--border-radius-variable); +} +*/ +.md-nav__link:focus{ +} + +.image_logo_02{ + width:450px; +} + +/* End of Lex did stuff here */ + +.md-header__button.md-logo { + margin: .1rem; + padding: .1rem; +} + +.md-header__button.md-logo img, .md-header__button.md-logo svg { + display: block; + width: 1.8rem; + height: 1.8rem; + fill: currentColor; +} + +.md-tabs { + width: 100%; + overflow: auto; + color: var(--md-primary-bg-color); + background-color: var(--md-secondary-fg-color); + transition: background-color 250ms; +} + +.wrapper { + display: grid; + grid-template-columns: repeat(4, 1fr); + gap: 10px; + grid-auto-rows: minmax(100px, auto); +} + +.wrapper * { + border: 2px solid green; + text-align: center; + padding: 70px 0; +} + +.one { + grid-column: 1 / 2; + grid-row: 1; +} +.two { + grid-column: 2 / 3; + grid-row: 1; +} +.three { + grid-column: 3 / 4; + grid-row: 1; +} +.four { + grid-column: 4 / 5; + grid-row: 1; +} +.five { + grid-column: 1 / 3; + grid-row: 2; +} +.six { + grid-column: 3 / 5; + grid-row: 2; +} + +/* Jupyter Stuff */ +.jupyter-wrapper .jp-CodeCell .jp-Cell-inputWrapper .jp-InputPrompt { + display: none !important; +} diff --git a/hsfs/docs/css/dropdown.css b/hsfs/docs/css/dropdown.css new file mode 100644 index 000000000..531f7b10d --- /dev/null +++ b/hsfs/docs/css/dropdown.css @@ -0,0 +1,55 @@ +/* Style The Dropdown Button */ +.dropbtn { + color: white; + border: none; + cursor: pointer; +} + +.md-tabs__list { + contain: inherit; +} + +.md-tabs { + overflow: inherit; +} + + +/* The container
- needed to position the dropdown content */ +.dropdown { + position: absolute; + display: inline-block; +} + +/* Dropdown Content (Hidden by Default) */ +.dropdown-content { + display: none; + font-size: 13px; + position: absolute; + background-color: #f9f9f9; + min-width: 160px; + box-shadow: 0px 8px 16px 0px rgba(0, 0, 0, 0.2); + z-index: 1000; + border-radius: 2px; + left: -15px; +} + +/* Links inside the dropdown */ +.dropdown-content a { + color: black; + padding: 12px 16px; + text-decoration: none; + display: block; +} + +/* Change color of dropdown links on hover */ +.dropdown-content a:hover { + background-color: #f1f1f1 +} + +/* Show the dropdown menu on hover */ +.dropdown:hover .dropdown-content { + display: block; +} + +/* Change the background color of the dropdown button when the dropdown content is shown */ +.dropdown:hover .dropbtn {} \ No newline at end of file diff --git a/hsfs/docs/css/marctech.css b/hsfs/docs/css/marctech.css new file mode 100644 index 000000000..8bb58c97b --- /dev/null +++ b/hsfs/docs/css/marctech.css @@ -0,0 +1,1047 @@ +:root { + --md-primary-fg-color: #1EB382; + --md-secondary-fg-color: #188a64; + --md-tertiary-fg-color: #0d493550; + --md-quaternary-fg-color: #fdfdfd; + --md-fiftuary-fg-color: #2471cf; + --border-radius-variable: 5px; + --border-width:1px; + } + + .marctech_main a{ + color: var(--md-fiftuary-fg-color); + border-bottom: 1px dotted var(--md-fiftuary-fg-color) !important; + text-decoration: dotted !important;} + + .marctech_main a:hover{ + border-bottom: 1px dotted var(--md-primary-fg-color)!important; + } + + .marctech_main a:visited{ + color: var(--md-tertiary-fg-color); + border-bottom: 1px dotted var(--md-tertiary-fg-color) !important; + + } + + .w-layout-grid { + display: -ms-grid; + display: grid; + grid-auto-columns: 1fr; + -ms-grid-columns: 1fr 1fr; + grid-template-columns: 1fr 1fr; + -ms-grid-rows: auto auto; + grid-template-rows: auto auto; + grid-row-gap: 16px; + grid-column-gap: 16px; + } + + .image_logo{ + width: 69%; + background-color: white; + z-index: 50; + padding: 0px 15px 0px 15px; + margin-bottom: 10px; + } + + .layer_02{ + pointer-events: none; + } + + .round-frame{ + pointer-events: initial; + } + + .marctech_main { + margin-top:-20px; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + margin-bottom: 55px; + } + + .collumns { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + height: 100%; + -webkit-box-align: stretch; + -webkit-align-items: stretch; + -ms-flex-align: stretch; + align-items: stretch; + } + + .col_heading { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + } + + .enterprisefs { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + } + + .enterprise_ai { + -webkit-align-self: center; + -ms-flex-item-align: center; 
+ -ms-grid-row-align: center; + align-self: center; + -webkit-box-flex: 1; + -webkit-flex: 1; + -ms-flex: 1; + flex: 1; + } + + .side-content { + z-index: 0; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + width: 240px; + height: 100%; + margin-top: 10px; + margin-bottom: 10px; + padding: 20px 10px; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + -webkit-align-content: flex-start; + -ms-flex-line-pack: start; + align-content: flex-start; + border-style: solid; + border-width: var(--border-width); + border-color: #585858; + border-radius: 10px; + background-color:var(--md-quaternary-fg-color); + } + .body { + padding: 40px; + font-family: Roboto, sans-serif; + } + + .green { + color: #1eb182; + font-size: 1.2vw; + } + + .rec_frame { + position: relative; + z-index: 1; + display: inline-block; + min-width: 150px; + margin-top: 10px; + margin-right: 10px; + margin-left: 10px; + padding: 10px 10px; + border-style: solid; + border-width: var(--border-width); + border-color: #585858; + border-radius: 10px; + background-color: #fff; + box-shadow: 4px 4px 0 0 rgba(88, 88, 88, 0.16); + -webkit-transition: box-shadow 200ms ease, border-color 200ms ease; + transition: box-shadow 200ms ease, border-color 200ms ease; + color: #585858; + text-align: center; + cursor: pointer; + } + + .rec_frame:hover { + border-color: #c2c2c2; + box-shadow: none; + } + + .name_item { + font-size: 0.7rem; + line-height: 120%; + font-weight: 700; + } + + .name_item.db { + position: relative; + z-index: 3; + text-align: left; + } + + .name_item.small { + font-size: 0.6rem; + font-weight: 500; + } + + .name_item.ingrey { + padding-bottom: 20px; + } + + .db_frame-mid { + position: relative; + z-index: 1; + margin-top: -8px; + padding: 5px 2px; + border-style: solid; + border-width: var(--border-width); + border-color: #585858; + border-radius: 0px 0% 50% 50%; + background-color: #fff; + color: #585858; + text-align: center; + } + + .db_frame-top { + position: relative; + z-index: 2; + padding: 5px 2px; + border-style: solid; + border-width: var(--border-width); + border-color: #585858; + border-radius: 50%; + background-color: #fff; + color: #585858; + text-align: center; + } + + .icondb { + position: relative; + width: 25px; + min-width: 25px; + margin-right: 10px; + } + + .db_frame { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + width: 150px; + height: 55px; + padding: 20px 10px; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + border-style: solid; + border-width: var(--border-width); + border-color: #585858; + border-radius: 10px; + background-color: #fff; + box-shadow: 4px 4px 0 0 rgba(88, 88, 88, 0.16); + -webkit-transition: box-shadow 200ms ease, border-color 200ms ease; + transition: box-shadow 200ms ease, border-color 200ms ease; + color: #585858; + text-align: center; + cursor: pointer; + } + + .db_frame:hover { + border-color: #c2c2c2; + box-shadow: none; + } + + .grid { + -ms-grid-rows: auto auto auto; + grid-template-rows: auto auto auto; + } + + .arrowdown { + position: relative; + z-index: 0; + display: -webkit-box; + display: 
-webkit-flex; + display: -ms-flexbox; + display: flex; + margin-top: -10px; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + } + + .heading_MT { + margin-top: 0px !important; + margin-bottom: 0px !important; + font-size: 1.3rem !important; + white-space: nowrap !important; + } + + .head_col { + padding-left: 10px; + } + + .MT_heading3 { + margin-top: 0px !important ; + font-size: 0.8rem !important; + } + + .MT_heading3.green { + color: #1eb182 !important; + } + + .column_sides { + position: relative; + z-index: 2; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: justify; + -webkit-justify-content: space-between; + -ms-flex-pack: justify; + justify-content: space-between; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + } + + .hopsicon { + width: 45px; + height: 45px; + } + + .column_center { + z-index: 10; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + } + + .center-content { + z-index: -50; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + width: 750px; + height: 670px; + margin-top: 10px; + margin-bottom: 10px; + padding: 20px 10px; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + -webkit-align-content: center; + -ms-flex-line-pack: center; + align-content: center; + border-radius: 10px; + background-color: transparent; + } + + .image { + width: 260px; + } + + .layer_01 { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: stretch; + -webkit-align-items: stretch; + -ms-flex-align: stretch; + align-items: stretch; + } + + .name_center { + font-size: 1rem; + font-weight: 700; + } + + .rec_frame_main { + position: relative; + z-index: 1; + margin-top: 10px; + margin-right: 10px; + margin-left: 10px; + padding: 5px 10px; + border-style: solid; + border-width: var(--border-width); + border-color: #1eb182; + border-radius: 10px; + background-color: #e6fdf6; + box-shadow: 4px 4px 0 0 #dcf7ee; + -webkit-transition: box-shadow 200ms ease, border-color 200ms ease; + transition: box-shadow 200ms ease, border-color 200ms ease; + color: #1eb182; + text-align: center; + cursor: pointer; + } + + .rec_frame_main:hover { + border-color: #9fecd4; + box-shadow: 
none; + } + + .rec_frame_main.no_content { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + height: 100%; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + box-shadow: 4px 4px 0 0 #dcf7ee; + } + + .rec_frame_main.no_content:hover { + border-color: #1eb182; + box-shadow: 4px 4px 0 0 rgba(88, 88, 88, 0.16); + } + + .name_item_02 { + font-size: 0.85rem; + font-weight: 700; + } + + .grid-infra { + padding-top: 20px; + -ms-grid-columns: 1fr 1fr 1fr 1fr; + grid-template-columns: 1fr 1fr 1fr 1fr; + -ms-grid-rows: auto; + grid-template-rows: auto; + } + + .rec_frame_main-white { + position: relative; + z-index: 1; + display: inline-block; + width: 100%; + margin-top: 10px; + margin-bottom: 10px; + padding: 5px 10px; + border-style: solid; + border-width: var(--border-width); + border-color: #1eb182; + border-radius: 10px; + background-color: #fff; + box-shadow: 4px 4px 0 0 rgba(88, 88, 88, 0.16); + -webkit-transition: box-shadow 200ms ease, border-color 200ms ease; + transition: box-shadow 200ms ease, border-color 200ms ease; + color: #1eb182; + text-align: center; + cursor: pointer; + } + + .rec_frame_main-white:hover { + border-color: #c2c2c2; + box-shadow: none; + } + + .rec_frame_main-white.dotted { + border-style: dotted; + } + + .column { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: justify; + -webkit-justify-content: space-between; + -ms-flex-pack: justify; + justify-content: space-between; + -webkit-box-align: stretch; + -webkit-align-items: stretch; + -ms-flex-align: stretch; + align-items: stretch; + } + + .columns_center { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-orient: horizontal; + -webkit-box-direction: normal; + -webkit-flex-direction: row; + -ms-flex-direction: row; + flex-direction: row; + -webkit-box-pack: justify; + -webkit-justify-content: space-between; + -ms-flex-pack: justify; + justify-content: space-between; + } + + .non-bold { + font-weight: 400; + } + + .logo-holder { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + } + + .infra { + text-align: center; + position: relative; + z-index: 30; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + padding: 10px; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + border: 1px dashed #000; + border-radius: 6px; + background-color: #fff; + cursor: pointer; + } + + .infra:hover { + border-style: solid; + border-color: #585858; + } + + .text_and_icon { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + -webkit-box-pack: center; + 
-webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + } + + .svg_icon { + width: 33px; + margin-right: 10px; + margin-left: 10px; + } + + .layer_02 { + position: absolute; + z-index: 10; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + width: 96%; + height: 90%; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: stretch; + -webkit-align-items: stretch; + -ms-flex-align: stretch; + align-items: stretch; + border-style: solid; + border-width: calc (var(--border-width)*2); + border-color: #bbbbbb50 ; + border-radius: 100%; + background-color: transparent; + } + + .round-frame { + position: absolute; + left: 0%; + top: auto; + right: auto; + bottom: 0%; + z-index: 10; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + width: 120px; + height: 120px; + margin: 10px; + padding: 20px; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + -webkit-box-align: center; + -webkit-align-items: center; + -ms-flex-align: center; + align-items: center; + border-style: solid; + border-width: var(--border-width); + border-color: #585858; + border-radius: 100%; + background-color: #fff; + outline-color: #fff; + outline-offset: 0px; + outline-style: solid; + outline-width: 7px; + -webkit-transition: box-shadow 200ms ease, border-color 200ms ease; + transition: box-shadow 200ms ease, border-color 200ms ease; + color: #585858; + text-align: center; + cursor: pointer; + } + + .round-frame:hover { + border-color: #c2c2c2; + box-shadow: none; + } + + .round-frame.top-left { + left: 4%; + top: 15%; + right: auto; + bottom: auto; + } + + .round-frame.bottom-left { + left: 4%; + bottom: 15%; + } + + .round-frame.top-right { + left: auto; + top: 15%; + right: 4%; + bottom: auto; + } + + .round-frame.bottom-right { + left: auto; + top: auto; + right: 4%; + bottom: 15%; + padding: 10px; + } + + .side-holder { + z-index: -1; + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + height: 630px; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: center; + -webkit-justify-content: center; + -ms-flex-pack: center; + justify-content: center; + } + + .infra-icon { + width: 25px; + height: 25px; + } + + .div-block { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: flex; + height: 100%; + -webkit-box-orient: vertical; + -webkit-box-direction: normal; + -webkit-flex-direction: column; + -ms-flex-direction: column; + flex-direction: column; + -webkit-box-pack: justify; + -webkit-justify-content: space-between; + -ms-flex-pack: justify; + justify-content: space-between; + } + + #w-node-a2a9b648-f5dd-74e5-e1c2-f7aaf4fa1fcd-46672785 { + -ms-grid-column: span 1; + grid-column-start: span 1; + -ms-grid-column-span: 1; + grid-column-end: span 1; + -ms-grid-row: span 1; + grid-row-start: span 1; + -ms-grid-row-span: 1; + grid-row-end: span 1; + } + + #w-node-_466aa2bf-88bf-5a65-eab4-fc1eb95e7384-46672785 { + -ms-grid-column: 
span 1; + grid-column-start: span 1; + -ms-grid-column-span: 1; + grid-column-end: span 1; + -ms-grid-row: span 1; + grid-row-start: span 1; + -ms-grid-row-span: 1; + grid-row-end: span 1; + } + + #w-node-_87009ba3-d9a6-e0b7-4cce-581190a19cf3-46672785 { + -ms-grid-column: span 1; + grid-column-start: span 1; + -ms-grid-column-span: 1; + grid-column-end: span 1; + -ms-grid-row: span 1; + grid-row-start: span 1; + -ms-grid-row-span: 1; + grid-row-end: span 1; + } + + #w-node-_4a479fbb-90c7-9f47-d439-20aa6a224339-46672785 { + -ms-grid-column: span 1; + grid-column-start: span 1; + -ms-grid-column-span: 1; + grid-column-end: span 1; + -ms-grid-row: span 1; + grid-row-start: span 1; + -ms-grid-row-span: 1; + grid-row-end: span 1; + } + + + /* + + + inherited from the original template + + */ + + .w-container .w-row { + margin-left: -10px; + margin-right: -10px; + } + .w-row:before, + .w-row:after { + content: " "; + display: table; + grid-column-start: 1; + grid-row-start: 1; + grid-column-end: 2; + grid-row-end: 2; + } + .w-row:after { + clear: both; + } + .w-row .w-row { + margin-left: 0; + margin-right: 0; + } + .w-col { + position: relative; + float: left; + width: 100%; + min-height: 1px; + padding-left: 10px; + padding-right: 10px; + } + .w-col .w-col { + padding-left: 0; + padding-right: 0; + } + .w-col-1 { + width: 8.33333333%; + } + .w-col-2 { + width: 16.66666667%; + } + .w-col-3 { + width: 25%; + } + .w-col-4 { + width: 33.33333333%; + } + .w-col-5 { + width: 41.66666667%; + } + .w-col-6 { + width: 50%; + } + .w-col-7 { + width: 58.33333333%; + } + .w-col-8 { + width: 66.66666667%; + } + .w-col-9 { + width: 75%; + } + .w-col-10 { + width: 83.33333333%; + } + .w-col-11 { + width: 91.66666667%; + } + .w-col-12 { + width: 100%; + } + .w-hidden-main { + display: none !important; + } + @media screen and (max-width: 991px) { + .w-container { + max-width: 728px; + } + .w-hidden-main { + display: inherit !important; + } + .w-hidden-medium { + display: none !important; + } + .w-col-medium-1 { + width: 8.33333333%; + } + .w-col-medium-2 { + width: 16.66666667%; + } + .w-col-medium-3 { + width: 25%; + } + .w-col-medium-4 { + width: 33.33333333%; + } + .w-col-medium-5 { + width: 41.66666667%; + } + .w-col-medium-6 { + width: 50%; + } + .w-col-medium-7 { + width: 58.33333333%; + } + .w-col-medium-8 { + width: 66.66666667%; + } + .w-col-medium-9 { + width: 75%; + } + .w-col-medium-10 { + width: 83.33333333%; + } + .w-col-medium-11 { + width: 91.66666667%; + } + .w-col-medium-12 { + width: 100%; + } + .w-col-stack { + width: 100%; + left: auto; + right: auto; + } + } + @media screen and (max-width: 767px) { + .w-hidden-main { + display: inherit !important; + } + .w-hidden-medium { + display: inherit !important; + } + .w-hidden-small { + display: none !important; + } + .w-row, + .w-container .w-row { + margin-left: 0; + margin-right: 0; + } + .w-col { + width: 100%; + left: auto; + right: auto; + } + .w-col-small-1 { + width: 8.33333333%; + } + .w-col-small-2 { + width: 16.66666667%; + } + .w-col-small-3 { + width: 25%; + } + .w-col-small-4 { + width: 33.33333333%; + } + .w-col-small-5 { + width: 41.66666667%; + } + .w-col-small-6 { + width: 50%; + } + .w-col-small-7 { + width: 58.33333333%; + } + .w-col-small-8 { + width: 66.66666667%; + } + .w-col-small-9 { + width: 75%; + } + .w-col-small-10 { + width: 83.33333333%; + } + .w-col-small-11 { + width: 91.66666667%; + } + .w-col-small-12 { + width: 100%; + } + } + @media screen and (max-width: 479px) { + .w-container { + max-width: none; + } + 
.w-hidden-main { + display: inherit !important; + } + .w-hidden-medium { + display: inherit !important; + } + .w-hidden-small { + display: inherit !important; + } + .w-hidden-tiny { + display: none !important; + } + .w-col { + width: 100%; + } + .w-col-tiny-1 { + width: 8.33333333%; + } + .w-col-tiny-2 { + width: 16.66666667%; + } + .w-col-tiny-3 { + width: 25%; + } + .w-col-tiny-4 { + width: 33.33333333%; + } + .w-col-tiny-5 { + width: 41.66666667%; + } + .w-col-tiny-6 { + width: 50%; + } + .w-col-tiny-7 { + width: 58.33333333%; + } + .w-col-tiny-8 { + width: 66.66666667%; + } + .w-col-tiny-9 { + width: 75%; + } + .w-col-tiny-10 { + width: 83.33333333%; + } + .w-col-tiny-11 { + width: 91.66666667%; + } + .w-col-tiny-12 { + width: 100%; + } + } diff --git a/hsfs/docs/css/version-select.css b/hsfs/docs/css/version-select.css new file mode 100644 index 000000000..3b908ae84 --- /dev/null +++ b/hsfs/docs/css/version-select.css @@ -0,0 +1,36 @@ +@media only screen and (max-width:76.1875em) { +} + +#version-selector select.form-control { + appearance: none; + -webkit-appearance: none; + -moz-appearance: none; + + background-color: #F5F5F5; + + background-position: center right; + background-repeat: no-repeat; + border: 0px; + border-radius: 2px; + /* box-shadow: 0px 1px 3px rgb(0 0 0 / 10%); */ + color: inherit; + width: -webkit-fill-available; + width: -moz-available; + max-width: 200px; + font-size: inherit; + /* font-weight: 600; */ + margin: 10px; + overflow: hidden; + padding: 7px 10px; + text-overflow: ellipsis; + white-space: nowrap; +} + +#version-selector::after { + content: '⌄'; + font-family: inherit; + font-size: 22px; + margin: -35px; + vertical-align: 7%; + padding-bottom: 10px; +} diff --git a/hsfs/docs/index.md b/hsfs/docs/index.md new file mode 100644 index 000000000..a13ea2ce5 --- /dev/null +++ b/hsfs/docs/index.md @@ -0,0 +1,201 @@ +# Hopsworks Feature Store + +

+ Hopsworks Community + Hopsworks Feature Store Documentation + python + PyPiStatus + Scala/Java Artifacts + Downloads + Ruff + License +

+
+HSFS is the library to interact with the Hopsworks Feature Store. The library makes creating new features, feature groups and training datasets easy.
+
+The library is environment independent and can be used in two modes:
+
+- Spark mode: For data engineering jobs that create and write features into the feature store or generate training datasets. It requires a Spark environment such as the one provided in the Hopsworks platform or Databricks. In Spark mode, HSFS provides bindings both for Python and JVM languages.
+
+- Python mode: For data science jobs to explore the features available in the feature store, generate training datasets and feed them into a training pipeline. Python mode requires just a Python interpreter and can be used in Hopsworks from Python Jobs/Jupyter Kernels, as well as in Amazon SageMaker or KubeFlow.
+
+The library automatically configures itself based on the environment it is run in.
+However, to connect from an external environment such as Databricks or AWS SageMaker,
+additional connection information, such as host and port, is required. For more information, check out the [Hopsworks documentation](https://docs.hopsworks.ai/latest/).
+
+## Getting Started On Hopsworks
+
+Get started easily by registering an account on [Hopsworks Serverless](https://app.hopsworks.ai/). Create your project and a [new API key](https://docs.hopsworks.ai/latest/user_guides/projects/api_key/create_api_key/). In a new Python environment with Python 3.8 or higher, install the [client library](https://docs.hopsworks.ai/latest/user_guides/client_installation/) using pip:
+
+```bash
+# Get all Hopsworks SDKs: Feature Store, Model Serving and Platform SDK
+pip install hopsworks
+# or minimum install with the Feature Store SDK
+pip install hsfs[python]
+# if using zsh don't forget the quotes
+pip install 'hsfs[python]'
+```
+
+You can start a notebook, instantiate a connection, and get the project feature store handle.
+
+```python
+import hopsworks
+
+project = hopsworks.login() # you will be prompted for your api key
+fs = project.get_feature_store()
+```
+
+or using `hsfs` directly:
+
+```python
+import hsfs
+
+connection = hsfs.connection(
+    host="c.app.hopsworks.ai", #
+    project="your-project",
+    api_key_value="your-api-key",
+)
+fs = connection.get_feature_store()
+```
+
+Create a new feature group to start inserting feature values.
+```python
+fg = fs.create_feature_group("rain",
+                             version=1,
+                             description="Rain features",
+                             primary_key=['date', 'location_id'],
+                             online_enabled=True)
+
+fg.save(dataframe)
+```
+
+Upsert new data into the feature group with `time_travel_format="HUDI"`.
+```python
+fg.insert(upsert_df)
+```
+
+Retrieve commit timeline metadata of the feature group with `time_travel_format="HUDI"`.
+```python
+fg.commit_details()
+```
+
+Read the feature group as of a specific point in time.
+```python
+fg = fs.get_feature_group("rain", 1)
+fg.read("2020-10-20 07:34:11").show()
+```
+
+Read updates that occurred between specified points in time.
+```python
+fg = fs.get_feature_group("rain", 1)
+fg.read_changes("2020-10-20 07:31:38", "2020-10-20 07:34:11").show()
+```
+
+Join features together:
+```python
+feature_join = (rain_fg.select_all()
+    .join(temperature_fg.select_all(), on=["date", "location_id"])
+    .join(location_fg.select_all()))
+feature_join.show(5)
+```
+
+Join feature groups as of a specific point in time:
+```python
+feature_join = (rain_fg.select_all()
+    .join(temperature_fg.select_all(), on=["date", "location_id"])
+    .join(location_fg.select_all())
+    .as_of("2020-10-31"))
+feature_join.show(5)
+```
+
+Join feature groups as of different points in time:
+```python
+rain_fg_q = rain_fg.select_all().as_of("2020-10-20 07:41:43")
+temperature_fg_q = temperature_fg.select_all().as_of("2020-10-20 07:32:33")
+location_fg_q = location_fg.select_all().as_of("2020-10-20 07:33:08")
+joined_features_q = rain_fg_q.join(temperature_fg_q).join(location_fg_q)
+```
+
+Use the query object to create a training dataset:
+```python
+td = fs.create_training_dataset("rain_dataset",
+                                version=1,
+                                data_format="tfrecords",
+                                description="A test training dataset saved in TfRecords format",
+                                splits={'train': 0.7, 'test': 0.2, 'validate': 0.1})
+
+td.save(feature_join)
+```
+
+A short introduction to the Scala API:
+```scala
+import com.logicalclocks.hsfs._
+val connection = HopsworksConnection.builder().build()
+val fs = connection.getFeatureStore();
+val attendances_features_fg = fs.getFeatureGroup("games_features", 1);
+attendances_features_fg.show(1)
+```
+
+You can find more examples on how to use the library in our [hops-examples](https://github.com/logicalclocks/hops-examples) repository.
+
+## Usage
+
+Usage data is collected to improve the quality of the library. It is turned on by default if the backend
+is "c.app.hopsworks.ai". To turn it off, use one of the following ways:
+```python
+# use environment variable
+import os
+os.environ["ENABLE_HOPSWORKS_USAGE"] = "false"
+
+# use `disable_usage_logging`
+import hsfs
+hsfs.disable_usage_logging()
+```
+
+The source code can be found in python/hsfs/usage.py.
+
+## Documentation
+
+Documentation is available at [Hopsworks Feature Store Documentation](https://docs.hopsworks.ai/).
+
+## Issues
+
+For general questions about the usage of Hopsworks and the Feature Store, please open a topic on [Hopsworks Community](https://community.hopsworks.ai/).
+
+Please report any issue using [GitHub issue tracking](https://github.com/logicalclocks/feature-store-api/issues).
+
+Please attach the client environment from the output below to the issue:
+```python
+import hopsworks
+import hsfs
+hopsworks.login().get_feature_store()
+print(hsfs.get_env())
+```
+
+## Contributing
+
+If you would like to contribute to this library, please see the [Contribution Guidelines](CONTRIBUTING.md).
diff --git a/hsfs/docs/js/dropdown.js b/hsfs/docs/js/dropdown.js new file mode 100644 index 000000000..2618e0ce7 --- /dev/null +++ b/hsfs/docs/js/dropdown.js @@ -0,0 +1,2 @@ +document.getElementsByClassName("md-tabs__link")[7].style.display = "none"; +document.getElementsByClassName("md-tabs__link")[9].style.display = "none"; diff --git a/hsfs/docs/js/inject-api-links.js b/hsfs/docs/js/inject-api-links.js new file mode 100644 index 000000000..aa5852283 --- /dev/null +++ b/hsfs/docs/js/inject-api-links.js @@ -0,0 +1,32 @@ +window.addEventListener("DOMContentLoaded", function () { + var windowPathNameSplits = window.location.pathname.split("/"); + var majorVersionRegex = new RegExp("(\\d+[.]\\d+)") + var latestRegex = new RegExp("latest"); + if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/3.0 - URL contains major version + // Version API dropdown + document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/generated/api/login/"; + document.getElementById("hsfs_api_link").href = "https://docs.hopsworks.ai/feature-store-api/" + windowPathNameSplits[1] + "/generated/api/connection_api/"; + document.getElementById("hsml_api_link").href = "https://docs.hopsworks.ai/machine-learning-api/" + windowPathNameSplits[1] + "/generated/connection_api/"; + } else { // on docs.hopsworks.api/feature-store-api/3.0 / docs.hopsworks.api/hopsworks-api/3.0 / docs.hopsworks.api/machine-learning-api/3.0 + if (latestRegex.test(windowPathNameSplits[2]) || latestRegex.test(windowPathNameSplits[1])) { + var majorVersion = "latest"; + } else { + + var apiVersion = windowPathNameSplits[2]; + var majorVersion = apiVersion.match(majorVersionRegex)[0]; + } + // Version main navigation + document.getElementsByClassName("md-tabs__link")[0].href = "https://docs.hopsworks.ai/" + majorVersion; + document.getElementsByClassName("md-tabs__link")[1].href = "https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/quickstart.ipynb"; + document.getElementsByClassName("md-tabs__link")[2].href = "https://docs.hopsworks.ai/" + majorVersion + "/tutorials/"; + document.getElementsByClassName("md-tabs__link")[3].href = "https://docs.hopsworks.ai/" + majorVersion + "/concepts/hopsworks/"; + document.getElementsByClassName("md-tabs__link")[4].href = "https://docs.hopsworks.ai/" + majorVersion + "/user_guides/"; + document.getElementsByClassName("md-tabs__link")[5].href = "https://docs.hopsworks.ai/" + majorVersion + "/setup_installation/aws/getting_started/"; + document.getElementsByClassName("md-tabs__link")[6].href = "https://docs.hopsworks.ai/" + majorVersion + "/admin/"; + // Version API dropdown + document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/generated/api/login/"; + document.getElementById("hsfs_api_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/generated/api/connection_api/"; + document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/javadoc"; + document.getElementById("hsml_api_link").href = "https://docs.hopsworks.ai/machine-learning-api/" + majorVersion + "/generated/connection_api/"; + } +}); diff --git a/hsfs/docs/js/version-select.js b/hsfs/docs/js/version-select.js new file mode 100644 index 000000000..fcac029e3 --- /dev/null +++ b/hsfs/docs/js/version-select.js @@ -0,0 +1,64 @@ +window.addEventListener("DOMContentLoaded", 
function() { + // This is a bit hacky. Figure out the base URL from a known CSS file the + // template refers to... + var ex = new RegExp("/?css/version-select.css$"); + var sheet = document.querySelector('link[href$="version-select.css"]'); + + var ABS_BASE_URL = sheet.href.replace(ex, ""); + var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); + + function makeSelect(options, selected) { + var select = document.createElement("select"); + select.classList.add("form-control"); + + options.forEach(function(i) { + var option = new Option(i.text, i.value, undefined, + i.value === selected); + select.add(option); + }); + + return select; + } + + var xhr = new XMLHttpRequest(); + xhr.open("GET", ABS_BASE_URL + "/../versions.json"); + xhr.onload = function() { + var versions = JSON.parse(this.responseText); + + var realVersion = versions.find(function(i) { + return i.version === CURRENT_VERSION || + i.aliases.includes(CURRENT_VERSION); + }).version; + var latestVersion = versions.find(function(i) { + return i.aliases.includes("latest"); + }).version; + let outdated_banner = document.querySelector('div[data-md-color-scheme="default"][data-md-component="outdated"]'); + if (realVersion !== latestVersion) { + outdated_banner.removeAttribute("hidden"); + } else { + outdated_banner.setAttribute("hidden", ""); + } + + var select = makeSelect(versions.map(function(i) { + var allowedAliases = ["dev", "latest"] + if (i.aliases.length > 0) { + var aliasString = " [" + i.aliases.filter(function (str) { return allowedAliases.includes(str); }).join(", ") + "]"; + } else { + var aliasString = ""; + } + return {text: i.title + aliasString, value: i.version}; + }), realVersion); + select.addEventListener("change", function(event) { + window.location.href = ABS_BASE_URL + "/../" + this.value + "/generated/api/connection_api/"; + }); + + var container = document.createElement("div"); + container.id = "version-selector"; + // container.className = "md-nav__item"; + container.appendChild(select); + + var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); + sidebar.parentNode.insertBefore(container, sidebar.nextSibling); + }; + xhr.send(); +}); diff --git a/hsfs/docs/overrides/main.html b/hsfs/docs/overrides/main.html new file mode 100644 index 000000000..ecb09de07 --- /dev/null +++ b/hsfs/docs/overrides/main.html @@ -0,0 +1,8 @@ +{% extends "base.html" %} + +{% block outdated %} +You're not viewing the latest version of the documentation. + + Click here to go to latest. 
+ +{% endblock %} diff --git a/hsfs/docs/templates/api/connection_api.md b/hsfs/docs/templates/api/connection_api.md new file mode 100644 index 000000000..19e13f3eb --- /dev/null +++ b/hsfs/docs/templates/api/connection_api.md @@ -0,0 +1,11 @@ +# Connection + +{{connection}} + +## Properties + +{{connection_properties}} + +## Methods + +{{connection_methods}} diff --git a/hsfs/docs/templates/api/embedding_feature_api.md b/hsfs/docs/templates/api/embedding_feature_api.md new file mode 100644 index 000000000..c054672d0 --- /dev/null +++ b/hsfs/docs/templates/api/embedding_feature_api.md @@ -0,0 +1,7 @@ +# EmbeddingFeature + +{{embedding_feature}} + +## Properties + +{{embedding_feature_properties}} diff --git a/hsfs/docs/templates/api/embedding_index_api.md b/hsfs/docs/templates/api/embedding_index_api.md new file mode 100644 index 000000000..d336e0ddb --- /dev/null +++ b/hsfs/docs/templates/api/embedding_index_api.md @@ -0,0 +1,12 @@ +# EmbeddingIndex + +{{embedding_index}} + +## Properties + +{{embedding_index_properties}} + +## Methods + +{{embedding_index_methods}} + diff --git a/hsfs/docs/templates/api/expectation_api.md b/hsfs/docs/templates/api/expectation_api.md new file mode 100644 index 000000000..7ba4110c1 --- /dev/null +++ b/hsfs/docs/templates/api/expectation_api.md @@ -0,0 +1,20 @@ +# Expectation + +{{expectation}} + +## Properties + +{{expectation_properties}} + +## Methods + +{{expectation_methods}} + +## Creation +{{expectation_create}} + +## Retrieval + +{{expectation_getall}} + +{{expectation_get}} diff --git a/hsfs/docs/templates/api/expectation_suite_api.md b/hsfs/docs/templates/api/expectation_suite_api.md new file mode 100644 index 000000000..a07ac5f8a --- /dev/null +++ b/hsfs/docs/templates/api/expectation_suite_api.md @@ -0,0 +1,41 @@ +# Expectation Suite + +{{expectation_suite}} + +## Creation with Great Expectations + +```python3 +import great_expectations as ge + +expectation_suite = ge.core.ExpectationSuite( + "new_expectation_suite", + expectations=[ + ge.core.ExpectationConfiguration( + expectation_type="expect_column_max_to_be_between", + kwargs={ + "column": "feature", + "min_value": -1, + "max_value": 1 + } + ) + ] +) +``` + +## Attach to Feature Group + +{{expectation_suite_attach}} + +## Single Expectation API + +An API to edit the expectation list based on Great Expectations API. 
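+
+As a rough sketch of how such an edit might look, assuming a feature group `fg` with an attached suite (see the generated reference below for the exact method names and signatures):
+
+```python
+import great_expectations as ge
+
+# fetch the suite attached to the feature group and append a new expectation
+suite = fg.get_expectation_suite()
+suite.add_expectation(
+    ge.core.ExpectationConfiguration(
+        expectation_type="expect_column_values_to_not_be_null",
+        kwargs={"column": "feature"},
+    )
+)
+```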
+ +{{single_expectation_api}} + +## Properties + +{{expectation_suite_properties}} + +## Methods + +{{expectation_suite_methods}} diff --git a/hsfs/docs/templates/api/external_feature_group_api.md b/hsfs/docs/templates/api/external_feature_group_api.md new file mode 100644 index 000000000..a982a39e8 --- /dev/null +++ b/hsfs/docs/templates/api/external_feature_group_api.md @@ -0,0 +1,19 @@ +# ExternalFeatureGroup + +{{fg}} + +## Creation + +{{fg_create}} + +## Retrieval + +{{fg_get}} + +## Properties + +{{fg_properties}} + +## Methods + +{{fg_methods}} diff --git a/hsfs/docs/templates/api/feature_api.md b/hsfs/docs/templates/api/feature_api.md new file mode 100644 index 000000000..8dca5ef54 --- /dev/null +++ b/hsfs/docs/templates/api/feature_api.md @@ -0,0 +1,11 @@ +# Feature + +{{feature}} + +## Properties + +{{feature_properties}} + +## Methods + +{{feature_methods}} diff --git a/hsfs/docs/templates/api/feature_descriptive_statistics_api.md b/hsfs/docs/templates/api/feature_descriptive_statistics_api.md new file mode 100644 index 000000000..3be8cccd3 --- /dev/null +++ b/hsfs/docs/templates/api/feature_descriptive_statistics_api.md @@ -0,0 +1,7 @@ +# Feature Descriptive Statistics + +{{feature_descriptive_statistics}} + +## Properties + +{{feature_descriptive_statistics_properties}} diff --git a/hsfs/docs/templates/api/feature_group_api.md b/hsfs/docs/templates/api/feature_group_api.md new file mode 100644 index 000000000..372865c4b --- /dev/null +++ b/hsfs/docs/templates/api/feature_group_api.md @@ -0,0 +1,19 @@ +# FeatureGroup + +{{fg}} + +## Creation + +{{fg_create}} + +## Retrieval + +{{fg_get}} + +## Properties + +{{fg_properties}} + +## Methods + +{{fg_methods}} diff --git a/hsfs/docs/templates/api/feature_monitoring_config_api.md b/hsfs/docs/templates/api/feature_monitoring_config_api.md new file mode 100644 index 000000000..7ca9b46ff --- /dev/null +++ b/hsfs/docs/templates/api/feature_monitoring_config_api.md @@ -0,0 +1,27 @@ +# Feature Monitoring Configuration + +{{feature_monitoring_config}} + +## Creation from Feature Group + +{{feature_monitoring_config_creation_fg}} + +## Creation from Feature View + +{{feature_monitoring_config_creation_fv}} + +## Retrieval from Feature Group + +{{feature_monitoring_config_retrieval_fg}} + +## Retrieval from Feature View + +{{feature_monitoring_config_retrieval_fv}} + +## Properties + +{{feature_monitoring_config_properties}} + +## Methods + +{{feature_monitoring_config_methods}} diff --git a/hsfs/docs/templates/api/feature_monitoring_result_api.md b/hsfs/docs/templates/api/feature_monitoring_result_api.md new file mode 100644 index 000000000..5bfca1165 --- /dev/null +++ b/hsfs/docs/templates/api/feature_monitoring_result_api.md @@ -0,0 +1,11 @@ +# Feature Monitoring Result + +{{feature_monitoring_result}} + +## Retrieval + +{{feature_monitoring_result_retrieval}} + +## Properties + +{{feature_monitoring_result_properties}} diff --git a/hsfs/docs/templates/api/feature_monitoring_window_config_api.md b/hsfs/docs/templates/api/feature_monitoring_window_config_api.md new file mode 100644 index 000000000..53ef23ea2 --- /dev/null +++ b/hsfs/docs/templates/api/feature_monitoring_window_config_api.md @@ -0,0 +1,7 @@ +# Feature Monitoring Window Configuration + +{{feature_monitoring_window_config}} + +## Properties + +{{feature_monitoring_window_config_properties}} diff --git a/hsfs/docs/templates/api/feature_store_api.md b/hsfs/docs/templates/api/feature_store_api.md new file mode 100644 index 000000000..f859336f6 --- /dev/null +++ 
b/hsfs/docs/templates/api/feature_store_api.md @@ -0,0 +1,15 @@ +# Feature Store + +{{fs}} + +## Retrieval + +{{fs_get}} + +## Properties + +{{fs_properties}} + +## Methods + +{{fs_methods}} diff --git a/hsfs/docs/templates/api/feature_view_api.md b/hsfs/docs/templates/api/feature_view_api.md new file mode 100644 index 000000000..c0f7df954 --- /dev/null +++ b/hsfs/docs/templates/api/feature_view_api.md @@ -0,0 +1,21 @@ +# Feature View + +{{fv}} + +## Creation + +{{fv_create}} + +## Retrieval + +{{fv_get}} + +{{fvs_get}} + +## Properties + +{{fv_properties}} + +## Methods + +{{fv_methods}} diff --git a/hsfs/docs/templates/api/job.md b/hsfs/docs/templates/api/job.md new file mode 100644 index 000000000..9ad68d976 --- /dev/null +++ b/hsfs/docs/templates/api/job.md @@ -0,0 +1,11 @@ +# Job + +{{job}} + +## Methods + +{{job_methods}} + +## Job Configuration + +{{job_configuration}} diff --git a/hsfs/docs/templates/api/links.md b/hsfs/docs/templates/api/links.md new file mode 100644 index 000000000..62cdc7001 --- /dev/null +++ b/hsfs/docs/templates/api/links.md @@ -0,0 +1,14 @@ +# Provenance Links + +Provenance Links are objects returned by methods such as [get_feature_groups_provenance](../storage_connector_api/#get_feature_groups_provenance), [get_storage_connector_provenance](../feature_group_api/#get_storage_connector_provenance), [get_parent_feature_group](../feature_group_api/#get_parent_feature_groups), [get_generated_feature_groups](../feature_group_api/#get_generated_feature_groups), [get_generated_feature_views](../feature_group_api/#get_generated_feature_views) [get_models_provenance](../feature_view_api/#get_models_provenance) and represent sections of the provenance graph, depending on the method invoked. + +## Properties + +{{links_properties}} + +# Artifact + +Artifacts objects are part of the provenance graph and contain a minimal set of information regarding the entities (feature groups, feature views) they represent. +The provenance graph contains Artifact objects when the underlying entities have been deleted or they are corrupted or they are not accessible by the user. + +{{artifact_properties}} diff --git a/hsfs/docs/templates/api/query_api.md b/hsfs/docs/templates/api/query_api.md new file mode 100644 index 000000000..7cc664d96 --- /dev/null +++ b/hsfs/docs/templates/api/query_api.md @@ -0,0 +1,13 @@ +# Query + +Query objects are strictly generated by HSFS APIs called on [Feature Group objects](feature_group_api.md). +Users will never construct a Query object using the constructor of the class. +For this reason we do not provide the full documentation of the class here. 
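+
+For orientation, a minimal sketch of how a Query is typically obtained and used (feature group names are illustrative):
+
+```python
+# a Query is created from feature group selections and joins, never via a constructor
+query = rain_fg.select_all().join(temperature_fg.select(["temp"]), on=["date", "location_id"])
+df = query.read()  # materialize the query into a dataframe
+```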
+ +## Methods + +{{query_methods}} + +## Properties + +{{query_properties}} diff --git a/hsfs/docs/templates/api/rule_api.md b/hsfs/docs/templates/api/rule_api.md new file mode 100644 index 000000000..0801e3954 --- /dev/null +++ b/hsfs/docs/templates/api/rule_api.md @@ -0,0 +1,7 @@ +# Rule + +{{rule}} + +## Properties + +{{rule_properties}} diff --git a/hsfs/docs/templates/api/rule_definition_api.md b/hsfs/docs/templates/api/rule_definition_api.md new file mode 100644 index 000000000..326b66db0 --- /dev/null +++ b/hsfs/docs/templates/api/rule_definition_api.md @@ -0,0 +1,13 @@ +# Rule Definition + +{{ruledefinition}} + +## Properties + +{{ruledefinition_properties}} + +## Retrieval + +{{ruledefinition_getall}} + +{{ruledefinition_get}} diff --git a/hsfs/docs/templates/api/similarity_function_type_api.md b/hsfs/docs/templates/api/similarity_function_type_api.md new file mode 100644 index 000000000..bdfbc51c2 --- /dev/null +++ b/hsfs/docs/templates/api/similarity_function_type_api.md @@ -0,0 +1,3 @@ +# SimilarityFunctionType + +{{similarity_function_type}} diff --git a/hsfs/docs/templates/api/spine_group_api.md b/hsfs/docs/templates/api/spine_group_api.md new file mode 100644 index 000000000..a2bdf119c --- /dev/null +++ b/hsfs/docs/templates/api/spine_group_api.md @@ -0,0 +1,19 @@ +# SpineGroup + +{{fg}} + +## Creation + +{{fg_create}} + +## Retrieval + +{{fg_get}} + +## Properties + +{{fg_properties}} + +## Methods + +{{fg_methods}} diff --git a/hsfs/docs/templates/api/split_statistics_api.md b/hsfs/docs/templates/api/split_statistics_api.md new file mode 100644 index 000000000..09053ac5d --- /dev/null +++ b/hsfs/docs/templates/api/split_statistics_api.md @@ -0,0 +1,7 @@ +# Split Statistics + +{{split_statistics}} + +## Properties + +{{split_statistics_properties}} diff --git a/hsfs/docs/templates/api/statistics_api.md b/hsfs/docs/templates/api/statistics_api.md new file mode 100644 index 000000000..27ed90c9d --- /dev/null +++ b/hsfs/docs/templates/api/statistics_api.md @@ -0,0 +1,7 @@ +# Statistics + +{{statistics}} + +## Properties + +{{statistics_properties}} diff --git a/hsfs/docs/templates/api/statistics_config_api.md b/hsfs/docs/templates/api/statistics_config_api.md new file mode 100644 index 000000000..a907d1d32 --- /dev/null +++ b/hsfs/docs/templates/api/statistics_config_api.md @@ -0,0 +1,7 @@ +# StatisticsConfig + +{{statistics_config}} + +## Properties + +{{statistics_config_properties}} diff --git a/hsfs/docs/templates/api/storage_connector_api.md b/hsfs/docs/templates/api/storage_connector_api.md new file mode 100644 index 000000000..1b390e72a --- /dev/null +++ b/hsfs/docs/templates/api/storage_connector_api.md @@ -0,0 +1,119 @@ +# Storage Connector + +## Retrieval + +{{sc_get}} + +## HopsFS + +### Properties + +{{hopsfs_properties}} + +### Methods + +{{hopsfs_methods}} + +## JDBC + +### Properties + +{{jdbc_properties}} + +### Methods + +{{jdbc_methods}} + +## S3 + +### Properties + +{{s3_properties}} + +### Methods + +{{s3_methods}} + +## Redshift + +### Properties + +{{redshift_properties}} + +### Methods + +{{redshift_methods}} + +## Azure Data Lake Storage + +### Properties + +{{adls_properties}} + +### Methods + +{{adls_methods}} + +## Snowflake + +### Properties + +{{snowflake_properties}} + +### Methods + +{{snowflake_methods}} + +## Google Cloud Storage +This storage connector provides integration to Google Cloud Storage (GCS). +Once you create a connector in FeatureStore, you can transact data from a GCS bucket into a spark dataframe +by calling the `read` API. 
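+
+A minimal sketch of that flow, assuming a feature store handle `fs` and a connector named `my_gcs_connector` created beforehand (the exact `read` arguments may vary by version):
+
+```python
+# retrieve the storage connector registered in the feature store
+gcs_connector = fs.get_storage_connector("my_gcs_connector")
+
+# read files from the bucket into a Spark dataframe
+df = gcs_connector.read(data_format="csv", path="gs://my-bucket/path/to/data")
+```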
+ +Authentication to GCP is handled by uploading the `JSON keyfile for service account` to the Hopsworks Project. For more information +on service accounts and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account +'creating service account keyfile') + +The connector also supports the optional encryption method `Customer Supplied Encryption Key` by Google. +The encryption details are stored as `Secrets` in the FeatureStore for keeping it secure. +Read more about encryption on [Google Documentation.](https://cloud.google.com/storage/docs/encryption#customer-supplied_encryption_keys) + +The storage connector uses the Google `gcs-connector-hadoop` behind the scenes. For more information, check out [Google Cloud Storage Connector for Spark and Hadoop]( +https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop') + +### Properties + +{{gcs_properties}} + +### Methods + +{{gcs_methods}} + +## BigQuery +The BigQuery storage connector provides integration to Google Cloud BigQuery. +You can use it to run bigquery on your GCP cluster and load results into spark dataframe by calling the `read` API. + +Authentication to GCP is handled by uploading the `JSON keyfile for service account` to the Hopsworks Project. For more information +on service accounts and creating keyfile in GCP, read [Google Cloud documentation.](https://cloud.google.com/docs/authentication/production#create_service_account +'creating service account keyfile') + +The storage connector uses the Google `spark-bigquery-connector` behind the scenes. +To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage +'github.com/GoogleCloudDataproc/spark-bigquery-connector') + +### Properties + +{{bigquery_properties}} + +### Methods + +{{bigquery_methods}} + +## Kafka + +### Properties + +{{kafka_properties}} + +### Methods + +{{kafka_methods}} diff --git a/hsfs/docs/templates/api/training_dataset_api.md b/hsfs/docs/templates/api/training_dataset_api.md new file mode 100644 index 000000000..a53696465 --- /dev/null +++ b/hsfs/docs/templates/api/training_dataset_api.md @@ -0,0 +1,19 @@ +# Training Dataset + +{{td}} + +## Creation + +{{td_create}} + +## Retrieval + +{{td_get}} + +## Properties + +{{td_properties}} + +## Methods + +{{td_methods}} diff --git a/hsfs/docs/templates/api/transformation_functions_api.md b/hsfs/docs/templates/api/transformation_functions_api.md new file mode 100644 index 000000000..249262a45 --- /dev/null +++ b/hsfs/docs/templates/api/transformation_functions_api.md @@ -0,0 +1,20 @@ +# Transformation Function + +{{transformation_function}} + +## Properties + +{{transformation_function_properties}} + +## Methods + +{{transformation_function_methods}} + +## Creation +{{create_transformation_function}} + +## Retrieval + +{{get_transformation_function}} + +{{get_transformation_functions}} diff --git a/hsfs/docs/templates/api/validation_api.md b/hsfs/docs/templates/api/validation_api.md new file mode 100644 index 000000000..8e1512f34 --- /dev/null +++ b/hsfs/docs/templates/api/validation_api.md @@ -0,0 +1,18 @@ +# Validation + +{{validation_result}} + +## Properties + +{{validation_result_properties}} + +## Methods + +{{expectation_methods}} + +## Validate a dataframe +{{validate}} + +## 
Retrieval + +{{validation_result_get}} diff --git a/hsfs/docs/templates/api/validation_report_api.md b/hsfs/docs/templates/api/validation_report_api.md new file mode 100644 index 000000000..435a87a03 --- /dev/null +++ b/hsfs/docs/templates/api/validation_report_api.md @@ -0,0 +1,19 @@ +# Validation Report + +{{validation_report}} + +## Creation + +{{validation_report_validate}} + +## Retrieval + +{{validation_report_get}} + +## Properties + +{{validation_report_properties}} + +## Methods + +{{validation_report_methods}} diff --git a/hsfs/java/beam/pom.xml b/hsfs/java/beam/pom.xml new file mode 100644 index 000000000..3b3f902ca --- /dev/null +++ b/hsfs/java/beam/pom.xml @@ -0,0 +1,55 @@ + + + + hsfs-parent + com.logicalclocks + 4.0.0-SNAPSHOT + + 4.0.0 + + hsfs-beam + + + 2.48.0 + 3.4.0 + + + + + com.logicalclocks + hsfs + ${project.version} + compile + + + javax.xml.bind + jaxb-api + + + + + + + org.apache.beam + beam-sdks-java-core + ${beam.version} + + + + + org.apache.beam + beam-sdks-java-io-kafka + ${beam.version} + + + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + + diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java new file mode 100644 index 000000000..fd93052a3 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.FeatureStoreBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.TimeTravelFormat; +import com.logicalclocks.hsfs.TrainingDatasetBase; +import com.logicalclocks.hsfs.beam.constructor.Query; +import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.beam.engine.FeatureViewEngine; +import com.logicalclocks.hsfs.metadata.StorageConnectorApi; +import lombok.NonNull; + +import java.io.IOException; +import java.util.List; + +public class FeatureStore extends FeatureStoreBase { + + private FeatureGroupEngine featureGroupEngine; + private FeatureViewEngine featureViewEngine; + + public FeatureStore() { + storageConnectorApi = new StorageConnectorApi(); + featureGroupEngine = new FeatureGroupEngine(); + } + + @Override + public Object createFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, + boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, String description, List primaryKeys, + List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, String topicName, String notificationTopicName, String eventTime) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + /** + * Get a stream feature group object from the feature store. + * + *

Getting a stream feature group metadata handle enables you to interact with the feature group, + * such as reading the data or using the `Query`-API to perform joins between feature groups and create feature + * views. + * + *

+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices");
+   * }
+   * 
+ * + * @param name the name of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public Object getStreamFeatureGroup(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature group `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getStreamFeatureGroup(name, DEFAULT_VERSION); + } + + /** + * Get a stream feature group object from the feature store. + * + *

Getting a stream feature group metadata handle enables you to interact with the feature group, + * such as reading the data or using the `Query`-API to perform joins between feature groups and create feature + * views. + * + *

+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices", 1);
+   * }
+   * 
+ * + * @param name the name of the feature group + * @param version the version of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public StreamFeatureGroup getStreamFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureGroupEngine.getStreamFeatureGroup(this, name, version); + } + + @Override + public Object createStreamFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, + List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, + StatisticsConfig statisticsConfig, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object createExternalFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object createFeatureView() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name", 1);
+   * }
+   * 
+ * + * @param name Name of the feature view. + * @param version Version to get. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureViewEngine.get(this, name, version); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name");
+   * }
+   * 
+ * + * @param name Name of the feature view. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature view `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getFeatureView(name, DEFAULT_VERSION); + } + + @Override + public Object getOrCreateFeatureView(String name, Query query, Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOrCreateFeatureView(String name, Query query, Integer version, String description, + List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getExternalFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getExternalFeatureGroup(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public StorageConnector getStorageConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getHopsFsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getExternalFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object sql(String name) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getJdbcConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getS3Connector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getRedshiftConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getSnowflakeConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getAdlsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for beam"); + } + + @Override + public Object getKafkaConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getBigqueryConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getOnlineStorageConnector() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object 
getGcsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainingDatasets(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureView.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureView.java new file mode 100644 index 000000000..48c54f127 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureView.java @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.FeatureViewBase; +import com.logicalclocks.hsfs.beam.constructor.Query; +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.text.ParseException; +import java.util.Map; + +public class FeatureView extends FeatureViewBase> { + @Override + public void addTag(String name, Object value) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map getTags() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTag(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteTag(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void addTrainingDatasetTag(Integer version, String name, Object value) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map getTrainingDatasetTags(Integer version) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainingDatasetTag(Integer version, String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteTrainingDatasetTag(Integer version, String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + 
@Override + public void delete() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void clean(FeatureStore featureStore, String featureViewName, Integer featureViewVersion) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public FeatureView update(FeatureView other) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String getBatchQuery() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String getBatchQuery(String startTime, String endTime) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData(String startTime, String endTime) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData(String startTime, String endTime, Map readOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainingData(Integer version, Map readOptions) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainTestSplit(Integer version, Map readOptions) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object getTrainValidationTestSplit(Integer version, Map readOptions) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void purgeTrainingData(Integer version) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void purgeAllTrainingData() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteTrainingDataset(Integer version) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void deleteAllTrainingDatasets() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/HopsworksConnection.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/HopsworksConnection.java new file mode 100644 index 000000000..8b19103f5 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/HopsworksConnection.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HopsworksConnectionBase; +import com.logicalclocks.hsfs.SecretStore; +import com.logicalclocks.hsfs.metadata.Credentials; +import com.logicalclocks.hsfs.metadata.HopsworksClient; +import com.logicalclocks.hsfs.metadata.HopsworksHttpClient; + +import lombok.Builder; + +import software.amazon.awssdk.regions.Region; + +import java.io.IOException; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; + +public class HopsworksConnection extends HopsworksConnectionBase { + + @Builder + public HopsworksConnection(String host, int port, String project, Region region, SecretStore secretStore, + boolean hostnameVerification, String trustStorePath, String certPath, String apiKeyFilePath, + String apiKeyValue) throws IOException, FeatureStoreException, CertificateException, KeyStoreException, + NoSuchAlgorithmException { + this.host = host; + this.port = port; + this.project = getProjectName(project); + this.region = region; + this.secretStore = secretStore; + this.hostnameVerification = hostnameVerification; + this.trustStorePath = trustStorePath; + this.certPath = certPath; + this.apiKeyFilePath = apiKeyFilePath; + this.apiKeyValue = apiKeyValue; + + HopsworksClient.setupHopsworksClient(host, port, region, secretStore, + hostnameVerification, trustStorePath, this.apiKeyFilePath, this.apiKeyValue); + this.projectObj = getProject(); + HopsworksClient.getInstance().setProject(this.projectObj); + Credentials credentials = HopsworksClient.getInstance().getCredentials(); + HopsworksHttpClient hopsworksHttpClient = HopsworksClient.getInstance().getHopsworksHttpClient(); + hopsworksHttpClient.setTrustStorePath(credentials.gettStore()); + hopsworksHttpClient.setKeyStorePath(credentials.getkStore()); + hopsworksHttpClient.setCertKey(credentials.getPassword()); + HopsworksClient.getInstance().setHopsworksHttpClient(hopsworksHttpClient); + } + + /** + * Retrieve the project feature store. + * + * @return FeatureStore object. + * @throws IOException Generic IO exception. + * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore() throws IOException, FeatureStoreException { + return getFeatureStore(rewriteFeatureStoreName(project)); + } + + /** + * Retrieve a feature store based on name. The feature store needs to be shared with + * the connection's project. The name is the project name of the feature store. + * + * @param name the name of the feature store to get the handle for + * @return FeatureStore object. + * @throws IOException Generic IO exception. 
+ * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore(String name) throws IOException, FeatureStoreException { + return featureStoreApi.get(rewriteFeatureStoreName(name), FeatureStore.class); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java new file mode 100644 index 000000000..312890dff --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam; + +import com.logicalclocks.hsfs.Feature; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HudiOperationType; +import com.logicalclocks.hsfs.JobConfiguration; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.beam.engine.BeamProducer; +import com.logicalclocks.hsfs.constructor.QueryBase; +import com.logicalclocks.hsfs.metadata.Statistics; +import lombok.Builder; +import lombok.NonNull; +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class StreamFeatureGroup extends FeatureGroupBase> { + + + protected FeatureGroupEngine featureGroupEngine = new FeatureGroupEngine(); + + @Builder + public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, + List primaryKeys, List partitionKeys, String hudiPrecombineKey, + boolean onlineEnabled, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String eventTime) { + this(); + this.featureStore = featureStore; + this.name = name; + this.version = version; + this.description = description; + this.primaryKeys = primaryKeys != null + ? primaryKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.partitionKeys = partitionKeys != null + ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; + this.onlineEnabled = onlineEnabled; + this.features = features; + this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); + this.onlineTopicName = onlineTopicName; + this.eventTime = eventTime; + } + + public StreamFeatureGroup() { + this.type = "streamFeatureGroupDTO"; + } + + // used for updates + public StreamFeatureGroup(Integer id, String description, List features) { + this(); + this.id = id; + this.description = description; + this.features = features; + } + + public StreamFeatureGroup(FeatureStore featureStore, int id) { + this(); + this.featureStore = featureStore; + this.id = id; + } + + @Override + public PCollection read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(boolean online, Map readOptions) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection read(String wallclockTime, Map readOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase asOf(String wallclockTime) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase asOf(String wallclockTime, String excludeUntil) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(int numRows, boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData) throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Storage storage) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Storage storage, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, boolean online, Map writeOptions) + throws FeatureStoreException, IOException, 
ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, HudiOperationType hudiOperationType) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, Storage storage, boolean online, + HudiOperationType hudiOperationType, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, JobConfiguration jobConfiguration) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void insert(PCollection featureData, boolean online, Map writeOptions, + JobConfiguration jobConfiguration) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void commitDeleteRecord(PCollection featureData) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void commitDeleteRecord(PCollection featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails() throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails(Integer integer) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails(String limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Map> commitDetails(String wallclockTime, Integer limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase select(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectAll() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectExceptFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public QueryBase selectExcept(List features) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + /** + * Ingest a feature data to the online feature store using Beam Pipeline object. Currently, + * only org.apache.beam.sdk.values.Row types as feature data type are supported. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *
+   *        // get feature group handle
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("taxi_ride", 1);
+   *
+   *        // create Beam pipeline
+   *        Pipeline pipeline = Pipeline.create();
+   *        pipeline
+   *         .apply("read stream from the source", PubsubIO.readStrings().fromTopic(options.getInputTopic()))
+   *         .apply("Parse JSON to Beam Rows", JsonToRow.withSchema(schema))
+   *         .apply("insert streaming feature data", fg.insertStream());
+   * }
+   * 
+ * + * @return BeamProducer object, that can be wrapped inside Beam Pipeline `apply` method. + */ + public BeamProducer insertStream() throws Exception { + return featureGroupEngine.insertStream(this, null); + } + + public BeamProducer insertStream(Map writeOptions) throws Exception { + return featureGroupEngine.insertStream(this, writeOptions); + } + + @Override + public Object insertStream(PCollection featureData) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, Map writeOptions) throws Exception { + return null; + } + + @Override + public Object insertStream(PCollection featureData, String queryName, Map writeOptions) + throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData,String queryName, String outputMode, + boolean awaitTermination, Long timeout) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation, Map writeOptions) + throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object insertStream(PCollection featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation, Map writeOptions, + JobConfiguration jobConfiguration) throws Exception { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void updateFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void updateFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void appendFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void appendFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for 
Beam"); + } + + @Override + public Statistics computeStatistics() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Statistics computeStatistics(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Statistics getStatistics() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/constructor/Query.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/constructor/Query.java new file mode 100644 index 000000000..e3abe7df7 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/constructor/Query.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam.constructor; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.constructor.QueryBase; + +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.util.Map; + +public class Query extends QueryBase> { + @Override + public String sql() { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String sql(Storage storage) { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public Object read(boolean online, Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public void show(boolean online, int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamEngine.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamEngine.java new file mode 100644 index 000000000..13ff573a1 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamEngine.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.google.common.base.Strings; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.metadata.DatasetApi; +import com.logicalclocks.hsfs.engine.EngineBase; +import com.logicalclocks.hsfs.engine.FeatureGroupUtils; +import com.logicalclocks.hsfs.metadata.HopsworksInternalClient; +import org.apache.avro.Schema; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class BeamEngine extends EngineBase { + private static BeamEngine INSTANCE = null; + private FeatureGroupUtils featureGroupUtils = new FeatureGroupUtils(); + + public static synchronized BeamEngine getInstance() throws FeatureStoreException { + if (INSTANCE == null) { + INSTANCE = new BeamEngine(); + } + return INSTANCE; + } + + private BeamEngine() throws FeatureStoreException { + } + + public BeamProducer insertStream(StreamFeatureGroup streamFeatureGroup, Map writeOptions) + throws FeatureStoreException, IOException { + Map complexFeatureSchemas = new HashMap<>(); + for (String featureName: streamFeatureGroup.getComplexFeatures()) { + complexFeatureSchemas.put(featureName, + new Schema.Parser().parse(streamFeatureGroup.getFeatureAvroSchema(featureName))); + } + Schema deserializedEncodedSchema = new Schema.Parser().parse(streamFeatureGroup.getEncodedAvroSchema()); + + return new BeamProducer(streamFeatureGroup.getOnlineTopicName(), + getKafkaConfig(streamFeatureGroup, writeOptions), + streamFeatureGroup.getDeserializedAvroSchema(), deserializedEncodedSchema, complexFeatureSchemas, + streamFeatureGroup.getPrimaryKeys(), streamFeatureGroup); + } + + @Override + public String addFile(String filePath) throws IOException, FeatureStoreException { + if (Strings.isNullOrEmpty(filePath)) { + return filePath; + } + // this is used for unit testing + if (!filePath.startsWith("file://")) { + filePath = "hdfs://" + filePath; + } + String targetPath = System.getProperty("java.io.tmpdir") + filePath.substring(filePath.lastIndexOf("/")); + try (FileOutputStream outputStream = new FileOutputStream(targetPath)) { + outputStream.write(DatasetApi.readContent(filePath, featureGroupUtils.getDatasetType(filePath))); + } + return targetPath; + } + + @Override + public Map getKafkaConfig(FeatureGroupBase featureGroup, Map writeOptions) + throws FeatureStoreException, IOException { + boolean external = !(System.getProperties().containsKey(HopsworksInternalClient.REST_ENDPOINT_SYS) + || (writeOptions != null + && Boolean.parseBoolean(writeOptions.getOrDefault("internal_kafka", "false")))); + + StorageConnector.KafkaConnector storageConnector = + storageConnectorApi.getKafkaStorageConnector(featureGroup.getFeatureStore(), external); + storageConnector.setSslTruststoreLocation(addFile(storageConnector.getSslTruststoreLocation())); + storageConnector.setSslKeystoreLocation(addFile(storageConnector.getSslKeystoreLocation())); 
+ + Map config = storageConnector.kafkaOptions(); + + if (writeOptions != null) { + config.putAll(writeOptions); + } + return config; + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamKafkaProducer.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamKafkaProducer.java new file mode 100644 index 000000000..c1bbd2748 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamKafkaProducer.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import lombok.Setter; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.producer.Callback; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.clients.producer.RecordMetadata; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Future; + +public class BeamKafkaProducer extends KafkaProducer { + @Setter + private Map headerMap = new HashMap<>(); + + public BeamKafkaProducer(Map configs) { + super(configs); + } + + public Future send(ProducerRecord record) { + addHeaders(record); + return super.send(record); + } + + public Future send(ProducerRecord record, Callback callback) { + addHeaders(record); + return super.send(record, callback); + } + + private void addHeaders(ProducerRecord record) { + for (Map.Entry entry: headerMap.entrySet()) { + record.headers().add(entry.getKey(), entry.getValue()); + } + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamProducer.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamProducer.java new file mode 100644 index 000000000..e2b13e074 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/BeamProducer.java @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import lombok.NonNull; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.EncoderFactory; + +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; +import org.apache.beam.sdk.extensions.avro.schemas.utils.AvroUtils; +import org.apache.beam.sdk.io.kafka.KafkaIO; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.apache.beam.sdk.values.Row; + +import org.apache.kafka.common.config.SslConfigs; +import org.apache.kafka.common.serialization.StringSerializer; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class BeamProducer extends PTransform<@NonNull PCollection, @NonNull PDone> { + private String topic; + private Map properties; + private transient Schema schema; + private transient Schema encodedSchema; + private Map deserializedComplexFeatureSchemas; + private List primaryKeys; + private final Map headerMap = new HashMap<>(); + + public BeamProducer(String topic, Map properties, Schema schema, Schema encodedSchema, + Map deserializedComplexFeatureSchemas, List primaryKeys, + StreamFeatureGroup streamFeatureGroup) throws FeatureStoreException, IOException { + this.schema = schema; + this.encodedSchema = encodedSchema; + this.topic = topic; + this.properties = properties; + this.deserializedComplexFeatureSchemas = deserializedComplexFeatureSchemas; + this.primaryKeys = primaryKeys; + + headerMap.put("projectId", + String.valueOf(streamFeatureGroup.getFeatureStore().getProjectId()).getBytes(StandardCharsets.UTF_8)); + headerMap.put("featureGroupId", String.valueOf(streamFeatureGroup.getId()).getBytes(StandardCharsets.UTF_8)); + headerMap.put("subjectId", + String.valueOf(streamFeatureGroup.getSubject().getId()).getBytes(StandardCharsets.UTF_8)); + } + + @Override + public PDone expand(PCollection input) { + + PCollection featureGroupAvroRecord = input + .apply("Convert to avro generic record", ParDo.of(new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) { + GenericRecord genericRecord = AvroUtils.toGenericRecord(c.element(), schema); + c.output(genericRecord); + } + })).setCoder(AvroCoder.of(GenericRecord.class, schema)); + + if (!deserializedComplexFeatureSchemas.keySet().isEmpty()) { + featureGroupAvroRecord = featureGroupAvroRecord + .apply("Serialize complex features", ParDo.of(new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) throws IOException { + GenericRecord encodedRecord = new GenericData.Record(encodedSchema); + for (Schema.Field field: c.element().getSchema().getFields()) { + if (deserializedComplexFeatureSchemas.containsKey(field.name())) { + 
GenericDatumWriter complexFeatureDatumWriter = + new GenericDatumWriter<>(deserializedComplexFeatureSchemas.get(field.name())); + ByteArrayOutputStream complexFeatureByteArrayOutputStream = new ByteArrayOutputStream(); + complexFeatureByteArrayOutputStream.reset(); + BinaryEncoder complexFeatureBinaryEncoder = + new EncoderFactory().binaryEncoder(complexFeatureByteArrayOutputStream, null); + complexFeatureDatumWriter.write(field.name(), complexFeatureBinaryEncoder); + complexFeatureBinaryEncoder.flush(); + encodedRecord.put(field.name(), ByteBuffer.wrap(complexFeatureByteArrayOutputStream.toByteArray())); + } + } + c.output(encodedRecord); + } + })); + } + + return featureGroupAvroRecord.apply("Convert To KV of primaryKey:GenericRecord", + ParDo.of(new DoFn>() { + @ProcessElement + public void processElement(ProcessContext c) { + List primaryKeyValues = new ArrayList<>(); + for (String primaryKey: primaryKeys) { + primaryKeyValues.add(c.element().get(primaryKey).toString()); + } + c.output(KV.of(String.join(";", primaryKeyValues), c.element())); + } + }) + ) + .apply("Sync to online feature group kafka topic", KafkaIO.write() + .withBootstrapServers(properties.get("bootstrap.servers").toString()) + .withTopic(topic) + //.withProducerConfigUpdates(properties) + .withKeySerializer(StringSerializer.class) + .withValueSerializer(GenericAvroSerializer.class) + .withInputTimestamp() + .withProducerFactoryFn(props -> { + // copy jks files from resources to dataflow workers + try { + Path keyStorePath = Paths.get(properties.get(SslConfigs.SSL_KEYSTORE_LOCATION_CONFIG)); + InputStream keyStoreStream = Objects.requireNonNull(BeamProducer.class.getClassLoader() + .getResourceAsStream(keyStorePath.getFileName().toString())); + if (!Files.exists(keyStorePath)) { + Files.copy(keyStoreStream, keyStorePath, StandardCopyOption.REPLACE_EXISTING); + } + Path trustStorePath = Paths.get(properties.get(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG)); + InputStream trustStoreStream = Objects.requireNonNull(BeamProducer.class.getClassLoader() + .getResourceAsStream(trustStorePath.getFileName().toString())); + if (!Files.exists(trustStorePath)) { + Files.copy(trustStoreStream, trustStorePath, StandardCopyOption.REPLACE_EXISTING); + } + } catch (IOException e) { + e.printStackTrace(); + } + props.putAll(properties); + BeamKafkaProducer producer = new BeamKafkaProducer(props); + producer.setHeaderMap(headerMap); + return producer; + }) + ); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureGroupEngine.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureGroupEngine.java new file mode 100644 index 000000000..c56fb2673 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureGroupEngine.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.beam.FeatureStore; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.engine.FeatureGroupEngineBase; +import lombok.SneakyThrows; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class FeatureGroupEngine extends FeatureGroupEngineBase { + + @SneakyThrows + public BeamProducer insertStream(StreamFeatureGroup streamFeatureGroup, Map writeOptions) { + return BeamEngine.getInstance().insertStream(streamFeatureGroup, writeOptions); + } + + public StreamFeatureGroup getStreamFeatureGroup(FeatureStore featureStore, String fgName, Integer fgVersion) + throws IOException, FeatureStoreException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, fgVersion, StreamFeatureGroup[].class); + + // There can be only one single feature group with a specific name and version in a feature store + // There has to be one otherwise an exception would have been thrown. + StreamFeatureGroup resultFg = streamFeatureGroups[0]; + resultFg.setFeatureStore(featureStore); + return resultFg; + } + + public List getStreamFeatureGroups(FeatureStore featureStore, String fgName) + throws FeatureStoreException, IOException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, null, StreamFeatureGroup[].class); + + return Arrays.asList(streamFeatureGroups); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureViewEngine.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureViewEngine.java new file mode 100644 index 000000000..0402d4c9b --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/FeatureViewEngine.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.beam.FeatureStore; +import com.logicalclocks.hsfs.beam.FeatureView; +import com.logicalclocks.hsfs.beam.StreamFeatureGroup; +import com.logicalclocks.hsfs.beam.constructor.Query; +import com.logicalclocks.hsfs.engine.FeatureViewEngineBase; +import org.apache.beam.sdk.values.PCollection; + +import java.io.IOException; +import java.util.Date; +import java.util.List; +import java.util.Map; + +public class FeatureViewEngine extends FeatureViewEngineBase> { + @Override + public FeatureView update(FeatureView featureView) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public FeatureView get(FeatureStore featureStore, String name, Integer version) + throws FeatureStoreException, IOException { + FeatureView featureView = get(featureStore, name, version, FeatureView.class); + featureView.setFeatureStore(featureStore); + return featureView; + } + + @Override + public Query getBatchQuery(FeatureView featureView, Date startTime, Date endTime, Boolean withLabels, + Integer trainingDataVersion) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public String getBatchQueryString(FeatureView featureView, Date startTime, Date endTime, Integer trainingDataVersion) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public FeatureView getOrCreateFeatureView(FeatureStore featureStore, String name, Integer version, Query query, + String description, List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } + + @Override + public PCollection getBatchData(FeatureView featureView, Date startTime, Date endTime, + Map readOptions, Integer trainingDataVersion) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Beam"); + } +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/GenericAvroSerializer.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/GenericAvroSerializer.java new file mode 100644 index 000000000..556426a47 --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/GenericAvroSerializer.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.beam.engine; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.DatumWriter; +import org.apache.avro.io.EncoderFactory; +import org.apache.avro.reflect.ReflectDatumWriter; +import org.apache.kafka.common.serialization.Serializer; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class GenericAvroSerializer implements Serializer { + + @Override + public byte[] serialize(String topic, GenericRecord genericRecord) { + DatumWriter datumWriter = new ReflectDatumWriter<>(genericRecord.getSchema()); + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + byteArrayOutputStream.reset(); + + List records = new ArrayList<>(); + records.add(genericRecord); + + BinaryEncoder binaryEncoder = new EncoderFactory().binaryEncoder(byteArrayOutputStream, null); + for (GenericRecord segment: records) { + try { + datumWriter.write(segment, binaryEncoder); + } catch (IOException e) { + e.printStackTrace(); + } + } + try { + binaryEncoder.flush(); + } catch (IOException e) { + e.printStackTrace(); + } + return byteArrayOutputStream.toByteArray(); + } + +} diff --git a/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/KeySerializer.java b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/KeySerializer.java new file mode 100644 index 000000000..d0af9a7eb --- /dev/null +++ b/hsfs/java/beam/src/main/java/com/logicalclocks/hsfs/beam/engine/KeySerializer.java @@ -0,0 +1,26 @@ +package com.logicalclocks.hsfs.beam.engine; + +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.common.serialization.Serializer; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class KeySerializer implements Serializer { + + List primaryKeys; + + public KeySerializer(List primaryKeys) { + this.primaryKeys = primaryKeys; + } + + @Override + public byte[] serialize(String topic, GenericRecord record) { + List primaryKeyValues = new ArrayList<>(); + for (String primaryKey: this.primaryKeys) { + primaryKeyValues.add(record.get(primaryKey).toString()); + } + return String.join(";", primaryKeyValues).getBytes(StandardCharsets.UTF_8); + } +} diff --git a/hsfs/java/flink/pom.xml b/hsfs/java/flink/pom.xml new file mode 100644 index 000000000..d2d7b87e0 --- /dev/null +++ b/hsfs/java/flink/pom.xml @@ -0,0 +1,92 @@ + + + + hsfs-parent + com.logicalclocks + 4.0.0-SNAPSHOT + + 4.0.0 + + hsfs-flink + + + 1.17.1.0 + 2.13.4.2 + + + + + com.logicalclocks + hsfs + ${project.version} + compile + + + com.fasterxml.jackson.core + * + + + javax.xml.bind + jaxb-api + + + com.databricks + * + + + org.scala-lang + * + + + + + + org.apache.flink + flink-streaming-java + ${flink.version} + provided + + + org.apache.flink + flink-shaded-hadoop2 + + + + + + + org.apache.flink + flink-connector-kafka + ${flink.version} + provided + + + org.apache.flink + flink-shaded-hadoop2 + + + + + + + org.apache.flink + flink-avro + ${flink.version} + + + org.apache.flink + flink-shaded-hadoop2 + + + + + + com.fasterxml.jackson.core + jackson-databind + ${fasterxml.version} + + + + diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java new file mode 100644 index 000000000..3ab0dfe24 --- /dev/null +++ 
b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.flink; + +import com.logicalclocks.hsfs.FeatureStoreBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.TimeTravelFormat; +import com.logicalclocks.hsfs.TrainingDatasetBase; +import com.logicalclocks.hsfs.flink.constructor.Query; +import com.logicalclocks.hsfs.flink.engine.FeatureViewEngine; +import com.logicalclocks.hsfs.metadata.StorageConnectorApi; + +import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; + +import lombok.NonNull; + +import java.io.IOException; +import java.util.List; + +public class FeatureStore extends FeatureStoreBase { + + private FeatureGroupEngine featureGroupEngine; + private FeatureViewEngine featureViewEngine; + + public FeatureStore() { + storageConnectorApi = new StorageConnectorApi(); + featureViewEngine = new FeatureViewEngine(); + featureGroupEngine = new FeatureGroupEngine(); + } + + @Override + public Object createFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + + @Override + public Object getOrCreateFeatureGroup(String name, Integer integer, List primaryKeys, + boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOrCreateFeatureGroup(String name, Integer version, String description, List primaryKeys, + List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, String topicName, String notificationTopicName, String eventTime) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + /** + * Get a stream feature group object from the feature store. + * + *

Getting a stream feature group metadata handle enables you to interact with the feature group, + * for example to read its data or use the `Query` API to perform joins between feature groups and create feature + * views. + * + *

+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices");
+   * }
+   * 
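+   *
+   * If no version is provided, the call defaults to version {@code DEFAULT_VERSION} and logs a
+   * {@code VersionWarning}.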
+ * + * @param name the name of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public StreamFeatureGroup getStreamFeatureGroup(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature group `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getStreamFeatureGroup(name, DEFAULT_VERSION); + } + + /** + * Get a stream feature group object from the feature store. + * + *

Getting a stream feature group metadata handle enables you to interact with the feature group, + * for example to read its data or use the `Query` API to perform joins between feature groups and create feature + * views. + * + *

+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("electricity_prices", 1);
+   * }
+   * 
+ * + * @param name the name of the feature group + * @param version the version of the feature group + * @return StreamFeatureGroup The stream feature group metadata object. + * @throws FeatureStoreException If unable to retrieve feature group from the feature store. + * @throws IOException Generic IO exception. + */ + @Override + public StreamFeatureGroup getStreamFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureGroupEngine.getStreamFeatureGroup(this, name, version); + } + + @Override + public StreamFeatureGroup.StreamFeatureGroupBuilder createStreamFeatureGroup() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + boolean onlineEnabled, String eventTime) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, List primaryKeys, + List partitionKeys, boolean onlineEnabled, + String eventTime) throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, + List primaryKeys, List partitionKeys, + String hudiPrecombineKey, boolean onlineEnabled, + StatisticsConfig statisticsConfig, + String eventTime) + throws IOException, FeatureStoreException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object createExternalFeatureGroup() { + return null; + } + + @Override + public Object createFeatureView() { + return null; + } + + @Override + public StorageConnector getStorageConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getHopsFsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getJdbcConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getOnlineStorageConnector() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getGcsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getS3Connector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getRedshiftConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getSnowflakeConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object 
getAdlsConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getKafkaConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getBigqueryConnector(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getExternalFeatureGroups(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object sql(String query) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public TrainingDatasetBase getTrainingDataset(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainingDatasets(@NonNull String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView getOrCreateFeatureView(String name, Query query, Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView getOrCreateFeatureView(String name, Query query, Integer version, String description, + List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name", 1);
+   * }
+   * 
+ * + * @param name Name of the feature view. + * @param version Version to get. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + return featureViewEngine.get(this, name, version); + } + + /** + * Get a feature view object from the selected feature store. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *        FeatureView fv = fs.getFeatureView("fv_name");
+   * }
+   * 
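+   *
+   * If no version is provided, the call defaults to version {@code DEFAULT_VERSION} and logs a
+   * {@code VersionWarning}.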
+ * + * @param name Name of the feature view. + * @return FeatureView The feature view metadata object. + * @throws FeatureStoreException If unable to retrieve FeatureView from the feature store. + * @throws IOException Generic IO exception. + */ + public FeatureView getFeatureView(String name) throws FeatureStoreException, IOException { + LOGGER.info("VersionWarning: No version provided for getting feature view `" + name + "`, defaulting to `" + + DEFAULT_VERSION + "`."); + return getFeatureView(name, DEFAULT_VERSION); + } + + @Override + public Object getExternalFeatureGroup(@NonNull String name, @NonNull Integer version) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getExternalFeatureGroup(String name) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureView.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureView.java new file mode 100644 index 000000000..e95578acd --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureView.java @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.FeatureViewBase; +import com.logicalclocks.hsfs.flink.constructor.Query; + +import org.apache.flink.streaming.api.datastream.DataStream; + +import lombok.NoArgsConstructor; + +import java.io.IOException; +import java.text.ParseException; + +import java.util.Map; + +@NoArgsConstructor +public class FeatureView extends FeatureViewBase> { + + @Override + public void addTag(String s, Object o) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map getTags() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTag(String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteTag(String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void addTrainingDatasetTag(Integer integer, String s, Object o) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map getTrainingDatasetTags(Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainingDatasetTag(Integer integer, String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteTrainingDatasetTag(Integer integer, String s) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void delete() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void clean(FeatureStore featureStore, String s, Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView update(FeatureView featureView) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String getBatchQuery() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String getBatchQuery(String s, String s1) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData() throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData(String s, String s1) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData(String s, String s1, Map map) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainingData(Integer integer, Map map) + throws IOException, 
FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainTestSplit(Integer integer, Map map) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object getTrainValidationTestSplit(Integer integer, Map map) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void purgeTrainingData(Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void purgeAllTrainingData() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteTrainingDataset(Integer integer) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void deleteAllTrainingDatasets() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/HopsworksConnection.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/HopsworksConnection.java new file mode 100644 index 000000000..3d8d71d0f --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/HopsworksConnection.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HopsworksConnectionBase; +import com.logicalclocks.hsfs.SecretStore; +import com.logicalclocks.hsfs.flink.engine.FlinkEngine; +import com.logicalclocks.hsfs.metadata.HopsworksClient; + +import com.logicalclocks.hsfs.metadata.HopsworksHttpClient; +import com.logicalclocks.hsfs.metadata.HopsworksInternalClient; +import lombok.Builder; + +import software.amazon.awssdk.regions.Region; + +import java.io.IOException; + +public class HopsworksConnection extends HopsworksConnectionBase { + + @Builder + public HopsworksConnection(String host, int port, String project, Region region, SecretStore secretStore, + boolean hostnameVerification, String trustStorePath, + String certPath, String apiKeyFilePath, String apiKeyValue) + throws IOException, FeatureStoreException { + this.host = host; + this.port = port; + this.project = getProjectName(project); + this.region = region; + this.secretStore = secretStore; + this.hostnameVerification = hostnameVerification; + this.trustStorePath = trustStorePath; + this.certPath = certPath; + this.apiKeyFilePath = apiKeyFilePath; + this.apiKeyValue = apiKeyValue; + + HopsworksClient.setupHopsworksClient(host, port, region, secretStore, + hostnameVerification, trustStorePath, this.apiKeyFilePath, this.apiKeyValue); + this.projectObj = getProject(); + HopsworksClient.getInstance().setProject(this.projectObj); + if (!System.getProperties().containsKey(HopsworksInternalClient.REST_ENDPOINT_SYS)) { + HopsworksHttpClient hopsworksHttpClient = HopsworksClient.getInstance().getHopsworksHttpClient(); + hopsworksHttpClient.setTrustStorePath(FlinkEngine.getInstance().getTrustStorePath()); + hopsworksHttpClient.setKeyStorePath(FlinkEngine.getInstance().getKeyStorePath()); + hopsworksHttpClient.setCertKey(HopsworksHttpClient.readCertKey(FlinkEngine.getInstance().getCertKey())); + HopsworksClient.getInstance().setHopsworksHttpClient(hopsworksHttpClient); + } + } + + /** + * Retrieve the project feature store. + * + * @return FeatureStore object. + * @throws IOException Generic IO exception. + * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore() throws IOException, FeatureStoreException { + return getFeatureStore(rewriteFeatureStoreName(project)); + } + + /** + * Retrieve a feature store based on name. The feature store needs to be shared with + * the connection's project. The name is the project name of the feature store. + * + * @param name the name of the feature store to get the handle for + * @return FeatureStore object. + * @throws IOException Generic IO exception. + * @throws FeatureStoreException If client is not connected to Hopsworks + */ + public FeatureStore getFeatureStore(String name) throws IOException, FeatureStoreException { + return featureStoreApi.get(rewriteFeatureStoreName(name), FeatureStore.class); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java new file mode 100644 index 000000000..41b121d8b --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.flink; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.logicalclocks.hsfs.Feature; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.HudiOperationType; +import com.logicalclocks.hsfs.JobConfiguration; +import com.logicalclocks.hsfs.StatisticsConfig; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.constructor.QueryBase; + +import com.logicalclocks.hsfs.metadata.Statistics; + +import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.NonNull; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; + +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +@AllArgsConstructor +@JsonIgnoreProperties(ignoreUnknown = true) +public class StreamFeatureGroup extends FeatureGroupBase> { + + protected FeatureGroupEngine featureGroupEngine = new FeatureGroupEngine(); + + @Builder + public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, + List primaryKeys, List partitionKeys, String hudiPrecombineKey, + boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, + String onlineTopicName, String topicName, String notificationTopicName, String eventTime) { + this(); + this.featureStore = featureStore; + this.name = name; + this.version = version; + this.description = description; + this.primaryKeys = primaryKeys != null + ? primaryKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.partitionKeys = partitionKeys != null + ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; + this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; + this.onlineEnabled = onlineEnabled; + this.features = features; + this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); + this.onlineTopicName = onlineTopicName; + this.topicName = topicName; + this.notificationTopicName = notificationTopicName; + this.eventTime = eventTime; + } + + public StreamFeatureGroup() { + this.type = "streamFeatureGroupDTO"; + } + + // used for updates + public StreamFeatureGroup(Integer id, String description, List features) { + this(); + this.id = id; + this.description = description; + this.features = features; + } + + public StreamFeatureGroup(FeatureStore featureStore, int id) { + this(); + this.featureStore = featureStore; + this.id = id; + } + + @Override + public DataStream read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(boolean online, Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream read(String wallclockTime, Map readOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase asOf(String wallclockTime) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase asOf(String wallclockTime, String excludeUntil) throws FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(int numRows, boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData) throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Storage storage) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Storage storage, boolean overwrite) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream 
featureData, boolean online, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, HudiOperationType hudiOperationType) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, Storage storage, boolean online, HudiOperationType hudiOperationType, + Map writeOptions) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, JobConfiguration jobConfiguration) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void insert(DataStream featureData, boolean online, Map writeOptions, + JobConfiguration jobConfiguration) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void commitDeleteRecord(DataStream featureData) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void commitDeleteRecord(DataStream featureData, Map writeOptions) + throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails() throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails(Integer integer) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails(String limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Map> commitDetails(String wallclockTime, Integer limit) + throws IOException, FeatureStoreException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase select(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectAll() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectExceptFeatures(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public QueryBase selectExcept(List features) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + /** + * Ingest a feature data to the online feature store using Flink DataStream API. Currently, only POJO + * types as feature data type are supported. + * + *
+   * {@code
+   *        // get feature store handle
+   *        FeatureStore fs = HopsworksConnection.builder().build().getFeatureStore();
+   *
+   *        // get feature group handle
+   *        StreamFeatureGroup fg = fs.getStreamFeatureGroup("card_transactions", 1);
+   *
+   *        // read stream from the source and aggregate stream
+   *        DataStream aggregationStream =
+   *          env.fromSource(transactionSource, customWatermark, "Transaction Kafka Source")
+   *          .keyBy(r -> r.getCcNum())
+   *          .window(SlidingEventTimeWindows.of(Time.minutes(windowLength), Time.minutes(1)))
+   *          .aggregate(new TransactionCountAggregate());
+   *
+   *        // insert streaming feature data
+   *        fg.insertStream(aggregationStream);
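+   *
+   *        // submit the Flink job; the Kafka sink created by insertStream only starts
+   *        // producing once the job is executed (illustrative)
+   *        env.execute();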
+   * }
+   * 
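+   *
+   * Under the hood the POJO stream is mapped to Avro {@code GenericRecord}s and produced to the
+   * feature group's online Kafka topic; the returned {@code DataStreamSink} can be used to further
+   * configure the sink.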
+ * + * @param featureData Features in Streaming Dataframe to be saved. + * @return DataStreamSink object. + */ + @Override + public DataStreamSink insertStream(DataStream featureData) throws Exception { + return featureGroupEngine.insertStream(this, featureData, null); + } + + @Override + public DataStreamSink insertStream(DataStream featureData, Map writeOptions) throws Exception { + return featureGroupEngine.insertStream(this, featureData, writeOptions); + } + + @Override + public Object insertStream(DataStream featureData, String queryName) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, Map writeOptions) + throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, + String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData,String queryName, String outputMode, + boolean awaitTermination, Long timeout) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation) + throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, + boolean awaitTermination, Long timeout, String checkpointLocation, + Map writeOptions) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, boolean awaitTermination, + String checkpointLocation) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object insertStream(DataStream featureData, String queryName, String outputMode, boolean awaitTermination, + Long timeout, String checkpointLocation, Map writeOptions, + JobConfiguration jobConfiguration) throws Exception { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void updateFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void updateFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void appendFeatures(List feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void appendFeatures(Feature feature) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Statistics computeStatistics() throws FeatureStoreException, IOException, ParseException { + throw new 
UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Statistics computeStatistics(String wallclockTime) throws FeatureStoreException, IOException, ParseException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Statistics getStatistics() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/FsQuery.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/FsQuery.java new file mode 100644 index 000000000..c9527119d --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/FsQuery.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. + * + */ + +package com.logicalclocks.hsfs.flink.constructor; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.logicalclocks.hsfs.constructor.FsQueryBase; +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import lombok.AllArgsConstructor; + +import java.util.Map; + +@JsonIgnoreProperties(ignoreUnknown = true) +@AllArgsConstructor +public class FsQuery extends FsQueryBase { + @Override + public void registerOnDemandFeatureGroups() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void registerHudiFeatureGroups(Map readOptions) { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/Query.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/Query.java new file mode 100644 index 000000000..4d1c85359 --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/constructor/Query.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.constructor; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.Storage; +import com.logicalclocks.hsfs.constructor.QueryBase; + +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import lombok.NoArgsConstructor; +import org.apache.flink.streaming.api.datastream.DataStream; + +import java.io.IOException; +import java.util.Map; + +@NoArgsConstructor +public class Query extends QueryBase> { + + @Override + public String sql() { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String sql(Storage storage) { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object read() throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object read(boolean online) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public Object read(boolean online, Map readOptions) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(int i) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public void show(boolean online, int numRows) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureGroupEngine.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureGroupEngine.java new file mode 100644 index 000000000..7fe3d0b82 --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureGroupEngine.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.engine.FeatureGroupEngineBase; + +import com.logicalclocks.hsfs.flink.FeatureStore; +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import lombok.SneakyThrows; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class FeatureGroupEngine extends FeatureGroupEngineBase { + + @SneakyThrows + public DataStreamSink insertStream(StreamFeatureGroup streamFeatureGroup, DataStream featureData, + Map writeOptions) { + return FlinkEngine.getInstance().writeDataStream(streamFeatureGroup, featureData, writeOptions); + } + + public StreamFeatureGroup getStreamFeatureGroup(FeatureStore featureStore, String fgName, Integer fgVersion) + throws IOException, FeatureStoreException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, fgVersion, StreamFeatureGroup[].class); + + // There can be only one single feature group with a specific name and version in a feature store + // There has to be one otherwise an exception would have been thrown. + StreamFeatureGroup resultFg = streamFeatureGroups[0]; + resultFg.setFeatureStore(featureStore); + return resultFg; + } + + public List getStreamFeatureGroups(FeatureStore featureStore, String fgName) + throws FeatureStoreException, IOException { + StreamFeatureGroup[] streamFeatureGroups = + featureGroupApi.getInternal(featureStore, fgName, null, StreamFeatureGroup[].class); + + return Arrays.asList(streamFeatureGroups); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureViewEngine.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureViewEngine.java new file mode 100644 index 000000000..6562b4ae4 --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FeatureViewEngine.java @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.engine; + +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.engine.FeatureViewEngineBase; + +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; +import com.logicalclocks.hsfs.flink.constructor.Query; +import com.logicalclocks.hsfs.flink.FeatureView; +import com.logicalclocks.hsfs.flink.FeatureStore; + +import org.apache.flink.streaming.api.datastream.DataStream; + +import java.io.IOException; +import java.util.Date; +import java.util.List; +import java.util.Map; + +public class FeatureViewEngine extends FeatureViewEngineBase> { + + @Override + public FeatureView update(FeatureView featureView) throws FeatureStoreException, IOException { + featureViewApi.update(featureView, FeatureView.class); + return featureView; + } + + @Override + public FeatureView get(FeatureStore featureStore, String name, Integer version) + throws FeatureStoreException, IOException { + FeatureView featureView = get(featureStore, name, version, FeatureView.class); + featureView.setFeatureStore(featureStore); + return featureView; + } + + @Override + public Query getBatchQuery(FeatureView featureView, Date date, Date date1, Boolean withLabels, Integer integer) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public String getBatchQueryString(FeatureView featureView, Date startTime, Date endTime, Integer trainingDataVersion) + throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public FeatureView getOrCreateFeatureView(FeatureStore featureStore, String name, Integer version, Query query, + String description, List labels) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } + + @Override + public DataStream getBatchData(FeatureView featureView, Date startTime, Date endTime, Map readOptions, + Integer trainingDataVersion) throws FeatureStoreException, IOException { + throw new UnsupportedOperationException("Not supported for Flink"); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FlinkEngine.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FlinkEngine.java new file mode 100644 index 000000000..9e0645e96 --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/FlinkEngine.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ * + */ + +package com.logicalclocks.hsfs.flink.engine; + +import com.google.common.base.Strings; +import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.FeatureStoreException; +import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.engine.EngineBase; +import com.logicalclocks.hsfs.flink.StreamFeatureGroup; + +import com.logicalclocks.hsfs.metadata.HopsworksInternalClient; +import lombok.Getter; + +import org.apache.avro.generic.GenericRecord; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.flink.connector.base.DeliveryGuarantee; +import org.apache.flink.connector.kafka.sink.KafkaSink; +import org.apache.flink.core.fs.Path; +import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.util.FileUtils; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +import static org.apache.flink.configuration.ConfigOptions.key; + +public class FlinkEngine extends EngineBase { + private static FlinkEngine INSTANCE = null; + + public static synchronized FlinkEngine getInstance() throws FeatureStoreException { + if (INSTANCE == null) { + INSTANCE = new FlinkEngine(); + } + return INSTANCE; + } + + @Getter + private StreamExecutionEnvironment streamExecutionEnvironment; + + private final Configuration flinkConfig = GlobalConfiguration.loadConfiguration(); + private final ConfigOption keyStorePath = + key("flink.hadoop.hops.ssl.keystore.name") + .stringType() + .defaultValue("trustStore.jks") + .withDescription("path to keyStore.jks"); + private final ConfigOption trustStorePath = + key("flink.hadoop.hops.ssl.truststore.name") + .stringType() + .defaultValue("trustStore.jks") + .withDescription("path to trustStore.jks"); + private final ConfigOption materialPasswdPath = + key("flink.hadoop.hops.ssl.keystores.passwd.name") + .stringType() + .defaultValue("material_passwd") + .withDescription("path to material_passwd"); + + private FlinkEngine() throws FeatureStoreException { + streamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment(); + // Configure the streamExecutionEnvironment + streamExecutionEnvironment.getConfig().enableObjectReuse(); + } + + public DataStreamSink writeDataStream(StreamFeatureGroup streamFeatureGroup, DataStream dataStream, + Map writeOptions) throws FeatureStoreException, IOException { + + DataStream genericDataStream = (DataStream) dataStream; + Properties properties = new Properties(); + properties.putAll(getKafkaConfig(streamFeatureGroup, writeOptions)); + + KafkaSink sink = KafkaSink.builder() + .setBootstrapServers(properties.getProperty("bootstrap.servers")) + .setKafkaProducerConfig(properties) + .setRecordSerializer(new KafkaRecordSerializer(streamFeatureGroup)) + .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE) + .build(); + Map complexFeatureSchemas = new HashMap<>(); + for (String featureName: streamFeatureGroup.getComplexFeatures()) { + complexFeatureSchemas.put(featureName, streamFeatureGroup.getFeatureAvroSchema(featureName)); + } + + DataStream avroRecordDataStream = + genericDataStream.map(new PojoToAvroRecord( + 
streamFeatureGroup.getDeserializedAvroSchema(), + streamFeatureGroup.getDeserializedEncodedAvroSchema(), + complexFeatureSchemas)) + .returns( + new GenericRecordAvroTypeInfo(streamFeatureGroup.getDeserializedEncodedAvroSchema()) + ); + + return avroRecordDataStream.sinkTo(sink); + } + + @Override + public String addFile(String filePath) throws IOException { + if (Strings.isNullOrEmpty(filePath)) { + return filePath; + } + // this is used for unit testing + if (!filePath.startsWith("file://")) { + filePath = "hdfs://" + filePath; + } + String targetPath = FileUtils.getCurrentWorkingDirectory().toString() + + filePath.substring(filePath.lastIndexOf("/")); + FileUtils.copy(new Path(filePath), new Path(targetPath), false); + return targetPath; + } + + @Override + public Map getKafkaConfig(FeatureGroupBase featureGroup, Map writeOptions) + throws FeatureStoreException, IOException { + boolean external = !(System.getProperties().containsKey(HopsworksInternalClient.REST_ENDPOINT_SYS) + || (writeOptions != null + && Boolean.parseBoolean(writeOptions.getOrDefault("internal_kafka", "false")))); + + StorageConnector.KafkaConnector storageConnector = + storageConnectorApi.getKafkaStorageConnector(featureGroup.getFeatureStore(), external); + storageConnector.setSslTruststoreLocation(addFile(storageConnector.getSslTruststoreLocation())); + storageConnector.setSslKeystoreLocation(addFile(storageConnector.getSslKeystoreLocation())); + + Map config = storageConnector.kafkaOptions(); + + if (writeOptions != null) { + config.putAll(writeOptions); + } + config.put("enable.idempotence", "false"); + return config; + } + + public String getTrustStorePath() { + return flinkConfig.getString(trustStorePath); + } + + public String getKeyStorePath() { + return flinkConfig.getString(keyStorePath); + } + + public String getCertKey() { + return flinkConfig.getString(materialPasswdPath); + } +} diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/KafkaRecordSerializer.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/KafkaRecordSerializer.java new file mode 100644 index 000000000..b1729f75d --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/KafkaRecordSerializer.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023. Hopsworks AB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * + * See the License for the specific language governing permissions and limitations under the License. 
+ *
+ */
+
+package com.logicalclocks.hsfs.flink.engine;
+
+import com.logicalclocks.hsfs.FeatureStoreException;
+import com.logicalclocks.hsfs.flink.StreamFeatureGroup;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.reflect.ReflectDatumWriter;
+import org.apache.flink.api.common.serialization.SerializationSchema;
+import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
+import org.apache.kafka.clients.producer.ProducerRecord;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class KafkaRecordSerializer implements KafkaRecordSerializationSchema<GenericRecord> {
+
+  private final String topic;
+  private final List<String> primaryKeys;
+  private final Map<String, byte[]> headerMap = new HashMap<>();
+
+  KafkaRecordSerializer(StreamFeatureGroup streamFeatureGroup) throws FeatureStoreException, IOException {
+    this.topic = streamFeatureGroup.getOnlineTopicName();
+    this.primaryKeys = streamFeatureGroup.getPrimaryKeys();
+
+    headerMap.put("projectId",
+        String.valueOf(streamFeatureGroup.getFeatureStore().getProjectId()).getBytes(StandardCharsets.UTF_8));
+    headerMap.put("featureGroupId", String.valueOf(streamFeatureGroup.getId()).getBytes(StandardCharsets.UTF_8));
+    headerMap.put("subjectId",
+        String.valueOf(streamFeatureGroup.getSubject().getId()).getBytes(StandardCharsets.UTF_8));
+  }
+
+  @Override
+  public void open(SerializationSchema.InitializationContext context,
+                   KafkaRecordSerializationSchema.KafkaSinkContext sinkContext) {
+    // TODO not needed
+  }
+
+  @Override
+  public ProducerRecord<byte[], byte[]> serialize(GenericRecord genericRecord,
+                                                  KafkaRecordSerializationSchema.KafkaSinkContext context,
+                                                  Long timestamp) {
+    byte[] key = this.serializeKey(genericRecord);
+    byte[] value = this.serializeValue(genericRecord);
+    ProducerRecord<byte[], byte[]> producerRecord = new ProducerRecord<>(topic, null, timestamp, key, value);
+    for (Map.Entry<String, byte[]> entry : headerMap.entrySet()) {
+      producerRecord.headers().add(entry.getKey(), entry.getValue());
+    }
+    return producerRecord;
+  }
+
+  public byte[] serializeKey(GenericRecord genericRecord) {
+    List<String> primaryKeyValues = new ArrayList<>();
+    for (String primaryKey : primaryKeys) {
+      primaryKeyValues.add(genericRecord.get(primaryKey).toString());
+    }
+    return String.join(";", primaryKeyValues).getBytes(StandardCharsets.UTF_8);
+  }
+
+  public byte[] serializeValue(GenericRecord genericRecord) {
+    DatumWriter<GenericRecord> datumWriter = new ReflectDatumWriter<>(genericRecord.getSchema());
+    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+    byteArrayOutputStream.reset();
+
+    BinaryEncoder binaryEncoder = new EncoderFactory().binaryEncoder(byteArrayOutputStream, null);
+    try {
+      datumWriter.write(genericRecord, binaryEncoder);
+      binaryEncoder.flush();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    return byteArrayOutputStream.toByteArray();
+  }
+}
\ No newline at end of file
diff --git a/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/PojoToAvroRecord.java b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/PojoToAvroRecord.java new file mode 100644 index 000000000..d2a37c26e --- /dev/null +++ b/hsfs/java/flink/src/main/java/com/logicalclocks/hsfs/flink/engine/PojoToAvroRecord.java @@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2023. Hopsworks AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and limitations under the License.
+ *
+ */
+
+package com.logicalclocks.hsfs.flink.engine;
+
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaValidationException;
+import org.apache.avro.SchemaValidatorBuilder;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.flink.api.common.functions.RichMapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class PojoToAvroRecord<T> extends RichMapFunction<T, GenericRecord> implements
+    ResultTypeQueryable<GenericRecord> {
+
+  private final String schema;
+  private final String encodedSchema;
+  private final Map<String, String> complexFeatureSchemas;
+
+  // org.apache.avro.Schema$Field is not serializable. Create in open() and reuse later on
+  private transient Schema deserializedSchema;
+  private transient Schema deserializedEncodedSchema;
+  private transient Map<String, Schema> deserializedComplexFeatureSchemas;
+  private transient GenericRecordAvroTypeInfo producedType;
+
+  public PojoToAvroRecord(Schema schema, Schema encodedSchema, Map<String, String> complexFeatureSchemas) {
+    this.schema = schema.toString();
+    this.encodedSchema = encodedSchema.toString();
+    this.complexFeatureSchemas = complexFeatureSchemas;
+  }
+
+  @Override
+  public GenericRecord map(T input) throws Exception {
+
+    // validate the incoming POJO against the feature group schema
+    validatePojoAgainstSchema(input, this.deserializedSchema);
+
+    // Create a new Avro record based on the given schema
+    GenericRecord record = new GenericData.Record(this.deserializedEncodedSchema);
+    // Get the fields of the POJO class and populate the fields of the Avro record
+    List<Field> fields =
+        Arrays.stream(input.getClass().getDeclaredFields())
+            .filter(f -> f.getName().equals("SCHEMA$"))
+            .collect(Collectors.toList());
+    if (!fields.isEmpty()) {
+      // it means the POJO was generated from an Avro schema
+      Field schemaField = input.getClass().getDeclaredField("SCHEMA$");
+      schemaField.setAccessible(true);
+      Schema fieldSchema = (Schema) schemaField.get(null);
+      for (Schema.Field field : fieldSchema.getFields()) {
+        String fieldName = field.name();
+        Field pojoField = input.getClass().getDeclaredField(fieldName);
+        pojoField.setAccessible(true);
+        Object fieldValue = pojoField.get(input);
+        populateAvroRecord(record, fieldName, fieldValue);
+      }
+    } else {
+      // plain POJO: copy its declared fields directly into the Avro record
+      for (Field field : input.getClass().getDeclaredFields()) {
+        field.setAccessible(true);
+        String fieldName = field.getName();
+        Object fieldValue = field.get(input);
+        populateAvroRecord(record, fieldName, fieldValue);
+      }
+    }
+    return record;
+  }
+
+  @Override
+  public void open(Configuration configuration) throws Exception {
+    super.open(configuration);
+    this.deserializedSchema = new Schema.Parser().parse(this.schema);
+    this.deserializedEncodedSchema = new Schema.Parser().parse(this.encodedSchema);
+    this.deserializedComplexFeatureSchemas = new HashMap<>();
+    for (String featureName : this.complexFeatureSchemas.keySet()) {
+      deserializedComplexFeatureSchemas.put(featureName,
+          new Schema.Parser().parse(this.complexFeatureSchemas.get(featureName)));
+    }
+    this.producedType = new GenericRecordAvroTypeInfo(deserializedEncodedSchema);
+  }
+
+  @Override
+  public TypeInformation<GenericRecord> getProducedType() {
+    return producedType;
+  }
+
+  private void populateAvroRecord(GenericRecord record, String fieldName, Object fieldValue) throws IOException {
+    if (this.deserializedComplexFeatureSchemas.containsKey(fieldName)) {
+      GenericDatumWriter<Object> complexFeatureDatumWriter =
+          new GenericDatumWriter<>(this.deserializedComplexFeatureSchemas.get(fieldName));
+      ByteArrayOutputStream complexFeatureByteArrayOutputStream = new ByteArrayOutputStream();
+      complexFeatureByteArrayOutputStream.reset();
+      BinaryEncoder complexFeatureBinaryEncoder =
+          new EncoderFactory().binaryEncoder(complexFeatureByteArrayOutputStream, null);
+      complexFeatureDatumWriter.write(fieldValue, complexFeatureBinaryEncoder);
+      complexFeatureBinaryEncoder.flush();
+      record.put(fieldName, ByteBuffer.wrap(complexFeatureByteArrayOutputStream.toByteArray()));
+      complexFeatureByteArrayOutputStream.flush();
+      complexFeatureByteArrayOutputStream.close();
+    } else {
+      record.put(fieldName, fieldValue);
+    }
+  }
+
+  private void validatePojoAgainstSchema(Object pojo, Schema avroSchema) throws SchemaValidationException {
+    Schema pojoSchema = ReflectData.get().getSchema(pojo.getClass());
+    SchemaValidatorBuilder builder = new SchemaValidatorBuilder();
+    builder.canReadStrategy().validateAll().validate(avroSchema, Collections.singletonList(pojoSchema));
+  }
+}
diff --git a/hsfs/java/hsfs/pom.xml b/hsfs/java/hsfs/pom.xml new file mode 100644 index 000000000..56847be5d --- /dev/null +++ b/hsfs/java/hsfs/pom.xml @@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>hsfs-parent</artifactId>
+        <groupId>com.logicalclocks</groupId>
+        <version>4.0.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>hsfs</artifactId>
+
+    <properties>
+        <!-- assumed: 2.2.11 pins the javax.xml.bind / jaxb-api version referenced below -->
+        <javax.version>2.2.11</javax.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${fasterxml.jackson.databind.version}</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.avro</groupId>
+            <artifactId>avro</artifactId>
+            <version>${avro.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>javax.xml.bind</groupId>
+            <artifactId>jaxb-api</artifactId>
+            <version>${javax.version}</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>software.amazon.awssdk</groupId>
+            <artifactId>ssm</artifactId>
+            <version>${awssdk.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>software.amazon.awssdk</groupId>
+            <artifactId>sts</artifactId>
+            <version>${awssdk.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>software.amazon.awssdk</groupId>
+            <artifactId>secretsmanager</artifactId>
+            <version>${awssdk.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>com.databricks</groupId>
+            <artifactId>dbutils-api_${scala-short.version}</artifactId>
+            <version>${dbutils.version}</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+            <version>${scala.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.11.0</version>
+        </dependency>
+    </dependencies>
+</project>
diff --git a/hsfs/java/hsfs/src/main/java/com/logicalclocks/hsfs/DataFormat.java b/hsfs/java/hsfs/src/main/java/com/logicalclocks/hsfs/DataFormat.java new file mode 100644 index 000000000..89aab31c9 --- /dev/null +++ b/hsfs/java/hsfs/src/main/java/com/logicalclocks/hsfs/DataFormat.java @@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020-2023. Hopsworks AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and limitations under the License.
+ *
+ */
+
+package com.logicalclocks.hsfs;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+public enum DataFormat {
+  @JsonProperty("csv")
+  CSV,
+  @JsonProperty("tsv")
+  TSV,
+  @JsonProperty("parquet")
+  PARQUET,
+  @JsonProperty("avro")
+  AVRO,
+  @JsonProperty("image")
+  IMAGE,
+  @JsonProperty("orc")
+  ORC,
+  @JsonProperty("tfrecords")
+  TFRECORDS,
+  @JsonProperty("tfrecord")
+  TFRECORD
+}
diff --git a/hsfs/java/hsfs/src/main/java/com/logicalclocks/hsfs/DeltaStreamerJobConf.java b/hsfs/java/hsfs/src/main/java/com/logicalclocks/hsfs/DeltaStreamerJobConf.java new file mode 100644 index 000000000..7990f2e42 --- /dev/null +++ b/hsfs/java/hsfs/src/main/java/com/logicalclocks/hsfs/DeltaStreamerJobConf.java @@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023. Hopsworks AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and limitations under the License.
+ *
+ */
+
+package com.logicalclocks.hsfs;
+
+import com.logicalclocks.hsfs.metadata.Option;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+import java.util.List;
+
+@NoArgsConstructor
+@AllArgsConstructor
+public class DeltaStreamerJobConf {
+
+  @Getter
+  @Setter
+  private List