Skip to content

Commit

Permalink
feat(pointcloud): Add HDBSCAN to point cloud (#57)
Browse files Browse the repository at this point in the history
* Restore umap_demo deleted

* Add HDBSCAN clustering

* Fix style

* Fix json int bug

* Update src/phoenix/pcloud/pcloud.py

Co-authored-by: Mikyo King <[email protected]>

* Update src/phoenix/pcloud/projectors.py

Co-authored-by: Mikyo King <[email protected]>

* Delete _fit_transform method

* Restore umap_demo deleted

* Add HDBSCAN clustering

* Fix style

* Fix json int bug

* Delete _fit_transform method

* Update src/phoenix/pcloud/pcloud.py

Co-authored-by: Mikyo King <[email protected]>

* Update src/phoenix/pcloud/projectors.py

Co-authored-by: Mikyo King <[email protected]>

* Fix types

* more fixes

* black

* Add .coverage to gitignore

* Add tables dependency toml file

* wip

* Use pull_request_target, better for forked repos

* Add PR lint action

* Do not use pull_request_target

* Add edited trigger

* Ignore type hints from umap and hdbscan

* Rename module pcloud -> pointcloud

* Ignore E203 flake8

* HDBSCAN finished

* wip

* Define and use Coordinates classes

* Parametrize HDBSCAN hyperparams

* Rename to DriftPointCloud

* Update .github/workflows/pull-requests.yaml

Co-authored-by: Mikyo King <[email protected]>

Co-authored-by: Mikyo King <[email protected]>
  • Loading branch information
Francisco Castillo and mikeldking authored Nov 28, 2022
1 parent 7629de0 commit 9aa2a3f
Show file tree
Hide file tree
Showing 9 changed files with 443 additions and 253 deletions.
17 changes: 17 additions & 0 deletions .github/workflows/pull-requests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
name: Semantic PR

on:
pull_request:
types:
- opened
- edited
- synchronize

jobs:
main:
name: Validate PR title
runs-on: ubuntu-latest
steps:
- uses: amannn/action-semantic-pull-request@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
200 changes: 100 additions & 100 deletions .github/workflows/python-CI.yml
Original file line number Diff line number Diff line change
@@ -1,113 +1,113 @@
name: Python CI

on:
push:
branches: [main]
pull_request:
paths:
- "**.py"
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
push:
branches: [ main ]
pull_request:
paths:
- "**.py"
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

concurrency:
group: test-python-${{ github.head_ref }}
cancel-in-progress: true
group: test-python-${{ github.head_ref }}
cancel-in-progress: true

env:
pip-version: 22.3.1
pip-version: 22.3.1

jobs:
types:
name: Type Check
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Check types
run: |
hatch run types:check
types:
name: Type Check
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Check types
run: |
hatch run type:check
code-format:
name: Code Format
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Check format
run: |
hatch run style:check
code-format:
name: Code Format
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Check format
run: |
hatch run style:check
test-coverage:
name: Test Coverage
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Test coverage
run: |
hatch run coverage
test-coverage:
name: Test Coverage
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Test coverage
run: |
hatch run coverage
docs-coverage:
name: Documentation Coverage
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Check documentation coverage
run: |
hatch run docs:check
docs-coverage:
name: Documentation Coverage
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Check documentation coverage
run: |
hatch run docs:check
test:
name: Test Python
runs-on: ${{ matrix.os }}
needs: [code-format, test-coverage, docs-coverage]
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.10"]
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Run tests
run: |
hatch run tests
test:
name: Test Python
runs-on: ${{ matrix.os }}
needs: [ code-format, test-coverage, docs-coverage ]
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
python-version: [ "3.10" ]
steps:
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip==${{ env.pip-version }}
pip install hatch
- name: Run tests
run: |
hatch run tests
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated dirs
.vscode
.idea
.coverage
node_modules
dist
*__pycache__*
Expand Down
62 changes: 58 additions & 4 deletions examples/umap_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"source": [
"# Phoenix Embeddings\n",
"\n",
"A simple demo of the `UMAPWidget`"
"This short tutorial walks through creating Phoenix `Dataset` objects, projecting their embeddings into a UMAP point cloud, and displaying the result with the `UMAPWidget`."
]
},
{
Expand All @@ -15,7 +15,9 @@
"metadata": {},
"outputs": [],
"source": [
"from phoenix.widgets import UMAPWidget, demo_json"
"from phoenix.datasets import Dataset, EmbeddingColumnNames, Schema\n",
"from phoenix.pointcloud import DriftPointCloud, UMAPProjector\n",
"from phoenix.widgets import UMAPWidget"
]
},
{
Expand All @@ -24,7 +26,59 @@
"metadata": {},
"outputs": [],
"source": [
"widget = UMAPWidget(demo_json())\n",
"test_filename = \"NLP_sentiment_classification_language_drift\"\n",
"\n",
"features = [\n",
" \"reviewer_age\",\n",
" \"reviewer_gender\",\n",
" \"product_category\",\n",
" \"language\",\n",
"]\n",
"\n",
"embedding_features = {\n",
" \"embedding_feature\": EmbeddingColumnNames(\n",
" vector_column_name=\"text_vector\", # Will be name of embedding feature in the app\n",
" data_column_name=\"text\",\n",
" ),\n",
"}\n",
"\n",
"# Define a Schema() object for Arize to pick up data from the correct columns for logging\n",
"schema = Schema(\n",
" prediction_id_column_name=\"prediction_id\",\n",
" timestamp_column_name=\"prediction_ts\",\n",
" prediction_label_column_name=\"pred_label\",\n",
" actual_label_column_name=\"label\",\n",
" feature_column_names=features,\n",
" embedding_feature_column_names=embedding_features,\n",
")\n",
"\n",
"train_ds = Dataset.from_hdf(f\"./fixtures/{test_filename}.hdf5\", schema=schema, key=\"training\")\n",
"prod_ds = Dataset.from_hdf(f\"./fixtures/{test_filename}.hdf5\", schema=schema, key=\"production\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Obtain the point cloud\n",
"UMAP_hyperparameters = {\n",
" \"n_components\": 3,\n",
" \"min_dist\": 0,\n",
"}\n",
"projector = UMAPProjector(hyperparameters=UMAP_hyperparameters)\n",
"primary_pts, reference_pts, clusters = projector.project(prod_ds, train_ds, \"embedding_feature\")\n",
"pc = DriftPointCloud(primary_pts, reference_pts, clusters)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"widget = UMAPWidget(pc.to_json())\n",
"widget.show()"
]
}
Expand All @@ -45,7 +99,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.3"
"version": "3.10.0"
},
"vscode": {
"interpreter": {
Expand Down
19 changes: 14 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ dependencies = [
"numpy",
"pandas",
"umap-learn",
"hdbscan",
"tables",
]
dynamic = ["version"]

Expand Down Expand Up @@ -57,7 +59,7 @@ dependencies = [
"pytest-cov",
]

[tool.hatch.envs.types]
[tool.hatch.envs.type]
dependencies = [
"mypy",
"pytest",
Expand Down Expand Up @@ -110,7 +112,7 @@ exclude_lines = [
"if TYPE_CHECKING:",
]

[tool.hatch.envs.types.scripts]
[tool.hatch.envs.type.scripts]
check = [
"mypy .",
]
Expand All @@ -119,12 +121,12 @@ check = [
check = [
"isort --check-only --diff .",
"black --check --diff --color .",
"flake8 --max-line-length=100 --exclude=__init__.py .",
"flake8 --extend-ignore E203 --max-line-length=100 --exclude=__init__.py .",
]
fix = [
"isort .",
"black .",
"flake8 --max-line-length=100 --exclude=__init__.py .",
"flake8 --extend-ignore E203 --max-line-length=100 --exclude=__init__.py .",
]

[tool.isort]
Expand Down Expand Up @@ -152,4 +154,11 @@ ignore-property-decorators = false
ignore-module = false
ignore-nested-functions = false
ignore-nested-classes = false
ignore-setters = false
ignore-setters = false

[[tool.mypy.overrides]]
module = [
"hdbscan",
"umap",
]
ignore_missing_imports = true
2 changes: 1 addition & 1 deletion src/phoenix/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.1"

3 changes: 2 additions & 1 deletion src/phoenix/pointcloud/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .pointcloud import CalculateUMAP
from .pointcloud import DriftPointCloud
from .projectors import UMAPProjector
Loading

0 comments on commit 9aa2a3f

Please sign in to comment.