Skip to content

Commit

Permalink
Merge branch 'main' into feature/fast-hierarchical
Browse files Browse the repository at this point in the history
  • Loading branch information
wpfl-dbt committed Dec 12, 2024
2 parents bb8c1ce + 2eac995 commit 5de8fb3
Show file tree
Hide file tree
Showing 33 changed files with 644 additions and 131 deletions.
18 changes: 4 additions & 14 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,6 @@ on:
branches: [ main ]
workflow_dispatch:

env:
MB__BATCH_SIZE: 10_000
MB__BACKEND_TYPE: postgres
MB__DATASETS_CONFIG: datasets.toml
# PostgreSQL backend settings
MB__POSTGRES__HOST: localhost
MB__POSTGRES__PORT: 5432
MB__POSTGRES__USER: matchbox_user
MB__POSTGRES__PASSWORD: matchbox_password
MB__POSTGRES__DATABASE: matchbox
MB__POSTGRES__DB_SCHEMA: mb


jobs:
run-unit-tests:
name: tests
Expand All @@ -34,8 +21,11 @@ jobs:

- name: Install the project
run: uv sync --all-extras --dev

- name: Copy environment variables
run: cp environments/dev_docker.env .env

- name: Set up PostgreSQL
- name: Run DBs and API
run: |
docker compose up -d --wait
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ dmypy.json
# Cython debug symbols
cython_debug/

# Mac things
.DS_Store

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
Expand Down
11 changes: 0 additions & 11 deletions .gitlab/merge_request_templates/merge_template.md

This file was deleted.

20 changes: 20 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,26 @@ services:
- "5432:5432"
volumes:
- matchbox_data:/var/lib/postgresql/data
api:
build:
context: .
dockerfile: src/matchbox/server/Dockerfile
ports:
- "8000:8000"
depends_on:
- matchbox-postgres

develop:
# https://docs.docker.com/compose/file-watch/#compose-watch-versus-bind-mounts
watch:
# Sync the local server source with its copy under `/code` in the container
- action: sync
path: ./src/matchbox/server
target: /code/src/matchbox/server

# Rebuild the image on changes to the `pyproject.toml`
- action: rebuild
path: ./pyproject.toml

volumes:
warehouse_data:
Expand Down
12 changes: 12 additions & 0 deletions environments/dev_docker.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
MB__BATCH_SIZE=250_000
MB__BACKEND_TYPE=postgres
MB__DATASETS_CONFIG=datasets.toml

MB__POSTGRES__HOST=matchbox-postgres
MB__POSTGRES__PORT=5432
MB__POSTGRES__USER=matchbox_user
MB__POSTGRES__PASSWORD=matchbox_password
MB__POSTGRES__DATABASE=matchbox
MB__POSTGRES__DB_SCHEMA=mb

API__ROOT=http://localhost:8000
12 changes: 12 additions & 0 deletions environments/dev_local.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
MB__BATCH_SIZE=250_000
MB__BACKEND_TYPE=postgres
MB__DATASETS_CONFIG=datasets.toml

MB__POSTGRES__HOST=localhost
MB__POSTGRES__PORT=5432
MB__POSTGRES__USER=matchbox_user
MB__POSTGRES__PASSWORD=matchbox_password
MB__POSTGRES__DATABASE=matchbox
MB__POSTGRES__DB_SCHEMA=mb

API__ROOT=http://localhost:8000
12 changes: 12 additions & 0 deletions environments/sample.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
MB__BATCH_SIZE=
MB__BACKEND_TYPE=
MB__DATASETS_CONFIG=

MB__POSTGRES__HOST=
MB__POSTGRES__PORT=
MB__POSTGRES__USER=
MB__POSTGRES__PASSWORD=
MB__POSTGRES__DATABASE=
MB__POSTGRES__DB_SCHEMA=

API__ROOT=
6 changes: 1 addition & 5 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,4 @@ scan:
# Run Python tests
test:
docker compose up -d --wait
uv run pytest

# Run development version of API
api:
uv run fastapi dev src/matchbox/server/api.py
uv run pytest
6 changes: 2 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies = [
"click>=8.1.7",
"connectorx>=0.3.3",
"duckdb>=1.1.1",
"httpx>=0.28.0",
"matplotlib>=3.9.2",
"pandas>=2.2.3",
"psycopg2>=2.9.10",
Expand Down Expand Up @@ -38,6 +39,7 @@ dev = [
"pytest-env>=1.1.5",
"ruff>=0.6.8",
"docker>=7.1.0",
"vcrpy>=6.0.2",
]
typing = [
"polars>=1.11.0",
Expand Down Expand Up @@ -89,7 +91,3 @@ log_cli = false
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"

[tool.pytest_env]
MB__POSTGRES__SCHEMA = "test"
MB__BATCH_SIZE = "400"
10 changes: 0 additions & 10 deletions sample.env

This file was deleted.

6 changes: 6 additions & 0 deletions src/matchbox/client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from matchbox.client.visualisation import draw_resolution_graph

# `__all__` must list names as strings: `from matchbox.client import *` looks
# each entry up by name and fails on any entry that is not a string.
__all__ = (
    # Visualisation
    "draw_resolution_graph",
)
21 changes: 21 additions & 0 deletions src/matchbox/client/_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from os import getenv

import httpx

from matchbox.common.graph import ResolutionGraph


def url(path: str) -> str:
    """
    Return path prefixed by API root, determined from environment.

    The root is read from the ``API__ROOT`` environment variable on every
    call. Root and path are joined with exactly one ``/``, whether or not the
    root ends with a slash or the path starts with one.

    Args:
        path: Endpoint path, e.g. ``/report/resolutions``.

    Returns:
        The absolute URL of the endpoint.

    Raises:
        RuntimeError: If ``API__ROOT`` is unset or empty.
    """
    api_root = getenv("API__ROOT")
    # An empty value (e.g. an unfilled sample.env) is as unusable as a missing
    # one, and would otherwise silently produce a relative URL.
    if not api_root:
        raise RuntimeError("API__ROOT needs to be defined in the environment")

    return f"{api_root.rstrip('/')}/{path.lstrip('/')}"


def get_resolution_graph() -> "ResolutionGraph":
    """
    Fetch the resolution graph from the API and validate it.

    Returns:
        The graph parsed from the JSON body of ``GET /report/resolutions``.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx or 5xx status.
    """
    res = httpx.get(url("/report/resolutions"))
    # Fail on error responses here, rather than later as a confusing
    # validation error when an error payload is fed to the model.
    res.raise_for_status()
    return ResolutionGraph.model_validate(res.json())
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,28 @@
from matplotlib.figure import Figure
from rustworkx.visualization import mpl_draw

from matchbox.server.base import MatchboxDBAdapter, inject_backend
from matchbox.client._handler import get_resolution_graph


@inject_backend
def draw_model_tree(backend: MatchboxDBAdapter) -> Figure:
def draw_resolution_graph() -> Figure:
"""
Draws the model subgraph.
Draws the resolution graph.
"""
G = backend.get_model_subgraph()
G: rx.PyDiGraph = get_resolution_graph().to_rx()

node_indices = G.node_indices()
datasets = {
G[node_indices[i]]["id"]: i
for i in node_indices
if G[node_indices[i]]["type"] == "dataset"
if G[node_indices[i]]["kind"] == "dataset"
}

colours = []
for i in node_indices:
type = G[node_indices[i]]["type"]
if type == "dataset":
kind = G[node_indices[i]]["kind"]
if kind == "dataset":
colours.append((0, 0, 1, 0.2))
elif type == "model":
elif kind == "model":
colours.append((1, 0, 0, 0.2))

return mpl_draw(
Expand All @@ -37,6 +36,5 @@ def draw_model_tree(backend: MatchboxDBAdapter) -> Figure:
node_color=colours,
with_labels=True,
labels=lambda node: node["name"],
edge_labels=lambda edge: edge["type"],
font_size=8,
)
43 changes: 43 additions & 0 deletions src/matchbox/common/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from enum import StrEnum

import rustworkx as rx
from matchbox.common.hash import hash_to_str
from pydantic import BaseModel


class ResolutionNodeKind(StrEnum):
DATASET = "dataset"
MODEL = "model"
HUMAN = "human"


class ResolutionNode(BaseModel):
    """A node of the resolution graph, identified by its hash."""

    # NOTE(review): raw bytes -- confirm these survive the JSON round trip
    # between the API and the client.
    hash: bytes
    name: str
    kind: ResolutionNodeKind

    def __hash__(self):
        """Hash on the ``hash`` field only, so nodes can be members of a set."""
        identity = self.hash
        return hash(identity)


class ResolutionEdge(BaseModel):
    """A directed edge of the resolution graph, from parent to child."""

    # Both ends are looked up against ``ResolutionNode.hash`` values when the
    # graph is converted with ``ResolutionGraph.to_rx``.
    parent: bytes
    child: bytes

    def __hash__(self):
        """Hash on the ``(parent, child)`` pair, so edges can be members of a set."""
        endpoints = (self.parent, self.child)
        return hash(endpoints)


class ResolutionGraph(BaseModel):
    """A set of resolution nodes and the directed edges between them."""

    nodes: set[ResolutionNode]
    edges: set[ResolutionEdge]

    def to_rx(self) -> rx.PyDiGraph:
        """
        Convert the graph to a rustworkx directed graph.

        Each node carries a dict payload with keys ``id`` (the hash rendered
        by ``hash_to_str``), ``name`` and ``kind``. Each edge carries an
        empty dict.
        """
        graph = rx.PyDiGraph()

        # Map each node's hash to the index rustworkx assigned it, so edges
        # can be wired up by hash afterwards.
        index_by_hash = {
            node.hash: graph.add_node(
                {
                    "id": hash_to_str(node.hash),
                    "name": node.name,
                    "kind": str(node.kind),
                }
            )
            for node in self.nodes
        }

        for edge in self.edges:
            source = index_by_hash[edge.parent]
            target = index_by_hash[edge.child]
            graph.add_edge(source, target, {})

        return graph
5 changes: 5 additions & 0 deletions src/matchbox/common/hash.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import base64
import hashlib
from typing import TYPE_CHECKING, Any, TypeVar
from uuid import UUID
Expand All @@ -17,6 +18,10 @@
HASH_FUNC = hashlib.sha256


def hash_to_str(hash: bytes) -> str:
    """Return the base64 encoding of a hash's raw bytes as text."""
    encoded = base64.b64encode(hash)
    return str(encoded, "utf-8")


def dataset_to_hashlist(dataset: Source, model_hash: bytes) -> list[dict[str, Any]]:
"""Retrieve and hash a dataset from its warehouse, ready to be inserted."""
with Session(dataset.database.engine) as warehouse_session:
Expand Down
3 changes: 0 additions & 3 deletions src/matchbox/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from matchbox.helpers.cleaner import cleaner, cleaners
from matchbox.helpers.comparison import comparison
from matchbox.helpers.selector import selector, selectors
from matchbox.helpers.visualisation import draw_model_tree

__all__ = (
# Cleaners
Expand All @@ -12,6 +11,4 @@
# Selectors
"selector",
"selectors",
# Visualisation
"draw_model_tree",
)
35 changes: 35 additions & 0 deletions src/matchbox/server/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# connectorx won't work on ARM or python-alpine
FROM --platform=amd64 python:3.11

# Take the uv and uvx binaries from the published uv image
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

WORKDIR /code

# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --all-extras --no-install-project --no-dev

# Ship the Docker development settings as /code/.env, followed by the
# lockfile, the project metadata and the package source
COPY ./environments/dev_docker.env /code/.env
COPY ./uv.lock /code/uv.lock
COPY ./pyproject.toml /code/pyproject.toml
COPY ./src/matchbox /code/src/matchbox

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --all-extras --no-dev

# Place executables in the environment at the front of the path
ENV PATH="/code/.venv/bin:$PATH"

# Uses `fastapi dev` to enable hot-reloading when the `watch` sync occurs
# Uses `--host 0.0.0.0` to allow access from outside the container
CMD ["fastapi", "dev", "--host", "0.0.0.0", "src/matchbox/server/api.py"]
9 changes: 6 additions & 3 deletions src/matchbox/server/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fastapi import Depends, FastAPI, HTTPException
from pydantic import BaseModel

from matchbox.common.graph import ResolutionGraph
from matchbox.server.base import BackendManager, MatchboxDBAdapter

dotenv_path = find_dotenv(usecwd=True)
Expand Down Expand Up @@ -154,6 +155,8 @@ async def validate_hashes():
raise HTTPException(status_code=501, detail="Not implemented")


@app.get("/report/models")
async def get_model_subgraph():
raise HTTPException(status_code=501, detail="Not implemented")
@app.get("/report/resolutions")
async def get_resolutions(
    backend: Annotated[MatchboxDBAdapter, Depends(get_backend)],
) -> ResolutionGraph:
    """Return the resolution graph provided by the injected backend."""
    resolution_graph = backend.get_resolution_graph()
    return resolution_graph
4 changes: 2 additions & 2 deletions src/matchbox/server/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from dotenv import find_dotenv, load_dotenv
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from rustworkx import PyDiGraph
from sqlalchemy import Engine

from matchbox.common.db import Source
from matchbox.common.graph import ResolutionGraph

if TYPE_CHECKING:
from pandas import DataFrame as PandasDataFrame
Expand Down Expand Up @@ -261,7 +261,7 @@ def validate_hashes(self, hashes: list[bytes]) -> bool: ...
def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: ...

@abstractmethod
def get_model_subgraph(self) -> PyDiGraph: ...
def get_resolution_graph(self) -> ResolutionGraph: ...

@abstractmethod
def get_model(self, model: str) -> MatchboxModelAdapter: ...
Expand Down
Loading

0 comments on commit 5de8fb3

Please sign in to comment.