Initial commit
dan1elt0m committed Sep 20, 2024
1 parent efa717f commit 927ecba
Showing 10 changed files with 990 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/build-and-publish.yml
@@ -0,0 +1,22 @@
name: release
on:
  release:
    types: [published]
jobs:
  publish:
    environment: release # needed for PyPI OIDC
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    steps:
      - uses: actions/checkout@v4
        name: checkout
      - name: Install uv
        uses: astral-sh/setup-uv@v2
      - name: "Set up Python"
        uses: actions/setup-python@v5
        with:
          python-version-file: "pyproject.toml"
      - name: Install dependencies
        run: uv build # build
      - uses: pypa/gh-action-pypi-publish@release/v1 # publish
70 changes: 70 additions & 0 deletions .github/workflows/codeql-analysis.yml
@@ -0,0 +1,70 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ main ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ main ]
  schedule:
    - cron: '42 15 * * 5'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      actions: read
      contents: read
      security-events: write

    strategy:
      fail-fast: false
      matrix:
        language: [ 'python' ]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
        # Learn more about CodeQL language support at https://git.io/codeql-language-support

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v2
        with:
          languages: ${{ matrix.language }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.
          # queries: ./path/to/local/query, your-org/your-repo/queries@main

      # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
      # If this step fails, then you should remove it and run the build manually (see below)
      - name: Autobuild
        uses: github/codeql-action/autobuild@v2

      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 https://git.io/JvXDl

      # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
      # and modify them (or add more) to build your code if your project
      # uses a compiled language

      #- run: |
      #   make bootstrap
      #   make release

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v2
35 changes: 35 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,35 @@
name: test
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    types: [opened, reopened, synchronize]
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12']
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v2
      - name: Set up Python ${{ matrix.python-version }}
        run: uv python install ${{ matrix.python-version }} && uv python pin ${{ matrix.python-version }}
      - name: Install dependencies
        run: uv venv && uv sync
      - name: Run tests
        run: |
          source .venv/bin/activate
          uv run pytest --junitxml=junit/report.xml --cov=. --cov-report=xml
      - name: Publish Test Report
        uses: mikepenz/action-junit-report@v3
        if: always() # always run even if the previous step fails
        with:
          report_paths: "junit/report.xml"
      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.envrc
.coverage
.idea/
__pycache__/
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.10
35 changes: 35 additions & 0 deletions pyproject.toml
@@ -0,0 +1,35 @@
[project]
name = "pyarrow-unity"
version = "0.1.0"
description = "Convert Pyarrow schema to Unity Catalog schema"
readme = "README.md"

requires-python = ">=3.10"
dependencies = [
    "pyarrow>=17.0.0",
    "unitycatalog>=0.1.1",
]

[tool.uv]
dev-dependencies = [
    "coverage>=7.6.1",
    "pre-commit>=3.8.0",
    "pytest-cov>=5.0.0",
    "pytest>=8.3.3",
    "ruff>=0.6.5",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.sdist]
include = [
    "/src/pyarrow_unity",
]
[tool.ruff]
line-length = 88
lint.select = ["E", "F", "W", "C90"]
lint.ignore = ["E501"]
lint.per-file-ignores = {"__init__.py" = ["F401", "F403"]}
exclude = ["build", "dist", "venv"]
Empty file added src/pyarrow_unity/__init__.py
Empty file.
108 changes: 108 additions & 0 deletions src/pyarrow_unity/model.py
@@ -0,0 +1,108 @@
import json
from typing import Literal

import pyarrow as pa

from unitycatalog.types.table_create_params import Column

UCSupportedTypeLiteral = Literal[
    "BOOLEAN",
    "BYTE",
    "SHORT",
    "INT",
    "LONG",
    "FLOAT",
    "DOUBLE",
    "DATE",
    "TIMESTAMP",
    "TIMESTAMP_NTZ",
    "STRING",
    "BINARY",
    "DECIMAL",
    "INTERVAL",
    "ARRAY",
    "STRUCT",
    "MAP",
    "CHAR",
    "NULL",
    "USER_DEFINED_TYPE",
    "TABLE_TYPE",
]

UCSupportedFormatLiteral = Literal["DELTA", "CSV", "JSON", "AVRO", "PARQUET", "ORC", "TEXT"]


def pyarrow_type_to_supported_uc_json_type(data_type: pa.DataType) -> UCSupportedTypeLiteral:
    """Convert a PyArrow data type to a supported Unitycatalog JSON type."""
    if pa.types.is_boolean(data_type):
        return "BOOLEAN"
    elif pa.types.is_int8(data_type):
        return "BYTE"
    elif pa.types.is_int16(data_type):
        return "SHORT"
    elif pa.types.is_int32(data_type):
        return "INT"
    elif pa.types.is_int64(data_type):
        return "LONG"
    elif pa.types.is_float32(data_type):
        return "FLOAT"
    elif pa.types.is_float64(data_type):
        return "DOUBLE"
    elif pa.types.is_date32(data_type):
        return "DATE"
    elif pa.types.is_timestamp(data_type):
        return "TIMESTAMP"
    elif pa.types.is_string(data_type):
        return "STRING"
    elif pa.types.is_binary(data_type):
        return "BINARY"
    elif pa.types.is_decimal(data_type):
        return "DECIMAL"
    elif pa.types.is_duration(data_type):
        return "INTERVAL"
    elif pa.types.is_list(data_type):
        return "ARRAY"
    elif pa.types.is_struct(data_type):
        return "STRUCT"
    elif pa.types.is_map(data_type):
        return "MAP"
    elif pa.types.is_null(data_type):
        return "NULL"
    else:
        raise NotImplementedError(f"Type {data_type} not supported")


def model_unity_schema(schema: pa.Schema) -> list[Column]:
    """Convert a PyArrow schema to a list of Unitycatalog Column objects."""
    columns = []

    for i, field in enumerate(schema):
        data_type = field.type
        json_type = pyarrow_type_to_supported_uc_json_type(data_type)

        column = Column(
            name=field.name,
            type_name=json_type,
            nullable=field.nullable,
            comment=f"Field {field.name}",  # Generic comment, modify as needed
            position=i,
            type_json=json.dumps(
                {
                    "name": field.name,
                    "type": json_type,
                    "nullable": field.nullable,
                    "metadata": field.metadata or {},
                }
            ),
            type_precision=0,
            type_scale=0,
            type_text=json_type,
        )

        # Adjust type precision and scale for decimal types
        if pa.types.is_decimal(data_type):
            column["type_precision"] = data_type.precision
            column["type_scale"] = data_type.scale

        columns.append(column)

    return columns
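
For context, a minimal usage sketch of model_unity_schema (not part of this commit; the example schema and field names are hypothetical):

    import pyarrow as pa
    from pyarrow_unity.model import model_unity_schema

    # Hypothetical PyArrow schema; any schema built from the supported types above works.
    schema = pa.schema(
        [
            pa.field("id", pa.int64(), nullable=False),
            pa.field("amount", pa.decimal128(10, 2)),
        ]
    )

    # Convert to Unity Catalog column definitions and inspect the mapped types.
    columns = model_unity_schema(schema)
    for col in columns:
        print(col["name"], col["type_name"], col["type_precision"], col["type_scale"])

The returned list can then be passed as the columns of a Unity Catalog table-create request; the tests below exercise the same conversion.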
78 changes: 78 additions & 0 deletions tests/test_model.py
@@ -0,0 +1,78 @@
import pytest
import pyarrow as pa
from pyarrow_unity.model import pyarrow_type_to_supported_uc_json_type, model_unity_schema


def test_pyarrow_type_to_supported_uc_json_type_boolean():
    assert pyarrow_type_to_supported_uc_json_type(pa.bool_()) == "BOOLEAN"

def test_pyarrow_type_to_supported_uc_json_type_int8():
    assert pyarrow_type_to_supported_uc_json_type(pa.int8()) == "BYTE"

def test_pyarrow_type_to_supported_uc_json_type_int16():
    assert pyarrow_type_to_supported_uc_json_type(pa.int16()) == "SHORT"

def test_pyarrow_type_to_supported_uc_json_type_int32():
    assert pyarrow_type_to_supported_uc_json_type(pa.int32()) == "INT"

def test_pyarrow_type_to_supported_uc_json_type_int64():
    assert pyarrow_type_to_supported_uc_json_type(pa.int64()) == "LONG"

def test_pyarrow_type_to_supported_uc_json_type_float32():
    assert pyarrow_type_to_supported_uc_json_type(pa.float32()) == "FLOAT"

def test_pyarrow_type_to_supported_uc_json_type_float64():
    assert pyarrow_type_to_supported_uc_json_type(pa.float64()) == "DOUBLE"

def test_pyarrow_type_to_supported_uc_json_type_date32():
    assert pyarrow_type_to_supported_uc_json_type(pa.date32()) == "DATE"

def test_pyarrow_type_to_supported_uc_json_type_timestamp():
    assert pyarrow_type_to_supported_uc_json_type(pa.timestamp('s')) == "TIMESTAMP"

def test_pyarrow_type_to_supported_uc_json_type_string():
    assert pyarrow_type_to_supported_uc_json_type(pa.string()) == "STRING"

def test_pyarrow_type_to_supported_uc_json_type_binary():
    assert pyarrow_type_to_supported_uc_json_type(pa.binary()) == "BINARY"

def test_pyarrow_type_to_supported_uc_json_type_decimal():
    assert pyarrow_type_to_supported_uc_json_type(pa.decimal128(10, 2)) == "DECIMAL"

def test_pyarrow_type_to_supported_uc_json_type_duration():
    assert pyarrow_type_to_supported_uc_json_type(pa.duration('s')) == "INTERVAL"

def test_pyarrow_type_to_supported_uc_json_type_list():
    assert pyarrow_type_to_supported_uc_json_type(pa.list_(pa.int32())) == "ARRAY"

def test_pyarrow_type_to_supported_uc_json_type_struct():
    assert pyarrow_type_to_supported_uc_json_type(pa.struct([pa.field('f1', pa.int32())])) == "STRUCT"

def test_pyarrow_type_to_supported_uc_json_type_map():
    assert pyarrow_type_to_supported_uc_json_type(pa.map_(pa.string(), pa.int32())) == "MAP"

def test_pyarrow_type_to_supported_uc_json_type_null():
    assert pyarrow_type_to_supported_uc_json_type(pa.null()) == "NULL"

def test_pyarrow_type_to_supported_uc_json_type_not_supported():
    with pytest.raises(NotImplementedError):
        pyarrow_type_to_supported_uc_json_type(pa.time32('s'))

def test_model_unity_schema():
    schema = pa.schema([
        pa.field('col1', pa.int32(), nullable=True),
        pa.field('col2', pa.string(), nullable=False),
        pa.field('col3', pa.decimal128(10, 2), nullable=True)
    ])
    columns = model_unity_schema(schema)
    assert len(columns) == 3
    assert columns[0]['name'] == 'col1'
    assert columns[0]['type_name'] == 'INT'
    assert columns[0]['nullable'] is True
    assert columns[1]['name'] == 'col2'
    assert columns[1]['type_name'] == 'STRING'
    assert columns[1]['nullable'] is False
    assert columns[2]['name'] == 'col3'
    assert columns[2]['type_name'] == 'DECIMAL'
    assert columns[2]['nullable'] is True
    assert columns[2]['type_precision'] == 10
    assert columns[2]['type_scale'] == 2