Skip to content

#49 Implement Python native write with PyArrow #51

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dev/dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ setuptools>=18.0
wheel
py4j==0.10.9.7
pyarrow>=5.0.0
polars>=1.31.0
fastavro>=1.9.0
zstandard>=0.23.0
pandas>=1.3.0
Expand Down
34 changes: 0 additions & 34 deletions pypaimon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#################################################################################

from .api import Schema
from .py4j import Catalog
from .py4j import CommitMessage
from .py4j import Predicate
from .py4j import PredicateBuilder
from .py4j import ReadBuilder
from .py4j import RowType
from .py4j import Split
from .py4j import Table
from .py4j import BatchTableCommit
from .py4j import TableRead
from .py4j import TableScan
from .py4j import Plan
from .py4j import BatchTableWrite
from .py4j import BatchWriteBuilder

__all__ = [
'Schema',
'Catalog',
'CommitMessage',
'Predicate',
'PredicateBuilder',
'ReadBuilder',
'RowType',
'Split',
'Table',
'BatchTableCommit',
'TableRead',
'TableScan',
'Plan',
'BatchTableWrite',
'BatchWriteBuilder'
]
5 changes: 4 additions & 1 deletion pypaimon/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
from .table_commit import BatchTableCommit
from .table_write import BatchTableWrite
from .write_builder import BatchWriteBuilder
from .table import Table, Schema
from .schema import Schema
from .table import Table
from .database import Database
from .catalog import Catalog

__all__ = [
Expand All @@ -40,6 +42,7 @@
'BatchWriteBuilder',
'Table',
'Schema',
'Database',
'Catalog',
'Predicate',
'PredicateBuilder'
Expand Down
7 changes: 3 additions & 4 deletions pypaimon/api/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from abc import ABC, abstractmethod
from typing import Optional
from pypaimon.api import Table, Schema
from pypaimon.api import Table, Schema, Database


class Catalog(ABC):
Expand All @@ -27,10 +27,9 @@ class Catalog(ABC):
metadata such as database/table from a paimon catalog.
"""

@staticmethod
@abstractmethod
def create(catalog_options: dict) -> 'Catalog':
"""Create catalog from configuration."""
def get_database(self, name: str) -> 'Database':
"""Get paimon database identified by the given name."""

@abstractmethod
def get_table(self, identifier: str) -> Table:
Expand Down
18 changes: 18 additions & 0 deletions pypaimon/api/catalog_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from pypaimon.api.catalog import Catalog


class CatalogFactory:
    """Factory that instantiates the catalog selected by the given options."""

    @staticmethod
    def create(catalog_options: dict) -> Catalog:
        """Create the catalog whose identifier matches the configured metastore.

        Args:
            catalog_options: Catalog configuration. The value stored under
                ``CatalogOptions.METASTORE`` picks the implementation;
                ``"filesystem"`` is used when that key is absent.

        Returns:
            An instance of the first direct ``AbstractCatalog`` subclass whose
            ``identifier()`` equals the requested one, constructed with
            ``catalog_options``.

        Raises:
            ValueError: If no direct subclass reports the requested identifier.
        """
        # Imports are done at call time. The two concrete catalogs are not
        # referenced by name below (hence the noqa markers); presumably they
        # are imported so that they show up in ``__subclasses__()``.
        from pypaimon.pynative.catalog.catalog_option import CatalogOptions
        from pypaimon.pynative.catalog.abstract_catalog import AbstractCatalog
        from pypaimon.pynative.catalog.filesystem_catalog import FileSystemCatalog  # noqa: F401
        from pypaimon.pynative.catalog.hive_catalog import HiveCatalog  # noqa: F401

        wanted = catalog_options.get(CatalogOptions.METASTORE, "filesystem")
        implementation = next(
            (
                candidate
                for candidate in AbstractCatalog.__subclasses__()
                if candidate.identifier() == wanted
            ),
            None,
        )
        if implementation is None:
            raise ValueError(f"Unknown catalog identifier: {wanted}")
        return implementation(catalog_options)
28 changes: 28 additions & 0 deletions pypaimon/api/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#################################################################################

from typing import Optional


class Database:
    """Structure of a Database.

    Plain holder for the values passed to the constructor; each argument is
    stored unchanged on the attribute of the same name.

    Args:
        name: Name of the database.
        properties: Properties of the database, kept by reference (no copy).
        comment: Optional comment; ``None`` when not supplied.
    """

    def __init__(self, name: str, properties: dict, comment: Optional[str] = None):
        self.name: str = name
        self.properties: dict = properties
        self.comment: Optional[str] = comment
37 changes: 37 additions & 0 deletions pypaimon/api/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#################################################################################

import pyarrow as pa

from typing import Optional, List


class Schema:
    """Schema of a table.

    Plain holder for the values passed to the constructor; each argument is
    stored unchanged on the attribute of the same name, so every optional
    argument that is omitted remains ``None``.

    Args:
        pa_schema: The pyarrow schema.
        partition_keys: Optional list of partition key names.
        primary_keys: Optional list of primary key names.
        options: Optional dict of options.
        comment: Optional comment.
    """

    def __init__(
        self,
        pa_schema: pa.Schema,
        partition_keys: Optional[List[str]] = None,
        primary_keys: Optional[List[str]] = None,
        options: Optional[dict] = None,
        comment: Optional[str] = None,
    ):
        self.pa_schema: pa.Schema = pa_schema
        self.partition_keys: Optional[List[str]] = partition_keys
        self.primary_keys: Optional[List[str]] = primary_keys
        self.options: Optional[dict] = options
        self.comment: Optional[str] = comment
19 changes: 0 additions & 19 deletions pypaimon/api/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,8 @@
# limitations under the License.
#################################################################################

import pyarrow as pa

from abc import ABC, abstractmethod
from pypaimon.api import ReadBuilder, BatchWriteBuilder
from typing import Optional, List


class Table(ABC):
Expand All @@ -33,19 +30,3 @@ def new_read_builder(self) -> ReadBuilder:
@abstractmethod
def new_batch_write_builder(self) -> BatchWriteBuilder:
"""Returns a builder for building batch table write and table commit."""


class Schema:
    """Schema of a table.

    Plain holder for the values passed to the constructor; each argument is
    stored unchanged on the attribute of the same name, so every optional
    argument that is omitted remains ``None``.

    Args:
        pa_schema: The pyarrow schema.
        partition_keys: Optional list of partition key names.
        primary_keys: Optional list of primary key names.
        options: Optional dict of options.
        comment: Optional comment.
    """

    def __init__(
        self,
        pa_schema: pa.Schema,
        partition_keys: Optional[List[str]] = None,
        primary_keys: Optional[List[str]] = None,
        options: Optional[dict] = None,
        comment: Optional[str] = None,
    ):
        self.pa_schema: pa.Schema = pa_schema
        self.partition_keys: Optional[List[str]] = partition_keys
        self.primary_keys: Optional[List[str]] = primary_keys
        self.options: Optional[dict] = options
        self.comment: Optional[str] = comment
6 changes: 5 additions & 1 deletion pypaimon/api/table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from abc import ABC, abstractmethod
from pypaimon.api import Split
from typing import List, Optional, TYPE_CHECKING
from typing import List, Optional, TYPE_CHECKING, Iterator

if TYPE_CHECKING:
import ray
Expand All @@ -31,6 +31,10 @@
class TableRead(ABC):
"""To read data from data splits."""

@abstractmethod
def to_iterator(self, splits: List[Split]) -> Iterator[tuple]:
"""Read data from splits and return it as an iterator of tuples."""

@abstractmethod
def to_arrow(self, splits: List[Split]) -> pa.Table:
"""Read data from splits and convert it to pyarrow.Table format."""
Expand Down
4 changes: 2 additions & 2 deletions pypaimon/api/table_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ class BatchTableWrite(ABC):
"""A table write for batch processing. Recommended for one-time committing."""

@abstractmethod
def write_arrow(self, table: pa.Table):
def write_arrow(self, table: pa.Table, row_kind: List[int] = None):
""" Write an arrow table to the writer."""

@abstractmethod
def write_arrow_batch(self, record_batch: pa.RecordBatch):
def write_arrow_batch(self, record_batch: pa.RecordBatch, row_kind: List[int] = None):
""" Write an arrow record batch to the writer."""

@abstractmethod
Expand Down
3 changes: 1 addition & 2 deletions pypaimon/py4j/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .java_implementation import \
(Catalog, Table, ReadBuilder, TableScan, Plan, RowType, Split,
TableRead, BatchWriteBuilder, BatchTableWrite, CommitMessage,
BatchTableCommit, Predicate, PredicateBuilder)
BatchTableCommit, PredicateBuilder)

__all__ = [
'constants',
Expand All @@ -36,6 +36,5 @@
'BatchTableWrite',
'CommitMessage',
'BatchTableCommit',
'Predicate',
'PredicateBuilder'
]
1 change: 0 additions & 1 deletion pypaimon/py4j/java_gateway.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
def get_gateway():
# type: () -> JavaGateway
global _gateway
global _lock
with _lock:
if _gateway is None:
# Set the level to WARN to mute the noisy INFO level logs
Expand Down
Loading