Add write_parquet to pylibcudf #17263

Open · wants to merge 2 commits into base: branch-24.12
75 changes: 73 additions & 2 deletions python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -1,14 +1,26 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport int64_t
from libc.stdint cimport int64_t, uint8_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from pylibcudf.expressions cimport Expression
from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from pylibcudf.io.types cimport (
    compression_type,
    dictionary_policy,
    statistics_freq,
    SinkInfo,
    SourceInfo,
    TableInputMetadata,
    TableWithMetadata,
)
from pylibcudf.libcudf.io.parquet cimport (
    chunked_parquet_reader as cpp_chunked_parquet_reader,
    parquet_writer_options,
    parquet_writer_options_builder,
)
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.table cimport Table
from pylibcudf.types cimport DataType


@@ -33,3 +45,62 @@ cpdef read_parquet(
    # ReaderColumnSchema reader_column_schema = *,
    # DataType timestamp_type = *
)

cdef class ParquetWriterOptions:
    cdef parquet_writer_options options

    @staticmethod
    cdef ParquetWriterOptionsBuilder builder(SinkInfo sink, Table table)

    cpdef void set_partitions(self, list partitions)

    cpdef void set_column_chunks_file_paths(self, list file_paths)

    cpdef void set_row_group_size_bytes(self, int size_bytes)

    cpdef void set_row_group_size_rows(self, int size_rows)

    cpdef void set_max_page_size_bytes(self, int size_bytes)

    cpdef void set_max_page_size_rows(self, int size_rows)

    cpdef void set_max_dictionary_size(self, int size_rows)

cdef class ParquetWriterOptionsBuilder:
    cdef parquet_writer_options_builder builder

    cpdef ParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata)

    cpdef ParquetWriterOptionsBuilder key_value_metadata(self, list metadata)

    cpdef ParquetWriterOptionsBuilder compression(self, compression_type compression)

    cpdef ParquetWriterOptionsBuilder stats_level(self, statistics_freq sf)

    cpdef ParquetWriterOptionsBuilder int96_timestamps(self, bool enabled)

    cpdef ParquetWriterOptionsBuilder write_v2_headers(self, bool enabled)

    cpdef ParquetWriterOptionsBuilder dictionary_policy(self, dictionary_policy val)

    cpdef ParquetWriterOptionsBuilder utc_timestamps(self, bool enabled)

    cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled)

    cpdef ParquetWriterOptions build(self)


cdef class BufferArrayFromVector:
    cdef Py_ssize_t length
    cdef unique_ptr[vector[uint8_t]] in_vec

    # these two things declare part of the buffer interface
    cdef Py_ssize_t shape[1]
    cdef Py_ssize_t strides[1]

    @staticmethod
    cdef BufferArrayFromVector from_unique_ptr(
        unique_ptr[vector[uint8_t]] in_vec
    )

cpdef BufferArrayFromVector write_parquet(ParquetWriterOptions options)
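
A minimal usage sketch, not part of the PR's diff: a hypothetical Cython helper that exercises the declarations above. It assumes the accompanying parquet.pyx implements these classes as declared, that BufferArrayFromVector's buffer interface (__getbuffer__) is filled in there, and that the caller already holds a pylibcudf Table and a SinkInfo pointing at the output file. The helper name write_table_to_parquet is invented for illustration.

# cython: language_level=3
# Hypothetical helper module; names and flow are illustrative only.
from pylibcudf.io.parquet cimport (
    BufferArrayFromVector,
    ParquetWriterOptions,
    write_parquet,
)
from pylibcudf.io.types cimport SinkInfo
from pylibcudf.table cimport Table


cpdef object write_table_to_parquet(Table table, SinkInfo sink):
    """Write `table` to the Parquet sink and return the file metadata blob."""
    # Assemble writer options through the builder declared in the .pxd.
    cdef ParquetWriterOptions opts = (
        ParquetWriterOptions.builder(sink, table)
        .write_v2_headers(False)
        .utc_timestamps(True)
        .build()
    )
    # write_parquet returns a BufferArrayFromVector wrapping the serialized
    # file metadata; viewing it through the buffer protocol avoids a copy
    # (assumes __getbuffer__ is implemented in the .pyx).
    cdef BufferArrayFromVector metadata = write_parquet(opts)
    return memoryview(metadata)

The builder mirrors libcudf's C++ parquet_writer_options_builder, so each chained call maps onto the corresponding C++ setter before build() materializes the final options object passed to write_parquet.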