diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 0d12accda4e..11578e5f8c7 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -138,6 +138,17 @@ jobs: continue-on-error: true run: archery docker push ${{ matrix.image }} + - name: Type check with mypy, pyright and ty + run: |- + python -m pip install mypy pyright ty griffe libcst pytest hypothesis fsspec scipy-stubs pandas-stubs types-python-dateutil types-psutil types-requests sphinx types-cffi + pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pyarrow + cd python + mypy + pyright + ty check + cd .. + python ./dev/update_stub_docstrings.py -f ./python/pyarrow-stubs + macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 runs-on: macos-${{ matrix.macos-version }} diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py new file mode 100644 index 00000000000..7eb1ee2925d --- /dev/null +++ b/dev/update_stub_docstrings.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Utility to extract docstrings from pyarrow and update +# docstrings in stubfiles. +# +# Usage +# ===== +# +# python ./dev/update_stub_docstrings.py -f ./python/pyarrow-stubs + + +from pathlib import Path +from textwrap import indent + +import click +# TODO: perhaps replace griffe with importlib +import griffe +from griffe import AliasResolutionError +import libcst +from libcst import matchers as m + + +def _get_docstring(name, package, indentation): + # print("extract_docstrings", name) + try: + obj = package.get_member(name) + except (KeyError, ValueError, AliasResolutionError): + # Some cython __init__ symbols can't be found + # e.g. 
pyarrow.lib.OSFile.__init__ + stack = name.split(".") + parent_name = ".".join(stack[:-1]) + + try: + obj = package.get_member(parent_name).all_members[stack[-1]] + except (KeyError, ValueError, AliasResolutionError): + print(f"{name} not found in {package.name}, it's probably ok.") + return None + + if obj.has_docstring: + docstring = obj.docstring.value + # Remove signature if present in docstring + if docstring.startswith(obj.name) or ( + (hasattr(obj.parent, "name") and + docstring.startswith(f"{obj.parent.name}.{obj.name}"))): + docstring = "\n".join(docstring.splitlines()[2:]) + # Skip empty docstrings + if docstring.strip() == "": + return None + # Indent docstring + indentation_prefix = indentation * " " + docstring = indent(docstring + '\n"""', indentation_prefix) + docstring = '"""\n' + docstring + return docstring + return None + + +class ReplaceEllipsis(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.package = package + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + # Insert module level docstring if _clone_signature is used + def leave_Module(self, original_node, updated_node): + new_body = [] + clone_matcher = m.SimpleStatementLine( + body=[m.Assign( + value=m.Call(func=m.Name(value="_clone_signature")) + ), m.ZeroOrMore()] + ) + for statement in updated_node.body: + new_body.append(statement) + if m.matches(statement, clone_matcher): + name = statement.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.package, 0) + if docstring is not None: + new_expr = libcst.Expr(value=libcst.SimpleString(docstring)) + new_line = libcst.SimpleStatementLine(body=[new_expr]) + new_body.append(new_line) + + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." + name + + class_matcher_1 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.SimpleStatementLine( + body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()] + ), m.ZeroOrMore()] + ) + ) + class_matcher_2 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()] + ) + ) + + if m.matches(updated_node, class_matcher_1): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_node = libcst.SimpleString(value=docstring) + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, new_node) + + if m.matches(updated_node, class_matcher_2): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + list(updated_node.body.body) + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." 
+ name + + function_matcher = m.FunctionDef( + name=m.Name(), + body=m.SimpleStatementSuite( + body=[m.Expr( + m.Ellipsis() + )])) + if m.matches(original_node, function_matcher): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + +@click.command() +@click.option('--pyarrow_folder', '-f', type=click.Path(resolve_path=True)) +def add_docs_to_stub_files(pyarrow_folder): + print("Updating docstrings of stub files in:", pyarrow_folder) + package = griffe.load("pyarrow", try_relative_path=True, + force_inspection=True, resolve_aliases=True) + lib_modules = ["array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", + "_types"] + + for stub_file in Path(pyarrow_folder).rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue + module = stub_file.with_suffix('').name + print(f"[{stub_file} {module}]") + + with open(stub_file, 'r') as f: + tree = libcst.parse_module(f.read()) + + if module in lib_modules: + module = "lib" + elif stub_file.parent.name in ["parquet", "interchange"]: + module = f"{stub_file.parent.name}.{module}" + elif module == "__init__": + module = "" + + modified_tree = tree.visit(ReplaceEllipsis(package, module)) + with open(stub_file, "w") as f: + f.write(modified_tree.code) + print("\n") + + +if __name__ == "__main__": + docstrings_map = {} + add_docs_to_stub_files(obj={}) diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi new file mode 100644 index 00000000000..ff0bd7fd5b8 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -0,0 +1,694 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
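Note (illustration only, not part of the patch): the script above resolves runtime docstrings through griffe before splicing them into the stubs. A minimal sketch of that lookup, assuming an importable pyarrow build and using a member name spelled the way the script spells it (`lib.<Class>.<method>`):

# Illustrative only -- mirrors the griffe calls used by dev/update_stub_docstrings.py.
import griffe

# Inspect the installed package so Cython-level docstrings are available,
# and resolve aliases to their targets (same flags the script passes).
package = griffe.load("pyarrow", force_inspection=True, resolve_aliases=True)

obj = package.get_member("lib.Table.to_pandas")
if obj.has_docstring:
    # The script indents this text and splices it into the stub body.
    print(obj.docstring.value.splitlines()[0])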
+ +import pyarrow.lib as _lib + +from pyarrow.lib import ( + BuildInfo, + CppBuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + build_info, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + arange, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + +# Buffers, allocation 
+from pyarrow.lib import ( + DeviceAllocationType, + Device, + MemoryManager, + default_cpu_memory_manager +) + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_batches, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc +import pyarrow.lib as lib +import pyarrow.types as types +import pyarrow.feather as feather +import pyarrow.compute as compute +import pyarrow.csv as csv +import pyarrow.json as json +import pyarrow.dataset as dataset + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + + +__version__: str +_gc_enabled: bool + + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... + + +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... 
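Note (illustration only, not part of the patch): the re-exports above exist so that type checkers can resolve top-level `pyarrow` names without reaching into the Cython modules. A small consumer snippet of the kind these stubs are meant to type:

# Illustrative only: ordinary pyarrow usage that __init__.pyi now covers.
import pyarrow as pa

schema = pa.schema([pa.field("x", pa.int64()), pa.field("y", pa.string())])
table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]}, schema=schema)
arr = pa.array([1.0, None, 3.0], type=pa.float64())
# With the stubs in place, checkers should resolve these attributes statically.
print(table.num_rows, arr.null_count)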
+ + +__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "CppBuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "build_info", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "arange", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + 
"Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_batches", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "lib", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "compute", + "feather", + "csv", + "json", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "dataset", + "get_library_dirs", +] diff --git a/python/pyarrow-stubs/pyarrow/_acero.pyi b/python/pyarrow-stubs/pyarrow/_acero.pyi new file mode 100644 index 00000000000..85ed9683e7e --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_acero.pyi @@ -0,0 +1,163 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys +from collections.abc import Iterable, Collection, Sequence + +if sys.version_info >= (3, 11): + from typing import Self, LiteralString +else: + from typing_extensions import Self, LiteralString +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal + +from . import lib +from .compute import Expression +from .dataset import InMemoryDataset, Dataset +from .table import Aggregation, AggregateOptions + +_StrOrExpr: TypeAlias = str | Expression + +IntoField: TypeAlias = str | int | Expression +Target: TypeAlias = ( + IntoField + | tuple[IntoField, ...] + | list[str] + | list[int] + | list[Expression] + | list[IntoField] +) + +UserDefinedAggregation: TypeAlias = LiteralString +OutputName: TypeAlias = str +AggregationSpec: TypeAlias = tuple[ + Target, Aggregation | UserDefinedAggregation, AggregateOptions | None, OutputName +] + + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: Iterable[Declaration]) -> Self: ... + def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + + +class ExecNodeOptions(lib._Weakrefable): + ... + + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table | lib.RecordBatch | None) -> None: ... + + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression | None) -> None: ... + + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: Collection[Expression], + names: Collection[str] | None = None) -> None: ... + + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: Iterable[ + tuple[ + Target, + Aggregation | UserDefinedAggregation, + AggregateOptions | None, + OutputName, + ] + ], + keys: Iterable[str | Expression] | None = None, + ) -> None: ... + + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: + Iterable[tuple[str | Expression | int, Literal["ascending", "descending"]]] + = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... + + +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | Sequence[_StrOrExpr], + right_keys: _StrOrExpr | Sequence[_StrOrExpr], + left_output: Sequence[_StrOrExpr] | None = None, + right_output: Sequence[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + filter_expression: + lib.BooleanScalar | lib.BooleanArray | Expression | None = None, + ) -> None: ... + + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | Sequence[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | Sequence[_StrOrExpr], + tolerance: int, + ) -> None: ... 
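Note (illustration only, not part of the patch): as a usage reference for the node options declared above, a short source -> filter -> project plan built with the public `pyarrow.acero` API these stubs describe:

# Illustrative only: building and running a Declaration pipeline.
import pyarrow as pa
import pyarrow.acero as acero
import pyarrow.compute as pc

table = pa.table({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
plan = acero.Declaration.from_sequence([
    acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
    acero.Declaration("filter", acero.FilterNodeOptions(pc.field("a") > 2)),
    acero.Declaration("project", acero.ProjectNodeOptions([pc.field("b")], ["b"])),
])
print(plan.to_table())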
+ + +def _perform_join( + join_type: str, + left_operand: lib.Table | Dataset, + left_keys: str | list[str], + right_operand: lib.Table | Dataset, + right_keys: str | list[str], + left_suffix: str, + right_suffix: str, + use_threads: bool, + coalesce_keys: bool, + output_type: type[lib.Table | InMemoryDataset] = lib.Table, + filter_expression: Expression | None = None, +) -> lib.Table | InMemoryDataset: ... + + +def _filter_table( + table: lib.Table | lib.RecordBatch, filter_expression: Expression, + use_threads: bool = True) -> lib.Table | lib.RecordBatch: ... diff --git a/python/pyarrow-stubs/pyarrow/_azurefs.pyi b/python/pyarrow-stubs/pyarrow/_azurefs.pyi new file mode 100644 index 00000000000..5872de03825 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_azurefs.pyi @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Literal + +from ._fs import FileSystem + + +class AzureFileSystem(FileSystem): + def __init__( + self, + account_name: str | None = None, + account_key: str | None = None, + blob_storage_authority: str | None = None, + dfs_storage_authority: str | None = None, + blob_storage_scheme: Literal["http", "https"] = "https", + dfs_storage_scheme: Literal["http", "https"] = "https", + sas_token: str | None = None, + tenant_id: str | None = None, + client_id: str | None = None, + client_secret: str | None = None, + ) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_compute.pyi b/python/pyarrow-stubs/pyarrow/_compute.pyi new file mode 100644 index 00000000000..a32b50067a4 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_compute.pyi @@ -0,0 +1,671 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import types as stdlib_types +from collections.abc import ( + Callable, + Iterable, + Mapping, + Sequence, +) + +from typing import ( + Any, + Literal, + TypeAlias, + TypedDict, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + + +class Kernel(lib._Weakrefable): + ... 
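Note (illustration only, not part of the patch): `_perform_join` and `_filter_table` above are the private helpers behind the public table/dataset join and filter conveniences; a sketch of the public entry point whose typing they support:

# Illustrative only: Table.join delegates to the acero join helper internally.
import pyarrow as pa

left = pa.table({"id": [1, 2, 3], "x": ["a", "b", "c"]})
right = pa.table({"id": [2, 3, 4], "y": [20.0, 30.0, 40.0]})
# join_type matches one of the literals accepted by HashJoinNodeOptions.
print(left.join(right, keys="id", join_type="left outer"))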
+ + +class Function(lib._Weakrefable): + @property + def arity(self) -> int | stdlib_types.EllipsisType: ... + + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: ... + @property + def name(self) -> str: ... + @property + def num_kernels(self) -> int: ... + + @property + def kernels( + self, + ) -> list[ + ScalarKernel | VectorKernel | ScalarAggregateKernel | HashAggregateKernel + ]: ... + + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: ... + + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: ... + def list_functions(self) -> list[str]: ... + + +class HashAggregateFunction(Function): + ... + + +class HashAggregateKernel(Kernel): + ... + + +class ScalarAggregateFunction(Function): + ... + + +class ScalarAggregateKernel(Kernel): + ... + + +class ScalarFunction(Function): + ... + + +class ScalarKernel(Kernel): + ... + + +class VectorFunction(Function): + ... + + +class VectorKernel(Kernel): + ... + +# ==================== _compute.pyx Option classes ==================== + + +class ArraySortOptions(FunctionOptions): + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + + +class AssumeTimezoneOptions(FunctionOptions): + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ... + + +class CastOptions(FunctionOptions): + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + + +class CountOptions(FunctionOptions): + def __init__(self, mode: Literal["only_valid", + "only_null", "all"] = "only_valid") -> None: ... + + +class CumulativeOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + + +class CumulativeSumOptions(FunctionOptions): + def __init__(self, start: lib.Scalar | None = None, + *, skip_nulls: bool = False) -> None: ... + + +class DayOfWeekOptions(FunctionOptions): + def __init__(self, *, count_from_zero: bool = True, + week_start: int = 1) -> None: ... + + +class DictionaryEncodeOptions(FunctionOptions): + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + + +class RunEndEncodeOptions(FunctionOptions): + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType | str = ...) -> None: ... + + +class ElementWiseAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True) -> None: ... 
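Note (illustration only, not part of the patch): a short sketch of how the `Function`/`FunctionOptions` surface typed above is used at runtime:

# Illustrative only: looking up a kernel and passing typed options.
import pyarrow as pa
import pyarrow.compute as pc

add = pc.get_function("add")
print(add.name, add.arity, add.kind)  # "add", 2, "scalar"
summed = add.call([pa.array([1, 2, 3]), pa.array([10, 20, 30])])

# CastOptions.safe/unsafe are the constructors annotated above.
opts = pc.CastOptions.safe(pa.int16())
print(pc.cast(summed, options=opts))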
+ + +class ExtractRegexOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + + +class ExtractRegexSpanOptions(FunctionOptions): + def __init__(self, pattern: str) -> None: ... + + +class FilterOptions(FunctionOptions): + def __init__(self, + null_selection_behavior: Literal["drop", + "emit_null"] = "drop") -> None: ... + + +class IndexOptions(FunctionOptions): + def __init__(self, value: lib.Scalar) -> None: ... + + +class JoinOptions(FunctionOptions): + def __init__( + self, + null_handling: + Literal["emit_null", "skip", "replace"] + = "emit_null", *, null_replacement: str = "") -> None: ... + + +class ListSliceOptions(FunctionOptions): + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + + +class ListFlattenOptions(FunctionOptions): + def __init__(self, recursive: bool = False) -> None: ... + + +class MakeStructOptions(FunctionOptions): + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + + +class MapLookupOptions(FunctionOptions): + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + + +class MatchSubstringOptions(FunctionOptions): + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + + +class ModeOptions(FunctionOptions): + def __init__(self, n: int = 1, *, skip_nulls: bool = True, + min_count: int = 0) -> None: ... + + +class NullOptions(FunctionOptions): + def __init__(self, *, nan_is_null: bool = False) -> None: ... + + +class PadOptions(FunctionOptions): + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + + +class PairwiseOptions(FunctionOptions): + def __init__(self, period: int = 1) -> None: ... + + +class PartitionNthOptions(FunctionOptions): + def __init__(self, pivot: int, *, + null_placement: _Placement = "at_end") -> None: ... + + +class WinsorizeOptions(FunctionOptions): + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + + +class QuantileOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + interpolation: Literal["linear", "lower", + "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + + +class RandomOptions(FunctionOptions): + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + + +class RankOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + + +class RankQuantileOptions(FunctionOptions): + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + + +class PivotWiderOptions(FunctionOptions): + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + + +class ReplaceSliceOptions(FunctionOptions): + def __init__(self, start: int, stop: int, replacement: str) -> None: ... 
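Note (illustration only, not part of the patch): several of the option classes above correspond one-to-one to keyword arguments of `pyarrow.compute` wrappers; a sketch passing the option objects explicitly:

# Illustrative only: QuantileOptions and RankOptions in action.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([5.0, None, 1.0, 3.0])
q = pc.quantile(arr, options=pc.QuantileOptions(q=[0.25, 0.75], skip_nulls=True))
r = pc.rank(arr, options=pc.RankOptions(sort_keys="ascending",
                                        null_placement="at_end",
                                        tiebreaker="dense"))
print(q, r)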
+ + +class ReplaceSubstringOptions(FunctionOptions): + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... + + +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + + +class RoundBinaryOptions(FunctionOptions): + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + + +class RoundOptions(FunctionOptions): + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + + +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +class RoundTemporalOptions(FunctionOptions): + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUint = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... + + +class RoundToMultipleOptions(FunctionOptions): + def __init__(self, multiple: float = 1.0, + round_mode: _RoundMode = "half_to_even") -> None: ... + + +class ScalarAggregateOptions(FunctionOptions): + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... + + +class SelectKOptions(FunctionOptions): + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + + +class SetLookupOptions(FunctionOptions): + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... + + +class SliceOptions(FunctionOptions): + def __init__( + self, start: int, stop: int | None = None, step: int = 1) -> None: ... + + +class SortOptions(FunctionOptions): + def __init__( + self, + sort_keys: Sequence[tuple[str, _Order]], + *, + null_placement: _Placement = "at_end" + ) -> None: ... + + +class SplitOptions(FunctionOptions): + def __init__(self, *, max_splits: int | None = None, + reverse: bool = False) -> None: ... + + +class SplitPatternOptions(FunctionOptions): + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + + +class StrftimeOptions(FunctionOptions): + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C") -> None: ... + + +class StrptimeOptions(FunctionOptions): + def __init__(self, + format: str, + unit: Literal["s", + "ms", + "us", + "ns"], + error_is_null: bool = False) -> None: ... + + +class StructFieldOptions(FunctionOptions): + def __init__(self, indices: list[str] | list[bytes] | + list[int] | Expression | bytes | str | int) -> None: ... + + +class TakeOptions(FunctionOptions): + def __init__(self, boundscheck: bool = True) -> None: ... + + +class TDigestOptions(FunctionOptions): + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + + +class TrimOptions(FunctionOptions): + def __init__(self, characters: str) -> None: ... + + +class Utf8NormalizeOptions(FunctionOptions): + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + + +class VarianceOptions(FunctionOptions): + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, + min_count: int = 0) -> None: ... + + +class SkewOptions(FunctionOptions): + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... 
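Note (illustration only, not part of the patch): the temporal option classes above pair with the corresponding compute kernels; a sketch:

# Illustrative only: StrptimeOptions / RoundTemporalOptions usage.
import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array(["2024-03-01 10:47:00", "2024-03-01 10:53:00"])
parsed = pc.strptime(ts, format="%Y-%m-%d %H:%M:%S", unit="us")
rounded = pc.round_temporal(
    parsed, options=pc.RoundTemporalOptions(multiple=15, unit="minute"))
print(rounded)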
+ + +class WeekOptions(FunctionOptions): + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + + +class ZeroFillOptions(FunctionOptions): + def __init__(self, width: int, padding: str = "0") -> None: ... + +# ==================== _compute.pyx Functions ==================== + + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: ... +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: ... +def list_functions() -> list[str]: ... + +# ==================== _compute.pyx Udf ==================== + + +def call_tabular_function( + function_name: str, + args: Iterable | None = None, + func_registry: FunctionRegistry | None = None) -> lib.RecordBatchReader: ... + + +class _FunctionDoc(TypedDict): + summary: str + description: str + + +def register_scalar_function( + func: Callable | None, + function_name: str | None, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType] | None, + out_type: lib.DataType | None, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc | dict[str, str], + in_types: Mapping[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: ... + + +class UdfContext: + @property + def batch_length(self) -> int: ... + @property + def memory_pool(self) -> lib.MemoryPool: ... + + +def _get_udf_context(memory_pool: lib.MemoryPool, batch_length: int) -> UdfContext: ... + +# ==================== _compute.pyx Expression ==================== + + +class Expression(lib._Weakrefable): + @staticmethod + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: ... + + def to_substrait(self, schema: lib.Schema, + allow_arrow_extensions: bool = False) -> lib.Buffer: ... + + @staticmethod + def _call( + func_name: str, args: list, options: FunctionOptions | None = None + ) -> Expression: ... + + @staticmethod + def _field(name_or_index: str | int) -> Expression: ... + + @staticmethod + def _nested_field(name: str) -> Expression: ... + + @staticmethod + def _scalar(value: Any) -> Expression: ... + + def __invert__(self) -> Expression: ... + + def __and__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rand__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __or__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __ror__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __add__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __radd__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... 
+ + def __mul__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rmul__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __sub__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rsub__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... + def __lt__(self, value: object) -> Expression: ... + def __ge__(self, value: object) -> Expression: ... + def __le__(self, value: object) -> Expression: ... + + def __truediv__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def __rtruediv__( + self, other: Expression | lib.Scalar | lib.Array | object) -> Expression: ... + + def is_valid(self) -> Expression: ... + def is_null(self, nan_is_null: bool = False) -> Expression: ... + def is_nan(self) -> Expression: ... + + def cast( + self, + type: lib.DataType | str, safe: bool = True, options: CastOptions | None = None + ) -> Expression: ... + + def isin(self, values: lib.Array | Iterable | Any) -> Expression: ... + def equals(self, other: object) -> bool: ... + + # Attributes and methods for materialized expressions (used in tests) + @property + def type(self) -> lib.DataType: ... + def to_pylist(self) -> list: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> Any: ... + def to_pandas(self, **kwargs) -> Any: ... + def as_py(self) -> Any: ... + def tolist(self) -> list: ... + def slice(self, offset: int = 0, length: int | None = None) -> Expression: ... + +# ==================== _compute.py ==================== diff --git a/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi b/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi new file mode 100644 index 00000000000..514a4e4269c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_compute_docstring.pyi @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +function_doc_additions: dict[str, str] diff --git a/python/pyarrow-stubs/pyarrow/_csv.pyi b/python/pyarrow-stubs/pyarrow/_csv.pyi new file mode 100644 index 00000000000..6c911a8b0c1 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_csv.pyi @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from . import lib + + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + use_threads: bool = field(default=True, kw_only=False) # noqa: Y015 + block_size: int | float | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: Sequence[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + delimiter: str = field(default=",", kw_only=False) # noqa: Y015 + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], str] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + check_utf8: bool = field(default=True, kw_only=False) # noqa: Y015 + column_types: lib.Schema | dict | Sequence[tuple[str, lib.DataType]] | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: Sequence[str | lib._Weakrefable] | None = None + + def validate(self) -> None: ... + + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + include_header: bool = field(default=True, kw_only=False) # noqa: Y015 + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + quoting_header: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... + + +@dataclass +class InvalidRow(lib._Weakrefable): + expected_columns: int + actual_columns: int + number: int | None + text: str + + +class CSVWriter(lib._CRecordBatchWriter): + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class CSVStreamingReader(lib.RecordBatchReader): + ... + + +ISO8601: lib._Weakrefable + + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: ... + + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: ... 
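Note (illustration only, not part of the patch): a reading example exercising the option dataclasses and `read_csv` declared above:

# Illustrative only: in-memory CSV read with explicit options.
import io

import pyarrow as pa
import pyarrow.csv as csv

buf = io.BytesIO(b"x;y\n1;a\n2;b\n")
table = csv.read_csv(
    buf,
    parse_options=csv.ParseOptions(delimiter=";"),
    convert_options=csv.ConvertOptions(column_types={"x": pa.int32()}),
)
print(table.schema)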
+ + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_cuda.pyi b/python/pyarrow-stubs/pyarrow/_cuda.pyi new file mode 100644 index 00000000000..d484fc5cf5f --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_cuda.pyi @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-untyped, import-not-found] # noqa: E501 + +from . import lib +from ._stubs_typing import ArrayLike + + +class Context(lib._Weakrefable): + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: ... + + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: ... + + def to_numba(self) -> _numba_driver.Context: ... + + @staticmethod + def get_num_devices() -> int: ... + + @property + def device_number(self) -> int: ... + + @property + def handle(self) -> int: ... + + def synchronize(self) -> None: ... + + @property + def bytes_allocated(self) -> int: ... + + def get_device_address(self, address: int) -> int: ... + + def new_buffer(self, nbytes: int) -> CudaBuffer: ... + + @property + def memory_manager(self) -> lib.MemoryManager: ... + + @property + def device(self) -> lib.Device: ... + + def foreign_buffer(self, address: int, size: int, base: Any | + None = None) -> CudaBuffer: ... + + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: ... + + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: ... + + def buffer_from_object(self, obj: Any) -> CudaBuffer: ... + + +class IpcMemHandle(lib._Weakrefable): + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: ... + + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: ... + + +class CudaBuffer(lib.Buffer): + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: ... + + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: ... + + def to_numba(self) -> _numba_driver.MemoryPointer: ... + + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: ... + + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: ... + + def copy_from_device(self, buf: CudaBuffer, position: int = 0, + nbytes: int = -1) -> int: ... + + def export_for_ipc(self) -> IpcMemHandle: ... 
+ + @property + def context(self) -> Context: ... + + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: ... + + def to_pybytes(self) -> bytes: ... + + +class HostBuffer(lib.Buffer): + @property + def size(self) -> int: ... + + +class BufferReader(lib.NativeFile): + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: ... + + +class BufferWriter(lib.NativeFile): + def __init__(self, obj: CudaBuffer) -> None: ... + def writeat(self, position: int, data: ArrayLike) -> None: ... + + @property + def buffer_size(self) -> int: ... + + @buffer_size.setter + def buffer_size(self, buffer_size: int): ... + + @property + def num_bytes_buffered(self) -> int: ... + + +def new_host_buffer(size: int, device: int = 0) -> HostBuffer: ... + + +def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer: ... + + +def read_message( + source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryManager | None = None +) -> lib.Message: ... + + +def read_record_batch( + buffer: lib.Buffer, + object: lib.Schema, + *, + dictionary_memo: lib.DictionaryMemo | None = None, + pool: lib.MemoryPool | None = None, +) -> lib.RecordBatch: ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset.pyi b/python/pyarrow-stubs/pyarrow/_dataset.pyi new file mode 100644 index 00000000000..c8cd3d97089 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset.pyi @@ -0,0 +1,682 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Collection, Callable, Iterator, Iterable +from typing import ( + IO, + Any, + Generic, + Literal, + NamedTuple, + TypeVar, +) + +from _typeshed import StrPath + +from . import csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + + +class Dataset(lib._Weakrefable): + @property + def partition_expression(self) -> Expression: ... + + def replace_schema(self, schema: lib.Schema) -> Self: ... + + def get_fragments(self, filter: Expression | None = None): ... + + def scanner( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... 
+ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def count_rows( + self, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + + @property + def schema(self) -> lib.Schema: ... + + def filter(self, expression: Expression | None) -> Self: ... + + def sort_by(self, sorting: str | + list[tuple[str, Order]], **kwargs) -> InMemoryDataset: ... + + def join( + self, + right_dataset: Dataset, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> InMemoryDataset: ... + + def join_asof( + self, + right_dataset: Dataset, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> InMemoryDataset: ... + + @property + def format(self) -> FileFormat: ... + + +class InMemoryDataset(Dataset): + def __init__( + self, + source: lib.Table + | lib.RecordBatch + | lib.RecordBatchReader + | Iterable[lib.RecordBatch] + | list[Any], + schema: lib.Schema | None = None, + ) -> None: ... + + +class UnionDataset(Dataset): + def __init__( + self, + schema: lib.Schema | None = None, + children: list[Dataset] | None = None, + ) -> None: ... + + @property + def children(self) -> list[Dataset]: ... + + +class FileSystemDataset(Dataset): + def __init__( + self, + fragments: list[Fragment], + schema: lib.Schema, + format: FileFormat, + filesystem: SupportedFileSystem | None = None, + root_partition: Expression | None = None, + ) -> None: ... 
+ + @classmethod + def from_paths( + cls, + paths: list[str], + schema: lib.Schema | None = None, + format: FileFormat | None = None, + filesystem: SupportedFileSystem | None = None, + partitions: list[Expression] | None = None, + root_partition: Expression | None = None, + ) -> FileSystemDataset: ... + + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning | None: ... + + @property + def files(self) -> list[str]: ... + + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None + ) -> lib.Schema: ... + + def make_fragment( + self, + file: StrPath | IO | lib.Buffer | lib.BufferReader, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... + + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... + @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + + +class Fragment(lib._Weakrefable): + def open(self) -> lib.NativeFile | lib.BufferReader: ... + @property + def path(self) -> str: ... + @property + def row_groups(self) -> list[int]: ... + + @property + def filesystem(self) -> SupportedFileSystem: ... + + @property + def physical_schema(self) -> lib.Schema: ... + + @property + def partition_expression(self) -> Expression: ... + + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + def to_batches( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: ... + + def to_table( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... 
+ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: ... + + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: ... + + +class FileFragment(Fragment): + def open(self) -> lib.NativeFile: ... + + @property + def path(self) -> str: ... + + @property + def filesystem(self) -> FileSystem: ... + + @property + def buffer(self) -> lib.Buffer: ... + + @property + def format(self) -> FileFormat: ... + + +class FragmentScanOptions(lib._Weakrefable): + @property + def type_name(self) -> str: ... + + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + + +class FeatherFileFormat(IpcFileFormat): + ... + + +class CsvFileFormat(FileFormat): + def __init__( + self, + parse_options: csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: csv.ConvertOptions | None = None, + read_options: csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options( + self, **kwargs) -> CsvFileWriteOptions: ... # type: ignore[override] + + @property + def parse_options(self) -> csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + + +class CsvFragmentScanOptions(FragmentScanOptions): + convert_options: csv.ConvertOptions + read_options: csv.ReadOptions + + def __init__( + self, + convert_options: csv.ConvertOptions | None = None, + read_options: csv.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... + + +class CsvFileWriteOptions(FileWriteOptions): + write_options: csv.WriteOptions + + +class JsonFileFormat(FileFormat): + def __init__( + self, + default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + + +class JsonFragmentScanOptions(FragmentScanOptions): + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + + def __init__( + self, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... + + +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: ... + + def format(self, expr: Expression) -> tuple[str, str]: ... + + @property + def schema(self) -> lib.Schema: ... + + @property + def dictionaries(self) -> list[Any]: ... 
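
As a usage sketch (not part of the stub itself), the Dataset/Scanner surface typed above is normally reached through the public pyarrow.dataset wrappers; the path, column names, and filter below are hypothetical.

import pyarrow.dataset as ds

# Open a directory of Parquet files as a FileSystemDataset (hypothetical path).
dataset = ds.dataset("data/", format="parquet")

# Lazily project and filter, then materialize; mirrors Dataset.scanner()/to_table().
scanner = dataset.scanner(columns=["a", "b"], filter=ds.field("a") > 0)
table = scanner.to_table()

# count_rows() pushes the filter down without materializing the columns.
n_valid = dataset.count_rows(filter=ds.field("b").is_valid())
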
+ + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[Any]: ... + + +class DirectoryPartitioning(KeyValuePartitioning): + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + +class HivePartitioning(KeyValuePartitioning): + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + +class FilenamePartitioning(KeyValuePartitioning): + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: ... + + +class DatasetFactory(lib._Weakrefable): + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: ... + + def inspect( + self, + *, + promote_options: str = "default", + fragments: list[Fragment] | int | str | None = None, + ) -> lib.Schema: ... + + def inspect_schemas(self) -> list[lib.Schema]: ... + + +class FileSystemFactoryOptions(lib._Weakrefable): + partitioning: Partitioning + partitioning_factory: PartitioningFactory + partition_base_dir: str + exclude_invalid_files: bool + selector_ignore_prefixes: list[str] + + def __init__( + self, + partition_base_dir: str | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + exclude_invalid_files: bool | None = True, + selector_ignore_prefixes: list[str] | None = None, + ) -> None: ... + + +class FileSystemDatasetFactory(DatasetFactory): + def __init__( + self, + filesystem: SupportedFileSystem, + paths_or_selector: Collection[str] | FileSelector, + format: FileFormat, + options: FileSystemFactoryOptions | None = None, + ) -> None: ... + + +class UnionDatasetFactory(DatasetFactory): + def __init__(self, factories: list[DatasetFactory]) -> None: ... + + +_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) + + +class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): + def __iter__(self) -> Self: ... + def __next__(self) -> _RecordBatchT: ... + + +class TaggedRecordBatch(NamedTuple): + record_batch: lib.RecordBatch + fragment: Fragment + + +class TaggedRecordBatchIterator(lib._Weakrefable): + def __iter__(self) -> Self: ... + def __next__(self) -> TaggedRecordBatch: ... 
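
A minimal sketch of how the partitioning classes above are constructed or discovered; the schema and the Hive-style directory layout (key=value path segments) are assumptions for illustration.

import pyarrow as pa
import pyarrow.dataset as ds

# Explicit Hive partitioning with a known partition schema.
part = ds.HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())]))

# Or let a PartitioningFactory infer the partition fields during discovery.
factory = ds.HivePartitioning.discover(infer_dictionary=True)
dataset = ds.dataset("data/", format="parquet", partitioning=factory)
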
+ + +class Scanner(lib._Weakrefable): + @staticmethod + def from_dataset( + dataset: Dataset, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + @staticmethod + def from_fragment( + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch] | RecordBatchReader | Any, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + + @property + def dataset_schema(self) -> lib.Schema: ... + + @property + def projected_schema(self) -> lib.Schema: ... + + def to_batches(self) -> Iterator[lib.RecordBatch]: ... + + def scan_batches(self) -> TaggedRecordBatchIterator: ... + + def to_table(self) -> lib.Table: ... + + def take(self, indices: Indices) -> lib.Table: ... + + def head(self, num_rows: int) -> lib.Table: ... + + def count_rows(self) -> int: ... + + def to_reader(self) -> RecordBatchReader: ... + + +def get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... + + +class WrittenFile(lib._Weakrefable): + def __init__(self, path: str, metadata: _parquet.FileMetaData | + None, size: int) -> None: ... + + +def _filesystemdataset_write( + data: Scanner, + base_dir: StrPath, + basename_template: str, + filesystem: SupportedFileSystem, + partitioning: Partitioning, + preserve_order: bool, + file_options: FileWriteOptions, + max_partitions: int, + file_visitor: Callable[[str], None] | None, + existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"], + max_open_files: int, + max_rows_per_file: int, + min_rows_per_group: int, + max_rows_per_group: int, + create_dir: bool, +): ... + + +class _ScanNodeOptions(ExecNodeOptions): + def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ... + + +class ScanNodeOptions(_ScanNodeOptions): + def __init__( + self, dataset: Dataset, require_sequenced_output: bool = False, **kwargs + ) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi b/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi new file mode 100644 index 00000000000..62f49bf5d30 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset_orc.pyi @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from ._dataset import FileFormat + + +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi b/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi new file mode 100644 index 00000000000..cb909444ffb --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_dataset_parquet.pyi @@ -0,0 +1,189 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from dataclasses import dataclass +from typing import IO, Any, TypedDict + +from _typeshed import StrPath + +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import SupportedFileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from ._types import DataType, LargeListType, ListType +from .lib import CacheOptions, Schema, _Weakrefable, NativeFile, Buffer, BufferReader + +parquet_encryption_enabled: bool + + +class ParquetFileFormat(FileFormat): + def __init__( + self, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, + **kwargs, + ) -> None: ... + @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options( + self, **kwargs) -> ParquetFileWriteOptions: ... # type: ignore[override] + + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... + + def make_fragment( + self, + file: StrPath | IO | Buffer | BufferReader, + + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: ... + + +class _NameStats(TypedDict): + min: Any + max: Any + + +class RowGroupInfo: + id: int + metadata: FileMetaData + schema: Schema + + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def statistics(self) -> dict[str, _NameStats]: ... 
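
A sketch of wiring the Parquet read and scan options typed here into a dataset through the public pyarrow.dataset names; the dictionary column and path are hypothetical.

import pyarrow.dataset as ds

fmt = ds.ParquetFileFormat(
    # Read this column as dictionary-encoded (hypothetical column name).
    read_options=ds.ParquetReadOptions(dictionary_columns={"category"}),
    # Coalesce and pre-buffer column chunk reads.
    default_fragment_scan_options=ds.ParquetFragmentScanOptions(pre_buffer=True),
)
dataset = ds.dataset("data/", format=fmt)
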
+
+
+class ParquetFileFragment(FileFragment):
+    def ensure_complete_metadata(self) -> None: ...
+    @property
+    def path(self) -> str: ...
+    @property
+    def filesystem(self) -> SupportedFileSystem: ...
+    def open(self) -> NativeFile: ...
+
+    @property
+    def row_groups(self) -> list[int]: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def num_row_groups(self) -> int: ...
+
+    def split_by_row_group(
+        self, filter: Expression | None = None, schema: Schema | None = None
+    ) -> list[Fragment]: ...
+
+    def subset(
+        self,
+        filter: Expression | None = None,
+        schema: Schema | None = None,
+        row_group_ids: list[int] | None = None,
+    ) -> ParquetFileFragment: ...
+
+
+class ParquetReadOptions(_Weakrefable):
+    def __init__(
+        self,
+        dictionary_columns: list[str] | set[str] | None = None,
+        coerce_int96_timestamp_unit: str | None = None,
+        binary_type: DataType | None = None,
+        list_type: type[ListType | LargeListType] | None = None,
+    ) -> None: ...
+
+    @property
+    def dictionary_columns(self) -> set[str]: ...
+    @dictionary_columns.setter
+    def dictionary_columns(self, columns: list[str] | set[str]) -> None: ...
+
+    @property
+    def coerce_int96_timestamp_unit(self) -> str: ...
+    @coerce_int96_timestamp_unit.setter
+    def coerce_int96_timestamp_unit(self, unit: str) -> None: ...
+
+    @property
+    def binary_type(self) -> DataType: ...
+    @binary_type.setter
+    def binary_type(self, type: DataType | None) -> None: ...
+
+    @property
+    def list_type(self) -> type[ListType | LargeListType]: ...
+    @list_type.setter
+    def list_type(self, type: type[ListType | LargeListType] | None) -> None: ...
+
+    def equals(self, other: ParquetReadOptions) -> bool: ...
+
+
+class ParquetFileWriteOptions(FileWriteOptions):
+    def update(self, **kwargs) -> None: ...
+    def _set_properties(self) -> None: ...
+    def _set_arrow_properties(self) -> None: ...
+    def _set_encryption_config(self) -> None: ...
+    # accept passthrough options used in tests
+    def __init__(self, **kwargs) -> None: ...
+
+
+@dataclass(kw_only=True)
+class ParquetFragmentScanOptions(FragmentScanOptions):
+    use_buffered_stream: bool = False
+    buffer_size: int = 8192
+    pre_buffer: bool = True
+    cache_options: CacheOptions | None = None
+    thrift_string_size_limit: int | None = None
+    thrift_container_size_limit: int | None = None
+    decryption_config: ParquetDecryptionConfig | None = None
+    decryption_properties: FileDecryptionProperties | None = None
+    page_checksum_verification: bool = False
+
+    def equals(self, other: ParquetFragmentScanOptions) -> bool: ...
+
+
+@dataclass
+class ParquetFactoryOptions(_Weakrefable):
+
+    partition_base_dir: str | None = None
+    partitioning: Partitioning | PartitioningFactory | None = None
+    validate_column_chunk_paths: bool = False
+
+
+class ParquetDatasetFactory(DatasetFactory):
+    def __init__(
+        self,
+        metadata_path: str,
+        filesystem: SupportedFileSystem,
+        format: FileFormat,
+        options: ParquetFactoryOptions | None = None,
+    ) -> None: ...
diff --git a/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi b/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi
new file mode 100644
index 00000000000..b36f18522e5
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_dataset_parquet_encryption.pyi
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import (CryptoFactory, EncryptionConfiguration, + DecryptionConfiguration, KmsConnectionConfig) +from .lib import _Weakrefable + + +class ParquetEncryptionConfig(_Weakrefable): + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + + +class ParquetDecryptionConfig(_Weakrefable): + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration, + ) -> None: ... + + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... + + +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... + + +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/python/pyarrow-stubs/pyarrow/_feather.pyi b/python/pyarrow-stubs/pyarrow/_feather.pyi new file mode 100644 index 00000000000..2f4757cd5f1 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_feather.pyi @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal +from collections.abc import Sequence + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + + +class FeatherError(Exception): + ... + + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +): ... + + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: Sequence[int]) -> Table: ... + def read_names(self, names: Sequence[str]) -> Table: ... 
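
A round-trip sketch for the Feather reader/writer typed above, through the public pyarrow.feather wrappers; the file name is hypothetical.

import pyarrow as pa
from pyarrow import feather

table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
# Feather v2 (the default) supports compression and column selection on read.
feather.write_feather(table, "example.feather", compression="zstd")
roundtrip = feather.read_table("example.feather", columns=["a"])
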
diff --git a/python/pyarrow-stubs/pyarrow/_flight.pyi b/python/pyarrow-stubs/pyarrow/_flight.pyi new file mode 100644 index 00000000000..03d6c6580ab --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_flight.pyi @@ -0,0 +1,660 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Generator, Iterable, Iterator, Sequence +from typing import Any, Generic, NamedTuple, TypeVar +from datetime import datetime +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin, ReadStats +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Scalar, + Schema, + Table, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + + +class FlightCallOptions(_Weakrefable): + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str | bytes, str | bytes]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: ... + + +class CertKeyPair(NamedTuple): + cert: str | bytes | None + key: str | bytes | None + + +class FlightError(Exception): + extra_info: bytes + + +class FlightInternalError(FlightError, ArrowException): + ... + + +class FlightTimedOutError(FlightError, ArrowException): + ... + + +class FlightCancelledError(FlightError, ArrowCancelled): + def __init__(self, message: str, *, extra_info: bytes | None = None) -> None: ... + + +class FlightServerError(FlightError, ArrowException): + ... + + +class FlightUnauthenticatedError(FlightError, ArrowException): + ... + + +class FlightUnauthorizedError(FlightError, ArrowException): + ... + + +class FlightUnavailableError(FlightError, ArrowException): + ... + + +class FlightWriteSizeExceededError(ArrowInvalid): + limit: int + actual: int + + +class Action(_Weakrefable): + def __init__( + self, action_type: bytes | str, buf: Buffer | bytes | None) -> None: ... + + @property + def type(self) -> str: ... + + @property + def body(self) -> Buffer: ... + + def serialize(self) -> bytes: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class ActionType(NamedTuple): + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: ... + + +class Result(_Weakrefable): + def __init__(self, buf: Buffer | bytes) -> None: ... + + @property + def body(self) -> Buffer: ... + + def serialize(self) -> bytes: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
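
A client-side sketch of the Action/Result pair typed above; the server location and action name are hypothetical and assume a Flight server that implements do_action.

from pyarrow import flight

client = flight.connect("grpc://localhost:8815")
# do_action streams back zero or more opaque Result payloads.
for result in client.do_action(flight.Action("healthcheck", b"")):
    print(result.body.to_pybytes())
client.close()
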
+ + +class BasicAuth(_Weakrefable): + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: ... + + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + + +class DescriptorType(enum.Enum): + UNKNOWN = 0 + PATH = 1 + CMD = 2 + + +class FlightMethod(enum.Enum): + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + + +class FlightDescriptor(_Weakrefable): + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: ... + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: ... + + @property + def descriptor_type(self) -> DescriptorType: ... + + @property + def path(self) -> list[bytes] | None: ... + + @property + def command(self) -> bytes | None: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class Ticket(_Weakrefable): + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class Location(_Weakrefable): + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: ... + + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: ... + + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: ... + + +class FlightEndpoint(_Weakrefable): + def __init__( + self, + ticket: Ticket | str | bytes | object, + locations: list[str | bytes | Location | object], + expiration_time: Scalar[Any] | str | datetime | None = ..., + app_metadata: bytes | str | object = ..., + ): ... + + @property + def ticket(self) -> Ticket: ... + + @property + def locations(self) -> list[Location]: ... + + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> Scalar[Any] | None: ... + + @property + def app_metadata(self) -> bytes | str: ... + + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class SchemaResult(_Weakrefable): + def __init__(self, schema: Schema) -> None: ... + + @property + def schema(self) -> Schema: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class FlightInfo(_Weakrefable): + def __init__( + self, + schema: Schema | None, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int | None = ..., + total_bytes: int | None = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: ... + + @property + def schema(self) -> Schema | None: ... + + @property + def descriptor(self) -> FlightDescriptor: ... + + @property + def endpoints(self) -> list[FlightEndpoint]: ... + + @property + def total_records(self) -> int: ... + + @property + def total_bytes(self) -> int: ... + + @property + def ordered(self) -> bool: ... + + @property + def app_metadata(self) -> bytes | str: ... + + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + + +class FlightStreamChunk(_Weakrefable): + @property + def data(self) -> RecordBatch | None: ... 
+ @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: ... + + def read_all(self) -> Table: ... + + def read_chunk(self) -> FlightStreamChunk: ... + + def to_reader(self) -> RecordBatchReader: ... + + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + +class FlightStreamReader(MetadataRecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + def cancel(self) -> None: ... + + def read_all(self) -> Table: ... + + def read(self) -> RecordBatch | None: ... + + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: ... + + def write_metadata(self, buf: Buffer | bytes) -> None: ... + + def write_batch(self, batch: RecordBatch) -> None: ... # type: ignore[override] + + def write_table(self, table: Table, max_chunksize: int | + None = None, **kwargs) -> None: ... + + def close(self) -> None: ... + + def write_with_metadata(self, batch: RecordBatch, buf: Buffer | bytes) -> None: ... + + +class FlightStreamWriter(MetadataRecordBatchWriter): + def done_writing(self) -> None: ... + + +class FlightMetadataReader(_Weakrefable): + def read(self) -> Buffer | None: ... + + +class FlightMetadataWriter(_Weakrefable): + def write(self, message: Buffer) -> None: ... + + +class AsyncioCall(Generic[_T]): + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + + +class AsyncioFlightClient: + def __init__(self, client: FlightClient) -> None: ... + + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + + +class FlightClient(_Weakrefable): + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: ... + + @classmethod + @deprecated( + "Use the ``FlightClient`` constructor or " + "``pyarrow.flight.connect`` function instead." + ) + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: ... + + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: ... + + def authenticate_basic_token( + self, username: str | bytes, password: str | bytes, + options: FlightCallOptions | None = None + ) -> tuple[str, str]: ... + + def list_actions(self, options: FlightCallOptions | + None = None) -> list[Action]: ... 
+ + def do_action( + self, action: Action | tuple[bytes | str, bytes | str] | str, + options: FlightCallOptions | None = None + ) -> Iterator[Result]: ... + + def list_flights( + self, criteria: str | bytes | None = None, + options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: ... + + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: ... + + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> SchemaResult: ... + + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: ... + + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema | None, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + + +class FlightDataStream(_Weakrefable): + ... + + +class RecordBatchStream(FlightDataStream): + def __init__(self, data_source: RecordBatchReader | Table | None = None, + options: IpcWriteOptions | None = None) -> None: ... + + +class GeneratorStream(FlightDataStream): + def __init__( + self, + schema: Schema, + generator: Iterable[ + FlightDataStream + | Table + | RecordBatch + | RecordBatchReader + | tuple[RecordBatch, bytes] + ], + options: IpcWriteOptions | None = None, + ) -> None: ... + + +class ServerCallContext(_Weakrefable): + def peer_identity(self) -> bytes: ... + + def peer(self) -> str: ... + + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: ... + + def add_header(self, key: str, value: str) -> None: ... + + def add_trailer(self, key: str, value: str) -> None: ... + + def get_middleware(self, key: str) -> ServerMiddleware | None: ... + + +class ServerAuthReader(_Weakrefable): + def read(self) -> str: ... + + +class ServerAuthSender(_Weakrefable): + def write(self, message: str) -> None: ... + + +class ClientAuthReader(_Weakrefable): + def read(self) -> str: ... + + +class ClientAuthSender(_Weakrefable): + def write(self, message: str) -> None: ... + + +class ServerAuthHandler(_Weakrefable): + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): ... + + def is_valid(self, token: str) -> bool: ... + + +class ClientAuthHandler(_Weakrefable): + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): ... + + def get_token(self) -> str: ... + + +class CallInfo(NamedTuple): + method: FlightMethod + + +class ClientMiddlewareFactory(_Weakrefable): + def start_call(self, info: CallInfo) -> ClientMiddleware | None: ... + + +class ClientMiddleware(_Weakrefable): + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): ... + + def call_completed(self, exception: ArrowException): ... + + +class ServerMiddlewareFactory(_Weakrefable): + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: ... + + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + ... + + +class ServerMiddleware(_Weakrefable): + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: ... 
+ + def call_completed(self, exception: ArrowException): ... + + @property + def trace_context(self) -> dict: ... + + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... + + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... + + +class _ServerMiddlewareWrapper(ServerMiddleware): + def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + + +class _FlightServerFinalizer(_Weakrefable): + + def finalize(self) -> None: ... + + +class FlightServerBase(_Weakrefable): + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + + @property + def port(self) -> int: ... + + def list_flights(self, context: ServerCallContext, + criteria: str) -> Iterator[FlightInfo]: ... + + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: ... + + def get_schema(self, context: ServerCallContext, + descriptor: FlightDescriptor) -> Schema: ... + + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: ... + + def do_get(self, context: ServerCallContext, + ticket: Ticket) -> FlightDataStream: ... + + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: ... + + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: ... + + def do_action(self, context: ServerCallContext, + action: Action) -> Iterable[bytes]: ... + + def serve(self) -> None: ... + + def run(self) -> None: ... + + def shutdown(self) -> None: ... + + def wait(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__( + self, exc_type: object, exc_value: object, traceback: object) -> None: ... + + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: Sequence[tuple[str, int | str]] | None = None, +) -> FlightClient: ... diff --git a/python/pyarrow-stubs/pyarrow/_fs.pyi b/python/pyarrow-stubs/pyarrow/_fs.pyi new file mode 100644 index 00000000000..caf23a75d99 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_fs.pyi @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt +import enum +import sys + +from abc import ABC, abstractmethod +from _typeshed import StrPath + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from fsspec import AbstractFileSystem # type: ignore + +from .lib import NativeFile, _Weakrefable + + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + + +class FileInfo(_Weakrefable): + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + + def __getitem__(self, int) -> FileInfo: ... + + @property + def type(self) -> FileType: ... + + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: ... + + @property + def base_name(self) -> str: ... + + @property + def size(self) -> int: ... + + @property + def extension(self) -> str: ... + + @property + def mtime(self) -> dt.datetime | None: ... + + @property + def mtime_ns(self) -> int | None: ... + + +class FileSelector(_Weakrefable): + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, + recursive: bool = False): ... + + +class FileSystem(_Weakrefable): + @classmethod + def from_uri(cls, uri: str | StrPath) -> tuple[Self, str]: ... + + def equals(self, other: FileSystem | object) -> bool: ... + + @property + def type_name(self) -> str: ... + + def get_file_info( + self, paths_or_selector: str | list[str] | FileSelector + ) -> list[FileInfo] | FileInfo: ... + + def create_dir(self, path: str, *, recursive: bool = True) -> None: ... + + def delete_dir(self, path: str) -> None: ... + + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: ... + + def move(self, src: str, dest: str) -> None: ... + + def copy_file(self, src: str, dest: str) -> None: ... + + def delete_file(self, path: str) -> None: ... + + def open_input_file(self, path: str) -> NativeFile: ... + + def open_input_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None) -> NativeFile: ... + + def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ) -> NativeFile: ... + + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ): ... + + def normalize_path(self, path: str) -> str: ... + + +class LocalFileSystem(FileSystem): + def __init__(self, *, use_mmap: bool = False) -> None: ... 
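
A short sketch of the FileSystem/FileSelector/FileInfo surface typed above, via the public pyarrow.fs wrappers; the listed directory is hypothetical.

from pyarrow import fs

local = fs.LocalFileSystem()
selector = fs.FileSelector("/tmp", recursive=True)
# get_file_info() returns a list of FileInfo entries for a selector.
for info in local.get_file_info(selector):
    if info.type == fs.FileType.File:
        print(info.path, info.size)
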
+ + +class SubTreeFileSystem(FileSystem): + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... + @property + def base_fs(self) -> FileSystem: ... + + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + + +class PyFileSystem(FileSystem): + def __init__(self, handler: FileSystemHandler | None) -> None: ... + @property + def handler(self) -> FileSystemHandler: ... + + +class FileSystemHandler(ABC): + @abstractmethod + def get_type_name(self) -> str: ... + + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: ... + + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: ... + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: ... + + @abstractmethod + def delete_dir(self, path: str) -> None: ... + + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + + @abstractmethod + def delete_root_dir_contents(self) -> None: ... + + @abstractmethod + def delete_file(self, path: str) -> None: ... + + @abstractmethod + def move(self, src: str, dest: str) -> None: ... + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: ... + + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: ... + + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: ... + + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: ... + + @abstractmethod + def normalize_path(self, path: str) -> str: ... + + +SupportedFileSystem: TypeAlias = AbstractFileSystem | FileSystem + + +def _copy_files( + source_fs: FileSystem, + source_path: str, + destination_fs: SupportedFileSystem | None, + destination_path: str, + chunk_size: int = 1048576, + use_threads: bool = True, +) -> None: ... + + +def _copy_files_selector( + source_fs: FileSystem, + source_sel: FileSelector, + destination_fs: SupportedFileSystem | None, + destination_base_dir: str, + chunk_size: int = 1048576, + use_threads: bool = True, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_gcsfs.pyi b/python/pyarrow-stubs/pyarrow/_gcsfs.pyi new file mode 100644 index 00000000000..a0af3fa3871 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_gcsfs.pyi @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + + +class GcsFileSystem(FileSystem): + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: ... + + @property + def project_id(self) -> str: ... diff --git a/python/pyarrow-stubs/pyarrow/_hdfs.pyi b/python/pyarrow-stubs/pyarrow/_hdfs.pyi new file mode 100644 index 00000000000..370eaf70927 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_hdfs.pyi @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from _typeshed import StrPath + +from ._fs import FileSystem + + +class HadoopFileSystem(FileSystem): + def __init__( + self, + host: str | None = None, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: StrPath | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str | int) -> HadoopFileSystem: ... # type: ignore[override] diff --git a/python/pyarrow-stubs/pyarrow/_ipc.pyi b/python/pyarrow-stubs/pyarrow/_ipc.pyi new file mode 100644 index 00000000000..ec0557f380b --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_ipc.pyi @@ -0,0 +1,319 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from collections.abc import Iterable, Iterator, Mapping +from typing import Any, Literal, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile, BufferReader +from ._types import DictionaryMemo, KeyValueMetadata + + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + + +class Alignment(enum.IntEnum): + Any = enum.auto() + At64Byte = enum.auto() + DataTypeSpecific = enum.auto() + + +class WriteStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + + +class ReadStats(NamedTuple): + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + + +class IpcReadOptions(_Weakrefable): + ensure_native_endian: bool + use_threads: bool + ensure_alignment: Alignment + included_fields: list[int] | None + + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + ensure_alignment: Alignment = ..., + included_fields: list[int] | None = None, + ) -> None: ... + + +class IpcWriteOptions(_Weakrefable): + metadata_version: Any + allow_64bit: bool + use_legacy_format: bool + compression: Any + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + + +class Message(_Weakrefable): + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... + + def serialize_to(self, sink: NativeFile, alignment: int = 8, + memory_pool: MemoryPool | None = None): ... + + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | + None = None) -> Buffer: ... + + +class MessageReader(_Weakrefable): + @classmethod + def open_stream(cls, source: bytes | NativeFile | + IOBase | SupportPyBuffer) -> Self: ... + + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: ... + + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + + +class _CRecordBatchWriter(_Weakrefable): + def write(self, table_or_batch: Table | RecordBatch): ... + + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): ... + + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: ... + + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: ... + + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + @property + def _use_legacy_format(self) -> bool: ... 
+ @property + def _metadata_version(self) -> MetadataVersion: ... + + def _open( + self, + sink, + schema: Schema, + options: IpcWriteOptions = IpcWriteOptions(), # noqa: Y011 + metadata: dict[bytes, bytes] | None = None, + ): ... + + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: ... + + +class RecordBatchReader(_Weakrefable): + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: ... + + __next__ = read_next_batch + @property + def schema(self) -> Schema: ... + + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: ... + + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: ... + + def read_all(self) -> Table: ... + + read_pandas = _ReadPandasMixin.read_pandas + def close(self) -> None: ... + + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + + @classmethod + def from_stream(cls, data: Any, + schema: Any = None) -> Self: ... + + @classmethod + def from_batches(cls, schema: Any, batches: Iterable[RecordBatch]) -> Self: ... + + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: ... + + def _open( + self, + source, + options: IpcReadOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): + ... + + +class RecordBatchWithMetadata(NamedTuple): + batch: RecordBatch + custom_metadata: KeyValueMetadata + + +class _RecordBatchFileReader(_Weakrefable): + @property + def num_record_batches(self) -> int: ... + + def get_batch(self, i: int) -> RecordBatch: ... + + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: ... + + def read_all(self) -> Table: ... + + read_pandas = _ReadPandasMixin.read_pandas + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + @property + def metadata(self) -> KeyValueMetadata | None: ... + + def _open( + self, + source, + footer_offset: int | None = None, + options: IpcReadOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + + +def get_tensor_size(tensor: Tensor) -> int: ... + + +def get_record_batch_size(batch: RecordBatch) -> int: ... + + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: ... + + +def read_tensor(source: NativeFile) -> Tensor: ... + + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: ... + + +def read_schema(obj: Buffer | Message | BufferReader, dictionary_memo: DictionaryMemo | + None = None) -> Schema: ... + + +def read_record_batch( + obj: Message | SupportPyBuffer, + schema: Schema, + dictionary_memo: DictionaryMemo | None = None) -> RecordBatch: ... 
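
A round-trip sketch of the stream writer/reader pair typed above, using the public pyarrow.ipc helpers.

import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3]})
sink = pa.BufferOutputStream()
# new_stream() returns the stream writer typed here as _RecordBatchStreamWriter.
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)

# open_stream() yields a RecordBatchReader over the serialized buffer.
reader = pa.ipc.open_stream(sink.getvalue())
table = reader.read_all()
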
+ + +__all__ = [ + "MetadataVersion", + "Alignment", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/pyarrow-stubs/pyarrow/_json.pyi b/python/pyarrow-stubs/pyarrow/_json.pyi new file mode 100644 index 00000000000..bae2ff404f0 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_json.pyi @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + + +class ReadOptions(_Weakrefable): + use_threads: bool + block_size: int + + def __init__(self, use_threads: bool | None = None, + block_size: int | None = None): ... + + def equals(self, other: ReadOptions) -> bool: ... + + +class ParseOptions(_Weakrefable): + explicit_schema: Schema + newlines_in_values: bool + unexpected_field_behavior: Literal["ignore", "error", "infer"] + + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: ... + + +class JSONStreamingReader(RecordBatchReader): + ... + + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: ... + + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: ... diff --git a/python/pyarrow-stubs/pyarrow/_orc.pyi b/python/pyarrow-stubs/pyarrow/_orc.pyi new file mode 100644 index 00000000000..faa0f57c1fd --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_orc.pyi @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import IO, Literal, Any + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... + + +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Any = 'UNCOMPRESSED', + compression_block_size: int | None = None, + compression_strategy: Any = 'SPEED', + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_parquet.pyi b/python/pyarrow-stubs/pyarrow/_parquet.pyi new file mode 100644 index 00000000000..79d32ece45f --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_parquet.pyi @@ -0,0 +1,523 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
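Looking back at the _orc.pyi stubs above: ORCReader and ORCWriter are internal classes that are usually driven through the public pyarrow.orc module. A small sketch ("example.orc" is a placeholder path):

import pyarrow as pa
from pyarrow import orc

table = pa.table({"x": [1, 2, 3]})
orc.write_table(table, "example.orc")     # wraps ORCWriter
f = orc.ORCFile("example.orc")            # wraps ORCReader
print(f.schema, f.nrows, f.nstripes)
roundtripped = f.read(columns=["x"])      # -> pa.Table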
+ +from collections.abc import Iterable, Iterator, Sequence +from typing import IO, Any, Literal, TypeAlias, TypedDict + +from _typeshed import StrPath + +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, + DataType, + ListType, + LargeListType +) + +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] + + +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType + + +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def has_null_count(self) -> bool: ... + @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... + @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def is_min_exact(self) -> bool: ... + @property + def is_max_exact(self) -> bool: ... + + +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... 
+ @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + @property + def name(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def converted_type(self) -> _ConvertedType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[str] + | Sequence[tuple[str, Order]] + | Sequence[str | tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] | list[SortingColumn] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def sorting_columns(self) -> list[SortingColumn]: ... + + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + row_groups: list[Any] # List of row group metadata dictionaries + + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + @property + def format_version(self) -> str: ... + @property + def created_by(self) -> str: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... 
+ def row_group(self, i: int) -> RowGroupMetaData: ... + def set_file_path(self, path: str) -> None: ... + def append_row_groups(self, other: FileMetaData) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | + NativeFile | IO) -> None: ... + + +class ParquetSchema(_Weakrefable): + def __init__(self, container: FileMetaData) -> None: ... + def __getitem__(self, i: int) -> ColumnSchema: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + @property + def names(self) -> list[str]: ... + def to_arrow_schema(self) -> Schema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def column(self, i: int) -> ColumnSchema: ... + + +class ColumnSchema(_Weakrefable): + def __init__(self, schema: ParquetSchema, index: int) -> None: ... + def equals(self, other: ColumnSchema) -> bool: ... + @property + def name(self) -> str: ... + @property + def path(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def length(self) -> int | None: ... + @property + def precision(self) -> int | None: ... + @property + def scale(self) -> int | None: ... + + +class ParquetReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + + def open( + self, + source: StrPath | Buffer | NativeFile | IO, + *, + use_memory_map: bool = False, + read_dictionary: Iterable[int] | Iterable[str] | None = None, + metadata: FileMetaData | None = None, + binary_type: DataType | None = None, + list_type: ListType | LargeListType | None = None, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + arrow_extensions_enabled: bool | None = None, + ) -> None: ... + + @property + def column_paths(self) -> list[str]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def set_use_threads(self, use_threads: bool) -> None: ... + def set_batch_size(self, batch_size: int) -> None: ... + + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list[int] | range | None = None, + column_indices: list[str] | list[int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: ... + + def read_row_group( + self, i: int, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + + def read_row_groups( + self, + row_groups: Sequence[int] | range, + column_indices: list[str] | list[int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def read_all( + self, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + + def scan_contents( + self, columns: Sequence[str] | Sequence[int] | None = None, + batch_size: int = 65536 + ) -> int: ... + + def column_name_idx(self, column_name: str) -> int: ... + def read_column(self, column_index: int) -> ChunkedArray: ... + def close(self) -> None: ... + @property + def closed(self) -> bool: ... 
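A sketch of how the metadata classes typed above (FileMetaData, RowGroupMetaData, ColumnChunkMetaData, Statistics) are reached through the public pyarrow.parquet wrappers around ParquetReader ("example.parquet" is a placeholder path):

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"x": [1, 2, 3]}), "example.parquet")
pf = pq.ParquetFile("example.parquet")    # wraps ParquetReader
meta = pf.metadata                        # FileMetaData
col = meta.row_group(0).column(0)         # RowGroupMetaData -> ColumnChunkMetaData
stats = col.statistics                    # Statistics | None
if stats is not None and stats.has_min_max:
    print(stats.min, stats.max, stats.physical_type)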
+
+
+class ParquetWriter(_Weakrefable):
+    def __init__(
+        self,
+        where: StrPath | NativeFile | IO,
+        schema: Schema,
+        use_dictionary: bool | list[str] | None = None,
+        compression: _Compression | dict[str, _Compression] | str | None = None,
+        version: str | None = None,
+        write_statistics: bool | list[str] | None = None,
+        memory_pool: MemoryPool | None = None,
+        use_deprecated_int96_timestamps: bool = False,
+        coerce_timestamps: Literal["ms", "us"] | None = None,
+        data_page_size: int | None = None,
+        allow_truncated_timestamps: bool = False,
+        compression_level: int | dict[str, int] | None = None,
+        use_byte_stream_split: bool | list[str] = False,
+        column_encoding: _Encoding | dict[str, _Encoding] | None = None,
+        writer_engine_version: str | None = None,
+        data_page_version: str | None = None,
+        use_compliant_nested_type: bool = True,
+        encryption_properties: FileEncryptionProperties | None = None,
+        write_batch_size: int | None = None,
+        dictionary_pagesize_limit: int | None = None,
+        store_schema: bool = True,
+        write_page_index: bool = False,
+        write_page_checksum: bool = False,
+        sorting_columns: tuple[SortingColumn, ...] | None = None,
+        store_decimal_as_integer: bool = False,
+        write_time_adjusted_to_utc: bool = False,
+    ): ...
+    def close(self) -> None: ...
+    def write_table(self, table: Table, row_group_size: int | None = None) -> None: ...
+    def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def use_dictionary(self) -> bool | list[str] | None: ...
+    @property
+    def use_deprecated_int96_timestamps(self) -> bool: ...
+    @property
+    def use_byte_stream_split(self) -> bool | list[str]: ...
+    @property
+    def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ...
+    @property
+    def coerce_timestamps(self) -> Literal["ms", "us"] | None: ...
+    @property
+    def allow_truncated_timestamps(self) -> bool: ...
+    @property
+    def compression(self) -> _Compression | dict[str, _Compression] | None: ...
+    @property
+    def compression_level(self) -> int | dict[str, int] | None: ...
+    @property
+    def data_page_version(self) -> str | None: ...
+    @property
+    def use_compliant_nested_type(self) -> bool: ...
+    @property
+    def version(self) -> str | None: ...
+    @property
+    def write_statistics(self) -> bool | list[str] | None: ...
+    @property
+    def writer_engine_version(self) -> str: ...
+    @property
+    def row_group_size(self) -> int: ...
+    @property
+    def data_page_size(self) -> int: ...
+    @property
+    def encryption_properties(self) -> FileEncryptionProperties: ...
+    @property
+    def write_batch_size(self) -> int: ...
+    @property
+    def dictionary_pagesize_limit(self) -> int: ...
+    @property
+    def store_schema(self) -> bool: ...
+    @property
+    def store_decimal_as_integer(self) -> bool: ...
+
+
+class FileEncryptionProperties:
+    ...
+
+
+class FileDecryptionProperties:
+    ...
diff --git a/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi b/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi
new file mode 100644
index 00000000000..b27504f47df
--- /dev/null
+++ b/python/pyarrow-stubs/pyarrow/_parquet_encryption.pyi
@@ -0,0 +1,95 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime as dt + +from collections.abc import Callable + +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + uniform_encryption: bool + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + uniform_encryption: bool | None = None, + ) -> None: ... + + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[ + KmsConnectionConfig], KmsClient]): ... + + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/_s3fs.pyi b/python/pyarrow-stubs/pyarrow/_s3fs.pyi new file mode 100644 index 00000000000..f82f34d2cae --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_s3fs.pyi @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from typing import Literal, TypedDict +from typing_extensions import Required, NotRequired + +from ._fs import FileSystem +from .lib import KeyValueMetadata + + +class _ProxyOptions(TypedDict): + scheme: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] + + +class S3LogLevel(enum.IntEnum): + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() + + +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace + + +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... +def finalize_s3() -> None: ... +def ensure_s3_finalized() -> None: ... +def resolve_s3_region(bucket: str) -> str: ... + + +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + + +class AwsStandardS3RetryStrategy(S3RetryStrategy): + ... + + +class AwsDefaultS3RetryStrategy(S3RetryStrategy): + ... + + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | list | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | dict | tuple | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + allow_delayed_open: bool = False, + check_directory_existence_before_creation: bool = False, + tls_ca_file_path: str | None = None, + retry_strategy: S3RetryStrategy = + AwsStandardS3RetryStrategy(max_attempts=3), # noqa: Y011 + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... diff --git a/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi new file mode 100644 index 00000000000..33210aac061 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime as dt
+
+from collections.abc import Collection, Iterator, Sequence
+from decimal import Decimal
+from typing import Any, Literal, Protocol, TypeAlias, TypeVar
+
+import numpy as np
+
+from numpy.typing import NDArray
+
+from pyarrow.lib import BooleanArray, IntegerArray, ChunkedArray
+
+ArrayLike: TypeAlias = Any
+ScalarLike: TypeAlias = Any
+Order: TypeAlias = Literal["ascending", "descending"]
+JoinType: TypeAlias = Literal[
+    "left semi",
+    "right semi",
+    "left anti",
+    "right anti",
+    "inner",
+    "left outer",
+    "right outer",
+    "full outer",
+]
+Compression: TypeAlias = Literal[
+    "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy"
+]
+NullEncoding: TypeAlias = Literal["mask", "encode"]
+NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"]
+Mask: TypeAlias = (
+    Sequence[bool | None]
+    | NDArray[np.bool_]
+    | BooleanArray
+    | ChunkedArray[Any]
+)
+Indices: TypeAlias = (
+    Sequence[int | None]
+    | NDArray[np.integer[Any]]
+    | IntegerArray
+    | ChunkedArray[Any]
+)
+
+PyScalar: TypeAlias = (bool | int | float | Decimal | str | bytes |
+                       dt.date | dt.datetime | dt.time | dt.timedelta)
+
+_T = TypeVar("_T")
+_V = TypeVar("_V", covariant=True)
+
+SingleOrList: TypeAlias = list[_T] | _T
+
+
+class SupportEq(Protocol):
+    def __eq__(self, other) -> bool: ...
+
+
+class SupportLt(Protocol):
+    def __lt__(self, other) -> bool: ...
+
+
+class SupportGt(Protocol):
+    def __gt__(self, other) -> bool: ...
+
+
+class SupportLe(Protocol):
+    def __le__(self, other) -> bool: ...
+
+
+class SupportGe(Protocol):
+    def __ge__(self, other) -> bool: ...
+
+
+FilterTuple: TypeAlias = (
+    tuple[str, Literal["=", "==", "!="], SupportEq]
+    | tuple[str, Literal["<"], SupportLt]
+    | tuple[str, Literal[">"], SupportGt]
+    | tuple[str, Literal["<="], SupportLe]
+    | tuple[str, Literal[">="], SupportGe]
+    | tuple[str, Literal["in", "not in"], Collection]
+    | tuple[str, str, Any]  # Allow general str for operator to avoid type errors
+)
+
+
+class Buffer(Protocol):
+    ...
+
+
+class SupportPyBuffer(Protocol):
+    ...
+
+
+class SupportArrowStream(Protocol):
+    def __arrow_c_stream__(self, requested_schema=None) -> Any: ...
+
+
+class SupportPyArrowArray(Protocol):
+    def __arrow_array__(self, type=None) -> Any: ...
+
+
+class SupportArrowArray(Protocol):
+    def __arrow_c_array__(self, requested_schema=None) -> Any: ...
+
+
+class SupportArrowDeviceArray(Protocol):
+    def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ...
+
+
+class SupportArrowSchema(Protocol):
+    def __arrow_c_schema__(self) -> Any: ...
+
+
+class NullableCollection(Protocol[_V]):  # type: ignore[reportInvalidTypeVarUse]
+    def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ...
+    def __len__(self) -> int: ...
+    def __contains__(self, item: Any, /) -> bool: ...
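The FilterTuple alias above describes the tuples accepted by the filters= argument of the Parquet and dataset readers. A minimal sketch ("example.parquet" is a placeholder path):

import pyarrow.parquet as pq

filters = [("x", ">", 1), ("x", "not in", {7, 9})]   # list of FilterTuple
table = pq.read_table("example.parquet", filters=filters)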
diff --git a/python/pyarrow-stubs/pyarrow/_substrait.pyi b/python/pyarrow-stubs/pyarrow/_substrait.pyi new file mode 100644 index 00000000000..6818d9822ab --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_substrait.pyi @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable +from typing import Any + +from ._compute import Expression +from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable + + +def run_query( + plan: Buffer | int, + *, + table_provider: Callable[[list[str], Schema], Table] | None = None, + use_threads: bool = True, +) -> RecordBatchReader: ... +def _parse_json_plan(plan: bytes) -> Buffer: ... + + +class SubstraitSchema: + schema: bytes + expression: bytes + def __init__(self, schema: bytes, expression: bytes) -> None: ... + def to_pysubstrait(self) -> Any: ... + + +def serialize_schema(schema: Schema) -> SubstraitSchema: ... +def deserialize_schema(buf: Buffer | bytes | SubstraitSchema) -> Schema: ... + + +def serialize_expressions( + exprs: list[Expression], + names: list[str], + schema: Schema, + *, + allow_arrow_extensions: bool = False, +) -> Buffer: ... + + +class BoundExpressions(_Weakrefable): + @property + def schema(self) -> Schema: ... + @property + def expressions(self) -> dict[str, Expression]: ... + @classmethod + def from_substrait(cls, message: Buffer | bytes | Any) -> BoundExpressions: ... + + +def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... +def get_supported_functions() -> list[str]: ... diff --git a/python/pyarrow-stubs/pyarrow/_types.pyi b/python/pyarrow-stubs/pyarrow/_types.pyi new file mode 100644 index 00000000000..5438888902a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/_types.pyi @@ -0,0 +1,966 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
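For the Substrait stubs above, a minimal sketch; only calls that need no pre-built plan are shown, and table_provider merely illustrates the callable signature expected by run_query:

import pyarrow as pa
import pyarrow.substrait as substrait

def table_provider(names: list[str], schema: pa.Schema) -> pa.Table:
    # Would look up a named table for run_query; placeholder body.
    return pa.table({"x": [1, 2, 3]})

print(substrait.get_supported_functions()[:3])   # list[str] of function ids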
+ +import datetime as dt # noqa: F401 +import sys + +from collections.abc import Mapping, Sequence, Iterable, Iterator +from decimal import Decimal # noqa: F401 + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import Any, Generic, Literal + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( # noqa: F401 + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar, deprecated + +from .io import Buffer +from .scalar import ExtensionScalar + + +class _Weakrefable: + ... + + +class _Metadata(_Weakrefable): + ... + + +class DataType(_Weakrefable): + def field(self, i: int) -> Field: ... + + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: ... + + @property + def byte_width(self) -> int: ... + + @property + def num_fields(self) -> int: ... + + @property + def num_buffers(self) -> int: ... + + @property + def has_variadic_buffers(self) -> bool: ... + + # Properties that exist on specific subtypes but accessed generically + @property + def list_size(self) -> int: ... + + def __hash__(self) -> int: ... + + def equals(self, other: DataType | str, *, + check_metadata: bool = False) -> bool: ... + + def to_pandas_dtype(self) -> np.generic: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: ... + + +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + + +class _BasicDataType(DataType, Generic[_AsPyType]): + ... + + +class NullType(_BasicDataType[None]): + ... + + +class BoolType(_BasicDataType[bool]): + ... + + +class UInt8Type(_BasicDataType[int]): + ... + + +class Int8Type(_BasicDataType[int]): + ... + + +class UInt16Type(_BasicDataType[int]): + ... + + +class Int16Type(_BasicDataType[int]): + ... + + +class UInt32Type(_BasicDataType[int]): + ... + + +class Int32Type(_BasicDataType[int]): + ... + + +class UInt64Type(_BasicDataType[int]): + ... + + +class Int64Type(_BasicDataType[int]): + ... + + +class Float16Type(_BasicDataType[float]): + ... + + +class Float32Type(_BasicDataType[float]): + ... + + +class Float64Type(_BasicDataType[float]): + ... + + +class Date32Type(_BasicDataType[dt.date]): + ... + + +class Date64Type(_BasicDataType[dt.date]): + ... + + +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): + ... + + +class StringType(_BasicDataType[str]): + ... + + +class LargeStringType(_BasicDataType[str]): + ... + + +class StringViewType(_BasicDataType[str]): + ... + + +class BinaryType(_BasicDataType[bytes]): + ... + + +class LargeBinaryType(_BasicDataType[bytes]): + ... + + +class BinaryViewType(_BasicDataType[bytes]): + ... + + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + + @property + def unit(self) -> _Unit: ... + + @property + def tz(self) -> _Tz: ... + + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + @property + def unit(self) -> _Time32Unit: ... 
+ + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + @property + def unit(self) -> _Time64Unit: ... + + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + @property + def unit(self) -> _Unit: ... + + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + ... + + +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) + + +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + @property + def precision(self) -> _Precision: ... + + @property + def scale(self) -> _Scale: ... + + +class ListType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class LargeListType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: ... + + +class ListViewType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class LargeListViewType(DataType, Generic[_DataTypeT]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + @property + def value_field(self) -> Field[_DataTypeT]: ... + + @property + def value_type(self) -> _DataTypeT: ... + + @property + def list_size(self) -> int: ... + + +class DictionaryMemo(_Weakrefable): + ... + + +_IndexT = TypeVar( + "_IndexT", + UInt8Type, + Int8Type, + UInt16Type, + Int16Type, + UInt32Type, + Int32Type, + UInt64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) + + +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + @property + def ordered(self) -> _Ordered: ... + + @property + def index_type(self) -> _IndexT: ... + + @property + def value_type(self) -> _BasicValueT: ... + + +_K = TypeVar("_K", bound=DataType) + + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + @property + def key_field(self) -> Field[_K]: ... + + @property + def key_type(self) -> _K: ... + + @property + def item_field(self) -> Field[_ValueT]: ... + + @property + def item_type(self) -> _ValueT: ... + + @property + def keys_sorted(self) -> _Ordered: ... + + +_Size = TypeVar("_Size", default=int) + + +class StructType(DataType): + def get_field_index(self, name: str) -> int: ... + + def field(self, i: int | str) -> Field: ... + + def get_all_field_indices(self, name: str) -> list[int]: ... + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Field]: ... + + __getitem__ = field + @property + def names(self) -> list[str]: ... + + @property + def fields(self) -> list[Field]: ... 
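A sketch of how the parametrized type classes above appear at runtime; the instances come from the factory functions stubbed further below:

import pyarrow as pa

lst = pa.list_(pa.int32())                               # ListType
dct = pa.dictionary(pa.int8(), pa.string())              # DictionaryType
st = pa.struct([("a", pa.int64()), ("b", pa.string())])  # StructType
assert lst.value_type == pa.int32()
assert dct.index_type == pa.int8()
assert st.field("a").type == pa.int64()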
+ + +class UnionType(DataType): + @property + def mode(self) -> Literal["sparse", "dense"]: ... + + @property + def type_codes(self) -> list[int]: ... + + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Field]: ... + + def field(self, i: int) -> Field: ... + + __getitem__ = field + + +class SparseUnionType(UnionType): + @property + def mode(self) -> Literal["sparse"]: ... + + +class DenseUnionType(UnionType): + @property + def mode(self) -> Literal["dense"]: ... + + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + + +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _BasicValueT: ... + + +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + + +class BaseExtensionType(DataType): + def __arrow_ext_class__(self) -> type[ExtensionArray]: ... + + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: ... + + @property + def extension_name(self) -> str: ... + + @property + def storage_type(self) -> DataType: ... + + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + + +class ExtensionType(BaseExtensionType): + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + + def __arrow_ext_serialize__(self) -> bytes: ... + + @classmethod + def __arrow_ext_deserialize__( + cls, storage_type: DataType, serialized: bytes) -> Self: ... + + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + @property + def value_type(self) -> _ValueT: ... + + @property + def shape(self) -> list[int]: ... + + @property + def dim_names(self) -> list[str] | None: ... + + @property + def permutation(self) -> list[int] | None: ... + + +class Bool8Type(BaseExtensionType): + ... + + +class UuidType(BaseExtensionType): + ... + + +class JsonType(BaseExtensionType): + ... + + +class OpaqueType(BaseExtensionType): + @property + def type_name(self) -> str: ... + + @property + def vendor_name(self) -> str: ... + + +class UnknownExtensionType(ExtensionType): + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + + +def register_extension_type(ext_type: ExtensionType) -> None: ... + + +def unregister_extension_type(type_name: str) -> None: ... + + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + def __init__( + self, __arg0__: Mapping[str | bytes, str | bytes] + | Iterable[tuple[str, str]] + | KeyValueMetadata + | None = None, **kwargs: str + ) -> None: ... + + def equals(self, other: KeyValueMetadata) -> bool: ... + + def __len__(self) -> int: ... + + def __contains__(self, /, __key: object) -> bool: ... # type: ignore[override] + + def __getitem__(self, /, __key: Any) -> Any: ... # type: ignore[override] + + def __iter__(self) -> Iterator[bytes]: ... + + def get_all(self, key: str) -> list[bytes]: ... + + def to_dict(self) -> dict[bytes, bytes]: ... + + +class Field(_Weakrefable, Generic[_DataTypeT]): + def equals(self, other: Field, check_metadata: bool = False) -> bool: ... + + def __hash__(self) -> int: ... + + @property + def nullable(self) -> bool: ... + + @property + def name(self) -> str: ... + + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + + @property + def type(self) -> _DataTypeT: ... + def with_metadata(self, metadata: dict[bytes | str, bytes | str] | + Mapping[bytes | str, bytes | str] | Any) -> Self: ... + + def remove_metadata(self) -> Self: ... + + def with_type(self, new_type: DataType) -> Field: ... 
+ + def with_name(self, name: str) -> Self: ... + + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: ... + + def flatten(self) -> list[Field]: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: ... + + def __arrow_c_schema__(self) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: ... + + +class Schema(_Weakrefable): + def __len__(self) -> int: ... + + def __getitem__(self, key: str | int) -> Field: ... + + _field = __getitem__ + def __iter__(self) -> Iterator[Field]: ... + + def __hash__(self) -> int: ... + + def __sizeof__(self) -> int: ... + @property + def pandas_metadata(self) -> dict: ... + + @property + def names(self) -> list[str]: ... + + @property + def types(self) -> list[DataType]: ... + + @property + def metadata(self) -> dict[bytes, bytes]: ... + + def empty_table(self) -> Table: ... + + def equals(self, other: Schema, check_metadata: bool = False) -> bool: ... + + @classmethod + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | + None = None) -> Schema: ... + + def field(self, i: int | str | bytes) -> Field: ... + + @deprecated("Use 'field' instead") + def field_by_name(self, name: str) -> Field: ... + + def get_field_index(self, name: str) -> int: ... + + def get_all_field_indices(self, name: str) -> list[int]: ... + + def append(self, field: Field) -> Schema: ... + + def insert(self, i: int, field: Field) -> Schema: ... + + def remove(self, i: int) -> Schema: ... + + def set(self, i: int, field: Field) -> Schema: ... + + @deprecated("Use 'with_metadata' instead") + def add_metadata(self, metadata: dict) -> Schema: ... + + def with_metadata(self, metadata: dict) -> Schema: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def remove_metadata(self) -> Schema: ... + + def to_string( + self, + truncate_metadata: bool = True, + show_field_metadata: bool = True, + show_schema_metadata: bool = True, + element_size_limit: int | None = None, + ) -> str: ... + + def _export_to_c(self, out_ptr: int) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int) -> Schema: ... + + def __arrow_c_schema__(self) -> Any: ... + + @staticmethod + def _import_from_c_capsule(schema: Any) -> Schema: ... + + +def unify_schemas( + schemas: Sequence[Schema], + *, + promote_options: Literal["default", "permissive"] = "default" +) -> Schema: ... + + +def field( + name: SupportArrowSchema | str | Any, type: _DataTypeT | str | None = None, + nullable: bool = ..., + metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT] | Field[Any]: ... + + +def null() -> NullType: ... + + +def bool_() -> BoolType: ... + + +def uint8() -> UInt8Type: ... + + +def int8() -> Int8Type: ... + + +def uint16() -> UInt16Type: ... + + +def int16() -> Int16Type: ... + + +def uint32() -> UInt32Type: ... + + +def int32() -> Int32Type: ... + + +def int64() -> Int64Type: ... + + +def uint64() -> UInt64Type: ... + + +def timestamp( + unit: _Unit | str, tz: _Tz | None = None) -> TimestampType[_Unit, _Tz]: ... + + +def time32(unit: _Time32Unit | str) -> Time32Type[_Time32Unit]: ... + + +def time64(unit: _Time64Unit | str) -> Time64Type[_Time64Unit]: ... + + +def duration(unit: _Unit | str) -> DurationType[_Unit]: ... + + +def month_day_nano_interval() -> MonthDayNanoIntervalType: ... + + +def date32() -> Date32Type: ... + + +def date64() -> Date64Type: ... + + +def float16() -> Float16Type: ... + + +def float32() -> Float32Type: ... 
+ + +def float64() -> Float64Type: ... + + +def decimal32(precision: _Precision, scale: _Scale | + None = None) -> Decimal32Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal64(precision: _Precision, scale: _Scale | + None = None) -> Decimal64Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal128(precision: _Precision, scale: _Scale | + None = None) -> Decimal128Type[_Precision, _Scale | Literal[0]]: ... + + +def decimal256(precision: _Precision, scale: _Scale | + None = None) -> Decimal256Type[_Precision, _Scale | Literal[0]]: ... + + +def string() -> StringType: ... + + +utf8 = string + + +def binary(length: Literal[-1] | int = ...) -> BinaryType | FixedSizeBinaryType: ... + + +def large_binary() -> LargeBinaryType: ... + + +def large_string() -> LargeStringType: ... + + +large_utf8 = large_string + + +def binary_view() -> BinaryViewType: ... + + +def string_view() -> StringViewType: ... + + +def list_( + value_type: _DataTypeT | Field[_DataTypeT] | None = None, + list_size: Literal[-1] | _Size | None = None +) -> ListType[_DataTypeT] | FixedSizeListType[_DataTypeT, _Size]: ... + + +def large_list(value_type: _DataTypeT | + Field[_DataTypeT] | None = None) -> LargeListType[_DataTypeT]: ... + + +def list_view(value_type: _DataTypeT | + Field[_DataTypeT] | None = None) -> ListViewType[_DataTypeT]: ... + + +def large_list_view( + value_type: _DataTypeT | Field[_DataTypeT] | None = None +) -> LargeListViewType[_DataTypeT]: ... + + +def map_( + key_type: _K | Field | str | None = None, + item_type: _ValueT | Field | str | None = None, + keys_sorted: bool | None = None +) -> MapType[_K, _ValueT, Literal[False]]: ... + + +def dictionary( + index_type: _IndexT | str, + value_type: _BasicValueT | str, + ordered: _Ordered | None = None +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... + + +def struct( + fields: Iterable[ + Field[Any] + | tuple[str, Field[Any] | None] + | tuple[str, DataType | None] + ] | Mapping[str, Field[Any] | DataType | None], +) -> StructType: ... + + +def sparse_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> SparseUnionType: ... + + +def dense_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> DenseUnionType: ... + + +def union( + child_fields: list[Field[Any]], mode: Literal["sparse" | "dense"] | int | str, + type_codes: list[int] | None = None) -> SparseUnionType | DenseUnionType: ... + + +def run_end_encoded( + run_end_type: _RunEndType | str | None, value_type: _BasicValueT | str | None +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: ... + + +def json_(storage_type: DataType = ...) -> JsonType: ... + + +def uuid() -> UuidType: ... + + +def fixed_shape_tensor( + value_type: _ValueT, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + permutation: Sequence[int] | None = None, +) -> FixedShapeTensorType[_ValueT]: ... + + +def bool8() -> Bool8Type: ... + + +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: ... + + +def type_for_alias(name: Any) -> DataType: ... + + +def schema( + fields: ( + Iterable[Field[Any]] + | Iterable[tuple[str, DataType | str | None]] + | Mapping[Any, DataType | str | None] + ), + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] | None = None, +) -> Schema: ... + + +def from_numpy_dtype(dtype: np.dtype[Any] | type | str) -> DataType: ... 
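A sketch exercising the factory functions and field()/schema() typed above:

import pyarrow as pa

sch = pa.schema(
    [("id", pa.int64()), ("name", pa.string()), ("scores", pa.list_(pa.float32()))],
    metadata={"origin": "stub example"},
)
sch = sch.set(0, pa.field("id", pa.int64(), nullable=False))  # Schema.set -> new Schema
assert sch.field("id").nullable is False
assert pa.type_for_alias("int64") == pa.int64()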
+ + +__all__ = [ + "_Weakrefable", + "_Metadata", + "DataType", + "_BasicDataType", + "NullType", + "BoolType", + "UInt8Type", + "Int8Type", + "UInt16Type", + "Int16Type", + "UInt32Type", + "Int32Type", + "UInt64Type", + "Int64Type", + "Float16Type", + "Float32Type", + "Float64Type", + "Date32Type", + "Date64Type", + "MonthDayNanoIntervalType", + "StringType", + "LargeStringType", + "StringViewType", + "BinaryType", + "LargeBinaryType", + "BinaryViewType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "ListType", + "LargeListType", + "ListViewType", + "LargeListViewType", + "FixedSizeListType", + "DictionaryMemo", + "DictionaryType", + "MapType", + "StructType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "RunEndEncodedType", + "BaseExtensionType", + "ExtensionType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "KeyValueMetadata", + "Field", + "Schema", + "unify_schemas", + "field", + "null", + "bool_", + "uint8", + "int8", + "uint16", + "int16", + "uint32", + "int32", + "int64", + "uint64", + "timestamp", + "time32", + "time64", + "duration", + "month_day_nano_interval", + "date32", + "date64", + "float16", + "float32", + "float64", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "string", + "utf8", + "binary", + "large_binary", + "large_string", + "large_utf8", + "binary_view", + "string_view", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "dictionary", + "struct", + "sparse_union", + "dense_union", + "union", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "type_for_alias", + "schema", + "from_numpy_dtype", + "_Unit", + "_Tz", + "_Time32Unit", + "_Time64Unit", + "_DataTypeT", +] diff --git a/python/pyarrow-stubs/pyarrow/array.pyi b/python/pyarrow-stubs/pyarrow/array.pyi new file mode 100644 index 00000000000..df01608e10c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/array.pyi @@ -0,0 +1,891 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys + +from collections.abc import Iterable, Iterator, Sequence + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import ( + Any, + Generic, + Literal, + TypeVar, +) + +import numpy as np +import pandas as pd + +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportPyArrowArray, +) +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + Tensor, + _Weakrefable, +) +from typing_extensions import deprecated +import builtins + +from .scalar import ( # noqa: F401 + BinaryScalar, + BinaryViewScalar, + BooleanScalar, + Date32Scalar, + Date64Scalar, + DictionaryScalar, + DoubleScalar, + DurationScalar, + ExtensionScalar, + FixedSizeBinaryScalar, + FixedSizeListScalar, + FloatScalar, + HalfFloatScalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + Int8Scalar, + LargeBinaryScalar, + LargeListScalar, + LargeStringScalar, + ListScalar, + ListViewScalar, + MapScalar, + MonthDayNanoIntervalScalar, + NullScalar, + RunEndEncodedScalar, + Scalar, + StringScalar, + StringViewScalar, + StructScalar, + Time32Scalar, + Time64Scalar, + TimestampScalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + UInt8Scalar, + UnionScalar, +) +from .device import DeviceAllocationType +from ._types import ( # noqa: F401 + BaseExtensionType, + BinaryType, + DataType, + Field, + Float64Type, + Int64Type, + MapType, + StringType, + StructType, + _AsPyType, + _BasicDataType, + _BasicValueT, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) +from ._stubs_typing import NullableCollection + + +def array( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray + | SupportArrowDeviceArray | SupportPyArrowArray, + type: Any | None = None, + mask: Mask | pd.Series[bool] | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def asarray( + values: NullableCollection[Any] | Iterable[Any] | SupportArrowArray + | SupportArrowDeviceArray, + type: _DataTypeT | Any | None = None, +) -> Array[Scalar[_DataTypeT]] | ArrayLike: ... + + +def nulls( + size: int, + type: Any | None = None, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def repeat( + value: Any, + size: int, + memory_pool: MemoryPool | None = None, +) -> ArrayLike: ... + + +def infer_type(values: Iterable[Any], mask: Mask | None = None, + from_pandas: bool = False) -> DataType: ... + + +class ArrayStatistics(_Weakrefable): + @property + def null_count(self) -> int | None: ... + + @property + def distinct_count(self) -> int | None: ... + + @property + def is_distinct_count_exact(self) -> bool | None: ... + + @property + def min(self) -> Any | None: ... + + @property + def is_min_exact(self) -> bool | None: ... + + @property + def max(self) -> Any | None: ... + + @property + def is_max_exact(self) -> bool | None: ... 
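A sketch of the construction helpers typed above (array, nulls, infer_type):

import pyarrow as pa

arr = pa.array([1, 2, None], type=pa.int64())    # Int64Array with one null
assert arr.null_count == 1
assert pa.infer_type([1.0, None]) == pa.float64()
empty = pa.nulls(3)                              # NullArray of length 3
assert len(empty) == 3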
+ + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | tuple | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Any = None, # Callable[[DataType], ExtensionDtype | None] | None + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: ... + + +_CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + + +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + def as_py(self) -> list[Any]: ... + + def diff(self, other: Self) -> str: ... + + # Private attribute used internally (e.g., for column names in batches) + _name: str | None + + def cast( + self, + target_type: _CastAs | str, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: ... + + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: ... + + def sum(self, **kwargs) -> _Scalar_co: ... + + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: ... + + def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray: ... + + def value_counts(self) -> StructArray: ... + + @staticmethod + def from_pandas( + obj: pd.Series | np.ndarray | ArrayLike, + *, + mask: Mask | None = None, + type: _DataTypeT | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_DataTypeT]] | Array[Scalar]: ... + + @staticmethod + def from_buffers( + type: _DataTypeT, + length: int, + buffers: Sequence[Buffer | None], + null_count: int = -1, + offset=0, + children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None, + ) -> Array[Scalar[_DataTypeT]]: ... + + @property + def null_count(self) -> int: ... + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def equals(self, other: Array | Any) -> bool: ... + + def __len__(self) -> int: ... + + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: ... + + def is_nan(self) -> BooleanArray: ... + + def is_valid(self) -> BooleanArray: ... + + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: ... + + def __getitem__(self, key: int | builtins.slice) -> _Scalar_co | Self: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: ... 
+ + def index( + self: Array[_ScalarT] | Array[Scalar[_BasicDataType[_AsPyType]]], + value: _ScalarT | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def to_numpy(self, zero_copy_only: bool = True, + writable: bool = False) -> np.ndarray: ... + + def to_pylist( + self, + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[Any]: ... + + tolist = to_pylist + def validate(self, *, full: bool = False) -> None: ... + + @property + def offset(self) -> int: ... + + def buffers(self) -> list[Buffer | None]: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: ... + + def __dlpack_device__(self) -> tuple[int, int]: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def statistics(self) -> ArrayStatistics | None: ... + + +class NullArray(Array[NullScalar]): + ... + + +class BooleanArray(Array[BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + + +class NumericArray(Array[_ScalarT]): + ... + + +class IntegerArray(NumericArray[_ScalarT]): + ... + + +class FloatingPointArray(NumericArray[_ScalarT]): + ... + + +class Int8Array(IntegerArray[Int8Scalar]): + ... + + +class UInt8Array(IntegerArray[UInt8Scalar]): + ... + + +class Int16Array(IntegerArray[Int16Scalar]): + ... + + +class UInt16Array(IntegerArray[UInt16Scalar]): + ... + + +class Int32Array(IntegerArray[Int32Scalar]): + ... + + +class UInt32Array(IntegerArray[UInt32Scalar]): + ... + + +class Int64Array(IntegerArray[Int64Scalar]): + ... + + +class UInt64Array(IntegerArray[UInt64Scalar]): + ... + + +class Date32Array(NumericArray[Date32Scalar]): + ... + + +class Date64Array(NumericArray[Date64Scalar]): + ... + + +class TimestampArray(NumericArray[TimestampScalar[_Unit, _Tz]]): + ... + + +class Time32Array(NumericArray[Time32Scalar[_Time32Unit]]): + ... + + +class Time64Array(NumericArray[Time64Scalar[_Time64Unit]]): + ... + + +class DurationArray(NumericArray[DurationScalar[_Unit]]): + ... + + +class MonthDayNanoIntervalArray(Array[MonthDayNanoIntervalScalar]): + ... + + +class HalfFloatArray(FloatingPointArray[HalfFloatScalar]): + ... + + +class FloatArray(FloatingPointArray[FloatScalar]): + ... + + +class DoubleArray(FloatingPointArray[DoubleScalar]): + ... + + +class FixedSizeBinaryArray(Array[FixedSizeBinaryScalar]): + ... + + +class Decimal32Array(FixedSizeBinaryArray): + ... + + +class Decimal64Array(FixedSizeBinaryArray): + ... 
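To show how the Array[...] generic above is meant to flow through common operations, a minimal sketch (illustrative only); the commented types follow from the signatures above:

import pyarrow as pa

ints = pa.array([3, 1, 2])
floats = ints.cast(pa.float64())                   # Array[Scalar[Float64Type]] per cast()
picked = ints.take(pa.array([0, 2]))               # take() and filter() return Self,
kept = ints.filter(pa.array([True, False, True]))  # so the concrete array type is kept
as_python = ints.to_pylist()                       # list[Any]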
+ + +class Decimal128Array(FixedSizeBinaryArray): + ... + + +class Decimal256Array(FixedSizeBinaryArray): + ... + + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + + def value_parent_indices(self) -> Int64Array: ... + + def value_lengths(self) -> Int32Array: ... + + +class ListArray(BaseListArray[_ScalarT]): + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int] | list[int | None], + values: Array[Scalar[_DataTypeT]] | list[int] | list[float] | list[str] + | list[bytes] | list, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> (ListArray[ListScalar[ + _DataTypeT | Int64Type | Float64Type | StringType | BinaryType + ]] | ListArray): ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int32Array: ... + + +class LargeListArray(BaseListArray[LargeListScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array | list[int] | list[int | None], + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int64Array: ... + + +class ListViewArray(BaseListArray[ListViewScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int32Array: ... + + @property + def sizes(self) -> Int32Array: ... + + +class LargeListViewArray(BaseListArray[LargeListScalar[_DataTypeT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]] | Array, + *, + type: _DataTypeT | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + + @property + def values(self) -> Array: ... + + @property + def offsets(self) -> Int64Array: ... + + @property + def sizes(self) -> Int64Array: ... + + +class FixedSizeListArray(BaseListArray[FixedSizeListScalar[_DataTypeT, _Size]]): + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + list_size: _Size | None = None, + *, + type: DataType | None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size | None]: ... + + @property + def values(self) -> BaseListArray[ListScalar[_DataTypeT]]: ... + + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + + +class MapArray(BaseListArray[MapScalar[_MapKeyT, _MapItemT]]): + @classmethod + def from_arrays( + cls, + offsets: Int64Array | list[int] | None, + keys: Array[Scalar[_MapKeyT]] | np.ndarray | list | None = None, + items: Array[Scalar[_MapItemT]] | np.ndarray | list | None = None, + values: Array | DataType | None = None, + *, + type: DataType | None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + + @property + def keys(self) -> Array: ... + + @property + def items(self) -> Array: ... + + +class UnionArray(Array[UnionScalar]): + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: ... + + def field(self, pos: int) -> Array: ... 
+ + @property + def type_codes(self) -> Int8Array: ... + + @property + def offsets(self) -> Int32Array: ... + + @staticmethod + def from_dense( + types: Int8Array, + value_offsets: Int32Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | list[int] | None = None, + ) -> UnionArray: ... + + @staticmethod + def from_sparse( + types: Int8Array, + children: NullableCollection[Array], + field_names: list[str] | None = None, + type_codes: Int8Array | list[int] | None = None, + ) -> UnionArray: ... + + +class StringArray(Array[StringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + + +class LargeStringArray(Array[LargeStringScalar]): + @staticmethod + def from_buffers( # type: ignore[override] + length: int, + value_offsets: Buffer, + data: Buffer, + null_bitmap: Buffer | None = None, + null_count: int | None = -1, + offset: int | None = 0, + ) -> StringArray: ... + + +class StringViewArray(Array[StringViewScalar]): + ... + + +class BinaryArray(Array[BinaryScalar]): + @property + def total_values_length(self) -> int: ... + + +class LargeBinaryArray(Array[LargeBinaryScalar]): + @property + def total_values_length(self) -> int: ... + + +class BinaryViewArray(Array[BinaryViewScalar]): + ... + + +class DictionaryArray(Array[DictionaryScalar[_IndexT, _BasicValueT]]): + def dictionary_encode(self) -> Self: ... # type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: ... + + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: ... + + @staticmethod + def from_arrays( + indices: Indices | Sequence[int | None], + dictionary: Array | np.ndarray | pd.Series | list[Any], + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: ... + + +class StructArray(Array[StructScalar]): + def field(self, index: int | str) -> Array: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: ... + + @staticmethod + def from_arrays( + arrays: Iterable[Array | np.ndarray | list], + names: Sequence[str] | list[Field] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: StructType | None = None, + ) -> StructArray: ... + + def sort(self, order: Order = "ascending", by: str | + None = None, **kwargs) -> StructArray: ... + + +class RunEndEncodedArray(Array[RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + @staticmethod + def from_arrays( + run_ends: Int16Array | Int32Array | Int64Array | list[int], + values: Array | list[Any], type: DataType | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... + + @staticmethod + def from_buffers( # type: ignore[override] + type: DataType, + length: int, + buffers: list[Buffer] | list[None], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | list[list[int]] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: ... 
+ + @property + def run_ends(self) -> Array[Scalar[_RunEndType]]: ... + + @property + def values(self) -> Array[Scalar[_BasicValueT]]: ... + + def find_physical_offset(self) -> int: ... + + def find_physical_length(self) -> int: ... + + +_ArrayT = TypeVar("_ArrayT", bound=Array) + + +class ExtensionArray(Array[ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, + storage: _ArrayT) -> ExtensionArray[_ArrayT]: ... + + +class JsonArray(ExtensionArray[_ArrayT]): + ... + + +class UuidArray(ExtensionArray[_ArrayT]): + ... + + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + def to_numpy_ndarray(self) -> np.ndarray: ... + + def to_tensor(self) -> Tensor: ... + + @classmethod + def from_numpy_ndarray( + cls, obj: np.ndarray, + dim_names: list[str] | tuple[str, ...] | None = None + ) -> Self: ... + + +class OpaqueArray(ExtensionArray[_ArrayT]): + ... + + +class Bool8Array(ExtensionArray): + def to_numpy(self, zero_copy_only: bool = ..., + writable: bool = ...) -> np.ndarray: ... + + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: ... # type: ignore[override] + + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: ... + + +def concat_arrays(arrays: Iterable[_ArrayT], + memory_pool: MemoryPool | None = None) -> _ArrayT: ... + + +def _empty_array(type: _DataTypeT) -> Array[Scalar[_DataTypeT]]: ... + + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", + "_CastAs", +] diff --git a/python/pyarrow-stubs/pyarrow/builder.pyi b/python/pyarrow-stubs/pyarrow/builder.pyi new file mode 100644 index 00000000000..9001d9835b6 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/builder.pyi @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + + +class StringBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | float | None): ... + + def append_values(self, values: Iterable[str | bytes | float | None]): ... + + def finish(self) -> StringArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +class StringViewBuilder(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | float | None): ... + + def append_values(self, values: Iterable[str | bytes | float | None]): ... + + def finish(self) -> StringViewArray: ... + + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/pyarrow-stubs/pyarrow/cffi.pyi b/python/pyarrow-stubs/pyarrow/cffi.pyi new file mode 100644 index 00000000000..e4f077d7155 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/cffi.pyi @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/pyarrow-stubs/pyarrow/compat.pyi b/python/pyarrow-stubs/pyarrow/compat.pyi new file mode 100644 index 00000000000..30e3ec13e0d --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/compat.pyi @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... 
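A small usage sketch for the builder stubs above (example only; the import path is an assumption: at runtime StringBuilder is exposed from pyarrow.lib, while this stub package types it under pyarrow/builder.pyi):

from pyarrow.lib import StringBuilder  # assumed runtime location of the class stubbed above

builder = StringBuilder()
builder.append("hello")
builder.append_values(["big", None, "data"])  # None becomes a null entry
arr = builder.finish()                        # typed as StringArray
print(len(arr), arr.null_count)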
+ + +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/pyarrow-stubs/pyarrow/compute.pyi b/python/pyarrow-stubs/pyarrow/compute.pyi new file mode 100644 index 00000000000..97e08b9b107 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/compute.pyi @@ -0,0 +1,1822 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Iterable, Sequence, Mapping +from typing import Literal, TypeAlias, TypeVar, Any, ParamSpec + +import numpy as np + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions # noqa: F401 +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import ( # noqa: F401 + DictionaryEncodeOptions as DictionaryEncodeOptions) +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ( # noqa: F401 + ExtractRegexSpanOptions as ExtractRegexSpanOptions) +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import FunctionOptions as FunctionOptions # noqa: F401 +from pyarrow._compute import IndexOptions as IndexOptions # noqa: F401 +from pyarrow._compute import JoinOptions as JoinOptions # noqa: F401 +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import 
ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions # noqa: F401 +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions +from pyarrow._compute import ZeroFillOptions as ZeroFillOptions + +# Functions +from pyarrow._compute import call_function as call_function # noqa: F401 +from pyarrow._compute import ( # noqa: F401 + call_tabular_function as call_tabular_function) +from pyarrow._compute import get_function as get_function # noqa: F401 +from pyarrow._compute import list_functions as list_functions # noqa: F401 +from pyarrow._compute import ( # noqa: F401 + register_scalar_function as register_scalar_function) +from pyarrow._compute import ( # noqa: F401 + register_aggregate_function as register_aggregate_function) +from pyarrow._compute import ( # noqa: F401 + register_vector_function as register_vector_function) +from pyarrow._compute import ( # noqa: F401 + register_tabular_function as register_tabular_function) + +# Function and Kernel classes +from pyarrow._compute import Function as Function # noqa: F401 +from pyarrow._compute import Kernel as Kernel # noqa: F401 +from pyarrow._compute import ScalarFunction as ScalarFunction # noqa: F401 +from pyarrow._compute import ScalarKernel as ScalarKernel # noqa: F401 +from pyarrow._compute import VectorFunction as VectorFunction # noqa: F401 +from pyarrow._compute import VectorKernel as VectorKernel # noqa: F401 +from pyarrow._compute import ( # noqa: F401 + ScalarAggregateFunction as ScalarAggregateFunction) +from pyarrow._compute import ( # noqa: F401 + ScalarAggregateKernel as ScalarAggregateKernel) +from pyarrow._compute import ( # noqa: F401 + HashAggregateFunction as HashAggregateFunction) +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel # noqa: F401 + +# Udf + +from pyarrow._compute import _Order, _Placement +from pyarrow._stubs_typing import ArrayLike, ScalarLike, PyScalar +from pyarrow._types import _RunEndType +from . import lib + +_P = ParamSpec("_P") +_R = TypeVar("_R") + + +class _ExprComparable(Expression): + def __ge__(self, other: Any) -> Expression: ... 
+ def __le__(self, other: Any) -> Expression: ... + def __gt__(self, other: Any) -> Expression: ... + def __lt__(self, other: Any) -> Expression: ... + + +def field(*name_or_index: str | bytes | tuple[str | int, ...] | int) -> Expression: ... + + +def scalar(value: PyScalar | lib.Scalar[Any] | Mapping) -> Expression: ... + + +def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... + + +# ============= compute functions ============= +_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) +_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) +_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) +_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | + lib.Scalar | lib.ChunkedArray) +ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] +ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT + +SignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.Int8Type] + | lib.Scalar[lib.Int16Type] + | lib.Scalar[lib.Int32Type] + | lib.Scalar[lib.Int64Type] +) +UnsignedIntegerScalar: TypeAlias = ( + lib.Scalar[lib.UInt8Type] + | lib.Scalar[lib.UInt16Type] + | lib.Scalar[lib.UInt32Type] + | lib.Scalar[lib.UInt64Type] +) +IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar +FloatScalar: TypeAlias = (lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] + | lib.Scalar[lib.Float64Type]) +DecimalScalar: TypeAlias = ( + lib.Scalar[lib.Decimal32Type] + | lib.Scalar[lib.Decimal64Type] + | lib.Scalar[lib.Decimal128Type] + | lib.Scalar[lib.Decimal256Type] +) +NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar +NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar +BinaryScalar: TypeAlias = ( + lib.Scalar[lib.BinaryType] + | lib.Scalar[lib.LargeBinaryType] + | lib.Scalar[lib.FixedSizeBinaryType] +) +StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] +StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar +_ListScalar: TypeAlias = ( + lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] +) +_LargeListScalar: TypeAlias = ( + lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT] +) +ListScalar: TypeAlias = ( + lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] + | lib.MonthDayNanoIntervalScalar +) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + +_NumericOrTemporalScalarT = TypeVar( + "_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] +_NumericOrDurationArrayT = TypeVar( + "_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] +_NumericOrTemporalArrayT = TypeVar( + "_NumericOrTemporalArrayT", bound=NumericOrTemporalArray)
+BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] + +# =============================== 1. Aggregation =============================== + + +def array_take( + array: _ArrayT | lib.Scalar | lib.Table | Expression, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.UInt64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar] + | np.ndarray + | Expression, + /, + *, + boundscheck: bool | None = None, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT | Expression: ... + + +# ========================= 1.1 functions ========================= + + +def all( + array: lib.BooleanScalar | BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... + + +any = _clone_signature(all) + + +def approximate_median( + array: NumericScalar | NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def count( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... + + +def count_distinct( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... + + +def first( + array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... 
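A sketch of the aggregation signatures above in use (example code); the commented result types follow from the annotations:

import pyarrow as pa
import pyarrow.compute as pc

flags = pa.array([True, True, None])
pc.all(flags, skip_nulls=True)            # BooleanScalar
pc.count(flags, mode="only_valid")        # Int64Scalar
pc.count_distinct(pa.array([1, 1, 2]))    # Int64Scalar
pc.first(pa.array([5, 6, 7]))             # Int64Scalar via the _ScalarT type variable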
+ + +def first_last( + array: lib.Array[Any] | lib.ChunkedArray[Any] | list[Any], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | Mapping[Any, Any] | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... + + +def index( + data: lib.Array[Any] | lib.ChunkedArray[Any], + value: ScalarLike, + start: int | None = None, + end: int | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... + + +last = _clone_signature(first) +max = _clone_signature(first) +min = _clone_signature(first) +min_max = _clone_signature(first_last) + + +def mean( + array: FloatScalar | FloatArray + | lib.NumericArray[lib.Scalar[Any]] + | lib.ChunkedArray[lib.Scalar[Any]] + | lib.Scalar[Any], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[Any]: ... + + +def mode( + array: NumericScalar | NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... + + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: ... + + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float | Sequence[float] = 0.5, + *, + interpolation: Literal["linear", "lower", + "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT] | lib.Expression, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | lib.Expression: ... + + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def variance( + array: NumericScalar | NumericArray | ArrayLike, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def winsorize( + array: _NumericArrayT, + /, + lower_limit: float = 0.0, + upper_limit: float = 1.0, + *, + options: WinsorizeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... + + +def skew( + array: NumericScalar | NumericArray | ArrayLike, + /, + *, + skip_nulls: bool = True, + biased: bool = True, + min_count: int = 0, + options: SkewOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... 
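The _clone_signature aliases (last, min, max, min_max) reuse the annotated signatures of first/first_last, so a checker sees the same element-type preservation; a short illustrative sketch:

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1.5, 2.5, None])
pc.sum(x)                        # DoubleScalar: _NumericScalarT keeps the element type
pc.min_max(x)                    # StructScalar with "min"/"max" fields
pc.quantile(x, q=[0.25, 0.75])   # DoubleArray
pc.stddev(x, ddof=1)             # DoubleScalar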
+ + +def kurtosis( + array: NumericScalar | NumericArray | ArrayLike, + /, + *, + skip_nulls: bool = True, + biased: bool = True, + min_count: int = 0, + options: SkewOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... + + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... + + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: ... + + +# ========================= 2. Element-wise (“scalar”) functions ========= + +# ========================= 2.1 Arithmetic ========================= +def abs(x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + _NumericOrDurationT | _NumericOrDurationArrayT | Expression): ... + + +abs_checked = _clone_signature(abs) + + +def add( + x: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | ArrayLike | int | Expression), + y: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | ArrayLike | int | Expression), + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + + +add_checked = _clone_signature(add) + + +def divide( + x: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | Expression), + y: (_NumericOrTemporalScalarT | NumericOrTemporalScalar | _NumericOrTemporalArrayT + | Expression), + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT | _NumericOrTemporalArrayT | Expression: ... + + +divide_checked = _clone_signature(divide) + + +def exp( + exponent: _FloatArrayT | ArrayOrChunkedArray[NonFloatNumericScalar] | _FloatScalarT + | NonFloatNumericScalar | lib.DoubleScalar | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + _FloatArrayT | lib.DoubleArray | _FloatScalarT | lib.DoubleScalar | Expression): ... + + +multiply = _clone_signature(add) +multiply_checked = _clone_signature(add) + + +def negate( + x: _NumericOrDurationT | _NumericOrDurationArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + _NumericOrDurationT | _NumericOrDurationArrayT | Expression): ... + + +negate_checked = _clone_signature(negate) + + +def power( + base: _NumericScalarT | Expression | _NumericArrayT | NumericScalar, + exponent: _NumericScalarT | Expression | _NumericArrayT | NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +power_checked = _clone_signature(power) + + +def sign( + x: NumericOrDurationArray | NumericOrDurationScalar | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] + | lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar | Expression +): ... + + +def sqrt( + x: NumericArray | NumericScalar | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + FloatArray | FloatScalar | Expression): ... 
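The arithmetic kernels above are annotated so the scalar/array type variable survives the call; a minimal sketch (mine, not from the patch):

import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([1, 2, 3])
b = pa.array([10, 20, 30])
pc.add(a, b)               # result keeps the numeric array type (_NumericOrTemporalArrayT)
pc.negate(a)               # same array type as `a`
pc.power(a, pa.scalar(2))  # exponent given as a pyarrow scalar
pc.sqrt(a)                 # promoted to a floating-point result per the annotation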
+ + +sqrt_checked = _clone_signature(sqrt) + +subtract = _clone_signature(add) +subtract_checked = _clone_signature(add) + +# ========================= 2.1 Bit-wise functions ========================= + + +def bit_wise_and( + x: _NumericScalarT | _NumericArrayT | NumericScalar | Expression + | ArrayOrChunkedArray[NumericScalar], + y: _NumericScalarT | _NumericArrayT | NumericScalar | Expression + | ArrayOrChunkedArray[NumericScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +def bit_wise_not( + x: _NumericScalarT | _NumericArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +bit_wise_or = _clone_signature(bit_wise_and) +bit_wise_xor = _clone_signature(bit_wise_and) +shift_left = _clone_signature(bit_wise_and) +shift_left_checked = _clone_signature(bit_wise_and) +shift_right = _clone_signature(bit_wise_and) +shift_right_checked = _clone_signature(bit_wise_and) + +# ========================= 2.2 Rounding functions ========================= + + +def ceil( + x: _FloatScalarT | _FloatArrayT | Expression, /, *, memory_pool: lib.MemoryPool | + None = None) -> _FloatScalarT | _FloatArrayT | Expression: ... + + +floor = _clone_signature(ceil) + + +def round( + x: _NumericScalarT | _NumericArrayT | Expression | list, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +def round_to_multiple( + x: _NumericScalarT | _NumericArrayT | list | Expression, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT | _NumericArrayT | Expression: ... + + +def round_binary( + x: _NumericScalarT | _NumericArrayT | float | list | Expression, + s: lib.Int8Scalar + | lib.Int16Scalar + | lib.Int32Scalar + | lib.Int64Scalar + | lib.Scalar + | Iterable + | float + | Expression, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + _NumericScalarT | lib.NumericArray[_NumericScalarT] | _NumericArrayT + | Expression): ... + + +trunc = _clone_signature(ceil) + +# ========================= 2.3 Logarithmic functions ========================= + + +def ln( + x: FloatScalar | FloatArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> ( + lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] | Expression): ... 
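Because round_mode is typed as a Literal above, a misspelled mode is reported by the checker instead of failing at runtime; a quick illustrative sketch:

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([1.234, 2.567])
pc.round(x, ndigits=1, round_mode="half_to_even")   # OK, array type preserved
# pc.round(x, round_mode="half_even")               # rejected by the Literal annotation

bits = pa.array([0b1100, 0b1010])
pc.bit_wise_and(bits, pa.array([0b0110, 0b0110]))   # integer array type preserved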
+ + +ln_checked = _clone_signature(ln) +log10 = _clone_signature(ln) +log10_checked = _clone_signature(ln) +log1p = _clone_signature(ln) +log1p_checked = _clone_signature(ln) +log2 = _clone_signature(ln) +log2_checked = _clone_signature(ln) + + +def logb( + x: FloatScalar | FloatArray | Expression | Any, + b: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] | Expression | Any): ... + + +logb_checked = _clone_signature(logb) + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +acos_checked = _clone_signature(ln) +acosh = _clone_signature(ln) +asin = _clone_signature(ln) +asin_checked = _clone_signature(ln) +asinh = _clone_signature(ln) +atan = _clone_signature(ln) +atanh = _clone_signature(ln) +cos = _clone_signature(ln) +cos_checked = _clone_signature(ln) +cosh = _clone_signature(ln) +sin = _clone_signature(ln) +sin_checked = _clone_signature(ln) +sinh = _clone_signature(ln) +tan = _clone_signature(ln) +tan_checked = _clone_signature(ln) +tanh = _clone_signature(ln) + + +def atan2( + y: FloatScalar | FloatArray | Expression | Any, + x: FloatScalar | FloatArray | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.FloatScalar | lib.DoubleScalar | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] | Expression): ... + + +# ========================= 2.5 Comparisons functions ========================= +def equal( + x: lib.Scalar | lib.Array | lib.ChunkedArray | list | Expression | Any, + y: lib.Scalar | lib.Array | lib.ChunkedArray | list | Expression | Any, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +greater = _clone_signature(equal) +greater_equal = _clone_signature(equal) +less = _clone_signature(equal) +less_equal = _clone_signature(equal) +not_equal = _clone_signature(equal) + + +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT] | Expression | ScalarLike | ArrayLike, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT | Expression | lib.Scalar | lib.Array: ... + + +min_element_wise = _clone_signature(max_element_wise) + +# ========================= 2.6 Logical functions ========================= + + +def and_( + x: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + y: lib.BooleanScalar | BooleanArray | Expression | ScalarOrArray[lib.BooleanScalar], + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.BooleanScalar | lib.BooleanArray | Expression + | ScalarOrArray[lib.BooleanScalar]): ... + + +and_kleene = _clone_signature(and_) +and_not = _clone_signature(and_) +and_not_kleene = _clone_signature(and_) +or_ = _clone_signature(and_) +or_kleene = _clone_signature(and_) +xor = _clone_signature(and_) + + +def invert( + x: lib.BooleanScalar | _BooleanArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | _BooleanArrayT | Expression: ... + + +# ========================= 2.10 String predicates ========================= +def ascii_is_alnum( + strings: StringScalar | StringArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... 
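The comparison and logical kernels above all funnel to BooleanScalar/BooleanArray; a short sketch (example only):

import pyarrow as pa
import pyarrow.compute as pc

a = pa.array([1, 2, 3])
mask = pc.greater(a, pa.scalar(1))              # BooleanArray at runtime
both = pc.and_(mask, pc.less(a, pa.scalar(3)))
pc.invert(both)                                 # still boolean-valued
pc.equal(pa.scalar("x"), pa.scalar("x"))        # BooleanScalar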
+ + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +ascii_is_decimal = _clone_signature(ascii_is_alnum) +ascii_is_lower = _clone_signature(ascii_is_alnum) +ascii_is_printable = _clone_signature(ascii_is_alnum) +ascii_is_space = _clone_signature(ascii_is_alnum) +ascii_is_upper = _clone_signature(ascii_is_alnum) +utf8_is_alnum = _clone_signature(ascii_is_alnum) +utf8_is_alpha = _clone_signature(ascii_is_alnum) +utf8_is_decimal = _clone_signature(ascii_is_alnum) +utf8_is_digit = _clone_signature(ascii_is_alnum) +utf8_is_lower = _clone_signature(ascii_is_alnum) +utf8_is_numeric = _clone_signature(ascii_is_alnum) +utf8_is_printable = _clone_signature(ascii_is_alnum) +utf8_is_space = _clone_signature(ascii_is_alnum) +utf8_is_upper = _clone_signature(ascii_is_alnum) +ascii_is_title = _clone_signature(ascii_is_alnum) +utf8_is_title = _clone_signature(ascii_is_alnum) +string_is_ascii = _clone_signature(ascii_is_alnum) + +# ========================= 2.11 String transforms ========================= + + +def ascii_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_lower = _clone_signature(ascii_capitalize) +ascii_reverse = _clone_signature(ascii_capitalize) +ascii_swapcase = _clone_signature(ascii_capitalize) +ascii_title = _clone_signature(ascii_capitalize) +ascii_upper = _clone_signature(ascii_capitalize) + + +def binary_length( + strings: ScalarOrArray[StringOrBinaryScalar] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array + | Expression +): ... + + +def binary_repeat( + strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ( + _StringOrBinaryScalarT | lib.Array[_StringOrBinaryScalarT] | _StringOrBinaryArrayT + | Expression): ... + + +def binary_replace_slice( + strings: _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + + +def binary_reverse( + strings: _BinaryScalarT | _BinaryArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... + + +def replace_substring( + strings: _StringScalarT | _StringArrayT | Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +replace_substring_regex = _clone_signature(replace_substring) + + +def utf8_capitalize( + strings: _StringScalarT | _StringArrayT | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT | _StringArrayT | Expression: ... + + +def utf8_length( + strings: lib.StringScalar | lib.LargeStringScalar | lib.StringArray + | lib.ChunkedArray[lib.StringScalar] | lib.LargeStringArray + | lib.ChunkedArray[lib.LargeStringScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array + | Expression): ... 
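The string transforms are annotated to hand back the same string/binary flavour they receive (_StringScalarT / _StringArrayT); an illustrative sketch:

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["arrow", "stubs"])
pc.utf8_capitalize(s)                                  # stays a string array
pc.ascii_upper(s)
pc.replace_substring(s, pattern="s", replacement="z")  # same flavour in, same flavour out
pc.utf8_length(s)                                      # Int32 result for string, Int64 for large_string
pc.string_is_ascii(s)                                  # BooleanArray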
+ + +utf8_lower = _clone_signature(utf8_capitalize) + + +def utf8_replace_slice( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +utf8_reverse = _clone_signature(utf8_capitalize) +utf8_swapcase = _clone_signature(utf8_capitalize) +utf8_title = _clone_signature(utf8_capitalize) +utf8_upper = _clone_signature(utf8_capitalize) + +# ========================= 2.12 String padding ========================= + + +def ascii_center( + strings: _StringScalarT | _StringArrayT | Expression, + /, + width: int | None = None, + padding: str = " ", + lean_left_on_odd_padding: bool = True, + *, + options: PadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_lpad = _clone_signature(ascii_center) +ascii_rpad = _clone_signature(ascii_center) +utf8_center = _clone_signature(ascii_center) +utf8_lpad = _clone_signature(ascii_center) +utf8_rpad = _clone_signature(ascii_center) + + +def utf8_zero_fill( + strings: _StringScalarT | _StringArrayT | Expression, + /, + width: int | None = None, + padding: str = "0", + *, + options: ZeroFillOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +utf8_zfill = utf8_zero_fill + +# ========================= 2.13 String trimming ========================= + + +def ascii_ltrim( + strings: _StringScalarT | _StringArrayT | Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_rtrim = _clone_signature(ascii_ltrim) +ascii_trim = _clone_signature(ascii_ltrim) +utf8_ltrim = _clone_signature(ascii_ltrim) +utf8_rtrim = _clone_signature(ascii_ltrim) +utf8_trim = _clone_signature(ascii_ltrim) + + +def ascii_ltrim_whitespace( + strings: _StringScalarT | _StringArrayT | Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) + +# ========================= 2.14 String splitting ========================= + + +def ascii_split_whitespace( + strings: _StringScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.ListArray[_StringScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] + | Expression): ... + + +def split_pattern( + strings: _StringOrBinaryScalarT | lib.Array[lib.Scalar[_DataTypeT]] | Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.ListArray[_StringOrBinaryScalarT] | lib.ListArray[lib.ListScalar[_DataTypeT]] + | Expression): ... 
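A padding/splitting sketch (example code); per the annotations above, the split kernels wrap the input's scalar type in a list array:

import pyarrow as pa
import pyarrow.compute as pc

pc.split_pattern(pa.array(["a,b", "c"]), pattern=",")   # ListArray of string values
pc.ascii_split_whitespace(pa.array(["x y", "z"]))
pc.ascii_lpad(pa.array(["7"]), width=3, padding="0")    # ["007"]
pc.utf8_trim_whitespace(pa.array(["  hi  "]))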
+ + +split_pattern_regex = _clone_signature(split_pattern) +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) + +# ========================= 2.15 String component extraction ========================= + + +def extract_regex( + strings: StringOrBinaryScalar | StringOrBinaryArray | Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +extract_regex_span = _clone_signature(extract_regex) + + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: ... + + +def binary_join_element_wise( + *strings: str + | bytes + | _StringOrBinaryScalarT + | _StringOrBinaryArrayT + | Expression + | list, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT | _StringOrBinaryArrayT | Expression: ... + + +# ========================= 2.17 String Slicing ========================= +def binary_slice( + strings: _BinaryScalarT | _BinaryArrayT | Expression | lib.Scalar, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT | _BinaryArrayT | Expression: ... + + +def utf8_slice_codeunits( + strings: _StringScalarT | _StringArrayT | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +def utf8_normalize( + strings: _StringScalarT | _StringArrayT | Expression, + /, + form: Literal["NFC", "NFKC", "NFD", "NFKD"] = "NFC", + *, + options: Utf8NormalizeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT | _StringArrayT | Expression: ... + + +# ========================= 2.18 Containment tests ========================= +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar | lib.LargeStringScalar + | lib.LargeBinaryScalar | lib.StringArray | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] | lib.ChunkedArray[lib.BinaryScalar] + | lib.LargeStringArray | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] | lib.ChunkedArray[lib.LargeBinaryScalar] + | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.Int32Scalar | lib.Int64Scalar | lib.Int32Array | lib.Int64Array + | Expression): ... + + +count_substring_regex = _clone_signature(count_substring) + + +def ends_with( + strings: StringScalar | BinaryScalar | StringArray | BinaryArray | Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... 
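An extraction/slicing sketch (illustrative); extract_regex yields a struct with one field per named capture group, matching the StructArray return type above:

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["alpha-1", "beta-2"])
pc.extract_regex(s, pattern=r"(?P<word>\w+)-(?P<num>\d+)")  # StructArray
pc.utf8_slice_codeunits(s, start=0, stop=4)                 # same string flavour
pc.ends_with(s, pattern="-1")                               # BooleanArray
pc.binary_join_element_wise(s, s, "|")                      # "alpha-1|alpha-1", ...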
+ + +find_substring = _clone_signature(count_substring) +find_substring_regex = _clone_signature(count_substring) + + +def index_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar | lib.Int32Array | Expression: ... + + +def is_in( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + value_set: lib.Array | lib.ChunkedArray | Expression, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +match_like = _clone_signature(ends_with) +match_substring = _clone_signature(ends_with) +match_substring_regex = _clone_signature(ends_with) +starts_with = _clone_signature(ends_with) + +# ========================= 2.19 Categorizations ========================= + + +def is_finite( + values: NumericScalar | lib.NullScalar | NumericArray | lib.NullArray | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +is_inf = _clone_signature(is_finite) +is_nan = _clone_signature(is_finite) + + +def is_null( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +def is_valid( + values: lib.Scalar | lib.Array | lib.ChunkedArray | Expression | ArrayLike, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +true_unless_null = _clone_signature(is_valid) + +# ========================= 2.20 Selecting / multiplexing ========================= + + +def case_when( + cond: lib.StructScalar + | lib.StructArray + | lib.ChunkedArray[lib.StructScalar] + | Expression, + /, + *cases: _ScalarOrArrayT | ArrayLike, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT | lib.Array | Expression: ... + + +def choose( + indices: ArrayLike | ScalarLike, + /, + *values: ArrayLike | ScalarLike, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: ... + + +def coalesce( + *values: _ScalarOrArrayT | Expression, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT | Expression: ... + + +def fill_null( + values: _ScalarOrArrayT | ScalarLike, fill_value: ArrayLike | ScalarLike +) -> _ScalarOrArrayT | ScalarLike: ... + + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: ... + + +# ========================= 2.21 Structural transforms ========================= + +def list_value_length( + lists: _ListArray[Any] | _LargeListArray[Any] | ListArray[Any] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array | Expression: ... + + +def make_struct( + *args: lib.Scalar | lib.Array | lib.ChunkedArray | Expression | ArrayLike, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar | lib.StructArray | Expression: ... 
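+
+# Illustrative usage of the lookup/multiplexing kernels above (sketch only):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#
+#   x = pa.array([1, None, 3])
+#   pc.is_in(x, value_set=pa.array([1, 2]))        # BooleanArray
+#   pc.fill_null(x, 0)                             # [1, 0, 3]
+#   pc.if_else(pc.is_null(x), pa.scalar(-1), x)    # [1, -1, 3]
+#   pc.make_struct(x, x, field_names=["a", "b"])   # StructArray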
+ + +# ========================= 2.22 Conversions ========================= +def ceil_temporal( + timestamps: _TemporalScalarT | _TemporalArrayT | Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT | _TemporalArrayT | Expression: ... + + +floor_temporal = _clone_signature(ceil_temporal) +round_temporal = _clone_signature(ceil_temporal) + + +def cast( + arr: lib.Scalar | lib.Array | lib.ChunkedArray | lib.Table, + target_type: _DataTypeT | str | None = None, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.Scalar[_DataTypeT] | lib.Scalar[Any] | lib.Array[lib.Scalar[_DataTypeT]] + | lib.Array[lib.Scalar[Any]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] + | lib.ChunkedArray[lib.Scalar[Any]] | lib.Table +): ... + + +def strftime( + timestamps: TemporalScalar | TemporalArray | Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar | lib.StringArray | Expression: ... + + +def strptime( + strings: StringScalar | StringArray | Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + + +# ========================= 2.23 Temporal component extraction ========================= +def day( + values: TemporalScalar | TemporalArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None) -> ( + lib.Int64Scalar | lib.Int64Array | Expression +): ... + + +def day_of_week( + values: TemporalScalar | TemporalArray | Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +day_of_year = _clone_signature(day) + + +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any] + | lib.TimestampArray[Any] | lib.Time32Array[Any] | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def is_dst( + values: lib.TimestampScalar | lib.TimestampArray[Any] + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +def iso_week( + values: lib.TimestampScalar | lib.TimestampArray[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... 
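+
+# Illustrative usage of the conversion/extraction kernels above (sketch only):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#
+#   ts = pc.strptime(pa.array(["2024-01-31"]), format="%Y-%m-%d", unit="s")
+#   pc.strftime(ts, format="%Y/%m/%d")   # StringArray
+#   pc.day(ts), pc.day_of_week(ts)       # Int64 results
+#   pc.cast(ts, pa.date32())             # Date32Array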
+ + +iso_year = _clone_signature(iso_week) + + +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar + | lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar] | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar | lib.BooleanArray | Expression: ... + + +microsecond = _clone_signature(iso_week) +millisecond = _clone_signature(iso_week) +minute = _clone_signature(iso_week) +month = _clone_signature(day_of_week) +nanosecond = _clone_signature(hour) +quarter = _clone_signature(day_of_week) +second = _clone_signature(hour) +subsecond = _clone_signature(hour) +us_week = _clone_signature(iso_week) +us_year = _clone_signature(iso_week) +year = _clone_signature(iso_week) + + +def week( + values: lib.TimestampScalar | lib.TimestampArray + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array | Expression: ... + + +def year_month_day( + values: TemporalScalar | TemporalArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar | lib.StructArray | Expression: ... + + +iso_calendar = _clone_signature(year_month_day) + + +# ========================= 2.24 Temporal difference ========================= +def day_time_interval_between(start, end, /, *, + memory_pool: lib.MemoryPool | None = None): ... + + +def days_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar | lib.Int64Array: ... + + +hours_between = _clone_signature(days_between) +microseconds_between = _clone_signature(days_between) +milliseconds_between = _clone_signature(days_between) +minutes_between = _clone_signature(days_between) + + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: ... + + +def month_interval_between(start, end, /, *, + memory_pool: lib.MemoryPool | None = None): ... + + +nanoseconds_between = _clone_signature(days_between) +quarters_between = _clone_signature(days_between) +seconds_between = _clone_signature(days_between) + + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: ... + + +years_between = _clone_signature(days_between) + +# ========================= 2.25 Timezone handling ========================= + + +def assume_timezone( + timestamps: lib.TimestampScalar | lib.Scalar[lib.TimestampType] | lib.TimestampArray + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, + timezone: str | None = None, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> ( + lib.TimestampScalar | lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar] + | Expression +): ... 
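+
+# Illustrative usage of the temporal-difference/timezone kernels above
+# (sketch only):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#
+#   start = pa.array([0], type=pa.timestamp("s"))
+#   end = pa.array([3 * 86_400], type=pa.timestamp("s"))
+#   pc.days_between(start, end)                          # [3]
+#   pc.assume_timezone(start, timezone="Europe/Paris")   # tz-aware timestamps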
+ + +def local_timestamp( + timestamps: lib.TimestampScalar | lib.TimestampArray + | lib.ChunkedArray[lib.TimestampScalar] | Expression, + /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar | lib.TimestampArray | Expression: ... + + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +def cumulative_sum( + values: _NumericArrayT | ArrayLike | Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT | Expression | lib.Array: ... + + +cumulative_sum_checked = _clone_signature(cumulative_sum) +cumulative_prod = _clone_signature(cumulative_sum) +cumulative_prod_checked = _clone_signature(cumulative_sum) +cumulative_max = _clone_signature(cumulative_sum) +cumulative_min = _clone_signature(cumulative_sum) +cumulative_mean = _clone_signature(cumulative_sum) +# ========================= 3.2 Associative transforms ========================= + + +def dictionary_encode( + array: _ScalarOrArrayT | Expression, + /, + null_encoding: Literal["mask", "encode"] = "mask", + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... + + +def dictionary_decode( + array: _ScalarOrArrayT | Expression, + /, + *, + options=None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... + + +def unique(array: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool | + None = None) -> _ArrayT | Expression: ... + + +def value_counts( + array: lib.Array | lib.ChunkedArray | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray | Expression: ... + +# ========================= 3.3 Selections ========================= + + +def array_filter( + array: _ArrayT | Expression, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT | Expression: ... + + +def drop_null(input: _ArrayT | Expression, /, *, memory_pool: lib.MemoryPool | + None = None) -> _ArrayT | Expression: ... + + +filter = array_filter +take = array_take + +# ========================= 3.4 Containment tests ========================= + + +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +# ========================= 3.5 Sorts and partitions ========================= +def array_sort_indices( + array: lib.Array | lib.ChunkedArray | Expression, + /, + order: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: ArraySortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... 
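+
+# Illustrative usage of the cumulative/associative/sorting kernels above
+# (sketch only):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#
+#   x = pa.array([3, 1, 2, 2])
+#   pc.cumulative_sum(x)                   # [3, 4, 6, 8]
+#   pc.unique(x)                           # [3, 1, 2]
+#   pc.value_counts(x)                     # StructArray of {values, counts}
+#   pc.take(x, pc.array_sort_indices(x))   # ascending order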
+ + +def partition_nth_indices( + array: lib.Array | lib.ChunkedArray | Expression | Iterable, + /, + pivot: int, + *, + null_placement: _Placement = "at_end", + options: PartitionNthOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def pivot_wider( + keys: lib.Array | lib.ChunkedArray | Sequence[str], + values: lib.Array | lib.ChunkedArray | Sequence[Any], + /, + key_names: Sequence[str] | None = None, + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + options: PivotWiderOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... + + +def rank( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... + + +def rank_quantile( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: RankQuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def rank_normal( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + options: RankQuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... + + +def select_k_unstable( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, + /, + k: int | None = None, + sort_keys: Sequence[tuple[str | Expression, str]] | None = None, + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table | Expression, + /, + sort_keys: Sequence[tuple[str | Expression, _Order]] | None = None, + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array | Expression: ... + + +# ========================= 3.6 Structural transforms ========================= +def list_element( + lists: lib.Array[ListScalar[_DataTypeT]] | lib.ChunkedArray[ListScalar[_DataTypeT]] + | ListScalar[_DataTypeT] | Expression, + index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None +) -> (lib.Array[lib.Scalar[_DataTypeT]] | lib.ChunkedArray[lib.Scalar[_DataTypeT]] + | _DataTypeT | Expression): ... + + +def list_flatten( + lists: ArrayOrChunkedArray[ListScalar[Any]] | Expression, + /, + recursive: bool = False, + *, + options: ListFlattenOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any] | Expression: ... + + +def list_parent_indices( + lists: ArrayOrChunkedArray[Any] | Expression, /, *, + memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array | Expression: ... + + +def list_slice( + lists: ArrayOrChunkedArray[Any] | Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + *, + options: ListSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[Any] | Expression: ... + + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... 
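+
+# Illustrative usage of the list/sorting helpers above (sketch only):
+#
+#   import pyarrow as pa
+#   import pyarrow.compute as pc
+#
+#   lists = pa.array([[1, 2], None, [3]])
+#   pc.list_value_length(lists)     # [2, null, 1]
+#   pc.list_flatten(lists)          # [1, 2, 3]
+#   pc.list_parent_indices(lists)   # [0, 0, 2]
+#
+#   t = pa.table({"a": [2, 1]})
+#   t.take(pc.sort_indices(t, sort_keys=[("a", "ascending")]))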
+ + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): ... + + +def fill_null_backward( + values: _ScalarOrArrayT | ScalarLike | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | ScalarLike | Expression: ... + + +def fill_null_forward( + values: _ScalarOrArrayT | ScalarLike | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | ScalarLike | Expression: ... + + +def replace_with_mask( + values: _ScalarOrArrayT | Expression, + mask: list[bool] | list[bool | None] | BooleanArray, + replacements, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarOrArrayT | Expression: ... + + +# ========================= 3.7 Pairwise functions ========================= +def pairwise_diff( + input: _NumericOrTemporalArrayT | Expression, + /, + period: int = 1, + *, + options: PairwiseOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT | Expression: ... + + +def run_end_encode( + input: _NumericOrTemporalArrayT | Expression, + /, + *, + run_end_type: _RunEndType | None = None, + options: RunEndEncodeOptions | None = None, + memory_pool: lib.MemoryPool | None = None +) -> _NumericOrTemporalArrayT | Expression: ... + + +def run_end_decode( + input: _NumericOrTemporalArrayT | Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None +) -> _NumericOrTemporalArrayT | Expression: ... + + +pairwise_diff_checked = _clone_signature(pairwise_diff) diff --git a/python/pyarrow-stubs/pyarrow/config.pyi b/python/pyarrow-stubs/pyarrow/config.pyi new file mode 100644 index 00000000000..069b70e553a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/config.pyi @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import NamedTuple + + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + + +class CppBuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + + +class BuildInfo(NamedTuple): + build_type: str + cpp_build_info: CppBuildInfo + + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + + +build_info: BuildInfo +cpp_build_info: CppBuildInfo +cpp_version: str +cpp_version_info: VersionInfo + + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... 
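+
+# Illustrative usage of the build/runtime introspection re-exported by
+# pyarrow (sketch only; values shown are placeholders):
+#
+#   import pyarrow as pa
+#
+#   pa.cpp_version        # e.g. "21.0.0"
+#   pa.cpp_version_info   # VersionInfo(major=..., minor=..., patch=...)
+#   pa.runtime_info()     # RuntimeInfo(simd_level=..., detected_simd_level=...)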
+ + +__all__ = [ + "VersionInfo", + "BuildInfo", + "CppBuildInfo", + "RuntimeInfo", + "build_info", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/pyarrow-stubs/pyarrow/csv.pyi b/python/pyarrow-stubs/pyarrow/csv.pyi new file mode 100644 index 00000000000..a7abd413aab --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/csv.pyi @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/pyarrow-stubs/pyarrow/cuda.pyi b/python/pyarrow-stubs/pyarrow/cuda.pyi new file mode 100644 index 00000000000..0394965bb73 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/cuda.pyi @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/pyarrow-stubs/pyarrow/dataset.pyi b/python/pyarrow-stubs/pyarrow/dataset.pyi new file mode 100644 index 00000000000..66d86b14a25 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/dataset.pyi @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Iterable, Sequence +from typing import Literal, TypeAlias, Any + +from _typeshed import StrPath +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table + +from ._fs import SupportedFileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + "ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # Parquet Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = ( + Literal["parquet", "ipc", "arrow", "feather", "csv", "json", "orc", str] +) + + +def partitioning( + schema: Schema = None, + *, + field_names: list[str] = None, + flavor: Literal["hive"] = None, + dictionaries: dict[str, Array] | Literal["infer"] | None = None, +) -> Partitioning | PartitioningFactory: ... 
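+
+# Illustrative usage of partitioning() (sketch only; see the pyarrow.dataset
+# docs for the authoritative behaviour):
+#
+#   import pyarrow as pa
+#   import pyarrow.dataset as ds
+#
+#   ds.partitioning(pa.schema([("year", pa.int16())]), flavor="hive")  # HivePartitioning
+#   ds.partitioning(field_names=["year", "month"])  # directory partitioning factory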
+ + +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | str | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... + + +def dataset( + source: StrPath + | Sequence[Dataset] + | Sequence[StrPath] + | Iterable[RecordBatch] + | Iterable[Table] + | RecordBatchReader + | RecordBatch + | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset | UnionDataset | InMemoryDataset | Dataset: ... + + +def write_dataset( + data: Any | Dataset | Table | RecordBatch | RecordBatchReader | list[Table] + | Iterable[RecordBatch] | Scanner, + base_dir: StrPath, + *, + basename_template: str | None = None, + format: FileFormat | _DatasetFormat | None = None, + partitioning: Partitioning | PartitioningFactory | list[str] | None = None, + partitioning_flavor: str | None = None, + schema: Schema | None = None, + filesystem: SupportedFileSystem | str | None = None, + file_options: FileWriteOptions | None = None, + use_threads: bool | None = True, + max_partitions: int = 1024, + max_open_files: int = 1024, + max_rows_per_file: int = 0, + min_rows_per_group: int = 0, + max_rows_per_group: int = 1024 * 1024, # noqa: Y011 + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: + Literal["error", "overwrite_or_ignore", "delete_matching"] = "error", + create_dir: bool = True, + preserve_order: bool | None = None, +): ... + + +def _get_partition_keys(partition_expression: Expression) -> dict[str, Any]: ... diff --git a/python/pyarrow-stubs/pyarrow/device.pyi b/python/pyarrow-stubs/pyarrow/device.pyi new file mode 100644 index 00000000000..7787ac44deb --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/device.pyi @@ -0,0 +1,66 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from pyarrow.lib import _Weakrefable + + +class DeviceAllocationType(enum.Enum): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + + +class Device(_Weakrefable): + @property + def type_name(self) -> str: ... + + @property + def device_id(self) -> int: ... 
+ + @property + def is_cpu(self) -> bool: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + +class MemoryManager(_Weakrefable): + @property + def device(self) -> Device: ... + + @property + def is_cpu(self) -> bool: ... + + +def default_cpu_memory_manager() -> MemoryManager: ... + + +__all__ = ["DeviceAllocationType", "Device", + "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/pyarrow-stubs/pyarrow/error.pyi b/python/pyarrow-stubs/pyarrow/error.pyi new file mode 100644 index 00000000000..eac936afcb5 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/error.pyi @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + + +class ArrowException(Exception): + ... + + +class ArrowInvalid(ValueError, ArrowException): + ... + + +class ArrowMemoryError(MemoryError, ArrowException): + ... + + +class ArrowKeyError(KeyError, ArrowException): + ... + + +class ArrowTypeError(TypeError, ArrowException): + ... + + +class ArrowNotImplementedError(NotImplementedError, ArrowException): + ... + + +class ArrowCapacityError(ArrowException): + ... + + +class ArrowIndexError(IndexError, ArrowException): + ... + + +class ArrowSerializationError(ArrowException): + ... + + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + + +ArrowIOError = IOError + + +class StopToken: + ... + + +def enable_signal_handlers(enable: bool) -> None: ... + + +have_signal_refcycle: bool + + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... + + +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/pyarrow-stubs/pyarrow/feather.pyi b/python/pyarrow-stubs/pyarrow/feather.pyi new file mode 100644 index 00000000000..cf9d3402091 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/feather.pyi @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from typing import IO, Literal + +import pandas as pd + +from pyarrow import lib +from pyarrow.lib import Table +from pyarrow._typing import StrPath +from ._feather import FeatherError + + +class FeatherDataset: + path_or_paths: str | list[str] + validate_schema: bool + + def __init__(self, path_or_paths: str | + list[str], validate_schema: bool = True) -> None: ... + + def read_table(self, columns: list[str] | None = None) -> Table: ... + def validate_schemas(self, piece, table: Table) -> None: ... + + def read_pandas( + self, columns: list[str] | None = None, use_threads: bool = True + ) -> pd.DataFrame: ... + + +def check_chunked_overflow(name: str, col) -> None: ... + + +def write_feather( + df: pd.DataFrame | Table | lib.ChunkedArray, + dest: StrPath | IO, + compression: Literal["zstd", "lz4", "uncompressed", "snappy"] | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: Literal[1, 2] = 2, +) -> None: ... + + +def read_feather( + source: StrPath | IO | lib.NativeFile, + columns: list[str] | None = None, + use_threads: bool = True, + memory_map: bool = False, + **kwargs, +) -> pd.DataFrame: ... + + +def read_table( + source: StrPath | IO | lib.NativeFile, + columns: list[str | int] | Iterable[str | int] | None = None, + memory_map: bool = False, + use_threads: bool = True, +) -> Table: ... + + +__all__ = [ + "FeatherError", + "FeatherDataset", + "check_chunked_overflow", + "write_feather", + "read_feather", + "read_table", +] diff --git a/python/pyarrow-stubs/pyarrow/flight.pyi b/python/pyarrow-stubs/pyarrow/flight.pyi new file mode 100644 index 00000000000..dcc6ee2244b --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/flight.pyi @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
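+
+# Illustrative client-side usage of the Flight API re-exported below (sketch
+# only; assumes a Flight server is reachable at the given location):
+#
+#   import pyarrow.flight as flight
+#
+#   client = flight.connect("grpc://localhost:8815")
+#   for info in client.list_flights():
+#       reader = client.do_get(info.endpoints[0].ticket)
+#       table = reader.read_all()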
+ +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/pyarrow-stubs/pyarrow/fs.pyi b/python/pyarrow-stubs/pyarrow/fs.pyi new file mode 100644 index 00000000000..77bf9193900 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/fs.pyi @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
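+
+# Illustrative usage of the filesystem API re-exported below (sketch only):
+#
+#   from pyarrow import fs
+#
+#   local = fs.LocalFileSystem()
+#   local.get_file_info("/tmp")                                     # FileInfo
+#   local.get_file_info(fs.FileSelector("/tmp", recursive=False))   # list[FileInfo]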
+ +from pyarrow._fs import ( + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = FileInfo + + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, # noqa: Y011 + use_threads: bool = True, +) -> None: ... + + +def _ensure_filesystem( + filesystem: FileSystem | str | object, + *, + use_mmap: bool = False +) -> FileSystem: ... + + +def _resolve_filesystem_and_path( + path: str | object, + filesystem: FileSystem | str | object | None = None, + *, + memory_map: bool = False +) -> tuple[FileSystem, str]: ... + + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] # All abstract methods implemented via fsspec delegation # noqa: E501 + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... + + +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/pyarrow-stubs/pyarrow/gandiva.pyi b/python/pyarrow-stubs/pyarrow/gandiva.pyi new file mode 100644 index 00000000000..7e129d3ed1d --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/gandiva.pyi @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +from typing import Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... 
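+
+# Typical Gandiva flow (a rough sketch, best verified against the runtime
+# bindings): build Node trees with TreeExprBuilder, wrap them in
+# Expression/Condition objects, then compile with make_projector()/make_filter()
+# and evaluate against RecordBatches, e.g.
+#
+#   import pyarrow as pa
+#   import pyarrow.gandiva as gandiva
+#
+#   schema = pa.schema([("x", pa.float64())])
+#   builder = gandiva.TreeExprBuilder()
+#   node = builder.make_function(
+#       "add",
+#       [builder.make_field(schema.field("x")),
+#        builder.make_literal(1.0, pa.float64())],
+#       pa.float64())
+#   expr = builder.make_expression(node, pa.field("x_plus_1", pa.float64()))
+#   projector = gandiva.make_projector(schema, [expr], pa.default_memory_pool())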
+ + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | + bool, dtype: DataType | str | None) -> Node: ... + + def make_expression( + self, root_node: Node | None, return_field: Field) -> Expression: ... + + def make_function( + self, name: str, children: list[Node | None], + return_type: DataType) -> Node: ... + + def make_field(self, field: Field | None) -> Node: ... + + def make_if( + self, condition: Node, this_node: Node | None, + else_node: Node | None, return_type: DataType | None + ) -> Node: ... + def make_and(self, children: list[Node | None]) -> Node: ... + def make_or(self, children: list[Node | None]) -> Node: ... + def make_in_expression(self, node: Node | None, values: Iterable, + dtype: DataType) -> Node: ... + + def make_condition(self, condition: Node | None) -> Condition: ... + + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + + +def make_projector( + schema: Schema, + children: list[Expression | None], + pool: MemoryPool | None = None, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... + + +def make_filter( + schema: Schema, condition: Condition | None, + configuration: Configuration | None = None +) -> Filter: ... + + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + + +def get_registered_function_signatures() -> list[FunctionSignature]: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi b/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi new file mode 100644 index 00000000000..fd5ae83c569 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/__init__.pyi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .from_dataframe import from_dataframe as from_dataframe + +__all__ = ["from_dataframe"] diff --git a/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi b/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi new file mode 100644 index 00000000000..e1d8ae949c9 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/buffer.pyi @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from pyarrow.lib import Buffer + + +class DlpackDeviceType(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class _PyArrowBuffer: + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: ... + @property + def ptr(self) -> int: ... + def __dlpack__(self): ... + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/column.pyi b/python/pyarrow-stubs/pyarrow/interchange/column.pyi new file mode 100644 index 00000000000..67508ac0689 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/column.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import enum + +from collections.abc import Iterable +from typing import Any, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + + +class ColumnNullType(enum.IntEnum): + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + ... + + +class _PyArrowColumn: + _col: Array | ChunkedArray + + def __init__(self, column: Array | ChunkedArray, + allow_copy: bool = True) -> None: ... + + def size(self) -> int: ... + @property + def offset(self) -> int: ... + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: ... + @property + def describe_categorical(self) -> CategoricalDescription: ... 
+ @property + def describe_null(self) -> tuple[ColumnNullType, Any]: ... + @property + def null_count(self) -> int: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_chunks(self) -> int: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: ... + def get_buffers(self) -> ColumnBuffers: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi b/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi new file mode 100644 index 00000000000..419b3e2cdb3 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/dataframe.pyi @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Iterable, Sequence +from typing import Any + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + + +class _PyArrowDataFrame: + def __init__( + self, + df: Table | RecordBatch, + nan_as_null: bool = False, + allow_copy: bool = True) -> None: ... + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + @property + def metadata(self) -> dict[str, Any]: ... + def num_columns(self) -> int: ... + def num_rows(self) -> int: ... + def num_chunks(self) -> int: ... + def column_names(self) -> Iterable[str]: ... + def get_column(self, i: int) -> _PyArrowColumn: ... + def get_column_by_name(self, name: str) -> _PyArrowColumn: ... + def get_columns(self) -> Iterable[_PyArrowColumn]: ... + def select_columns(self, indices: Sequence[int]) -> Self: ... + def select_columns_by_name(self, names: Sequence[str]) -> Self: ... + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: ... diff --git a/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi b/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..d6ad272dfc6 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/interchange/from_dataframe.pyi @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, + allow_copy: bool = True) -> Any: ... + + +ColumnObject: TypeAlias = Any + + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... + + +def _from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: ... + + +def protocol_df_chunk_to_pyarrow( + df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: ... + + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: ... + + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: ... + + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: ... + + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: ... + + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: ... + + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... + + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: ... diff --git a/python/pyarrow-stubs/pyarrow/io.pyi b/python/pyarrow-stubs/pyarrow/io.pyi new file mode 100644 index 00000000000..be6a07d5418 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/io.pyi @@ -0,0 +1,430 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +from collections.abc import Callable +from io import IOBase + +from _typeshed import StrPath + +import numpy as np + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Any, Literal, SupportsIndex +import builtins + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from ._types import KeyValueMetadata + + +def have_libhdfs() -> bool: ... + + +def io_thread_count() -> int: ... 
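+
+# Illustrative usage (sketch only): these mirror pyarrow's global IO thread
+# pool controls, e.g.
+#
+#   import pyarrow as pa
+#
+#   pa.io_thread_count()        # current size of the IO thread pool
+#   pa.set_io_thread_count(8)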
+ + +def set_io_thread_count(count: int) -> None: ... + + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + + +class NativeFile(_Weakrefable): + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: ... + + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + + @property + def closed(self) -> bool: ... + def close(self) -> None: ... + def size(self) -> int: ... + + def metadata(self) -> KeyValueMetadata: ... + + def tell(self) -> int: ... + + def seek(self, position: int, whence: int = 0) -> int: ... + + def flush(self) -> None: ... + + def write(self, data: bytes | SupportPyBuffer) -> int: ... + + def read(self, nbytes: int | None = None) -> bytes: ... + + def get_stream(self, file_offset: int, nbytes: int) -> Self: ... + + def read_at(self, nbytes: int, offset: int) -> bytes: ... + + def read1(self, nbytes: int | None = None) -> bytes: ... + + def readall(self) -> bytes: ... + def readinto(self, b: SupportPyBuffer) -> int: ... + + def readline(self, size: int | None = None) -> bytes: ... + + def readlines(self, hint: int | None = None) -> list[bytes]: ... + + def __iter__(self) -> Self: ... + + def __next__(self) -> bytes: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: ... + + def truncate(self, pos: int | None = None) -> int: ... + + def writelines(self, lines: list[bytes]): ... + + def download(self, stream_or_path: StrPath | IOBase, + buffer_size: int | None = None) -> None: ... + + def upload(self, stream: IOBase, buffer_size: int | None) -> None: ... + + def writable(self): ... + +# ---------------------------------------------------------------------- +# Python file-like objects + + +class PythonFile(NativeFile): + def __init__(self, handle: IOBase, + mode: Literal["r", "w"] | None = None) -> None: ... + + +class MemoryMappedFile(NativeFile): + @classmethod + def create(cls, path: str, size: float) -> Self: ... + + def _open(self, path: str, + mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... + + def resize(self, new_size: int) -> None: ... + + +def memory_map( + path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" +) -> MemoryMappedFile: ... + + +create_memory_map = MemoryMappedFile.create + + +class OSFile(NativeFile): + name: str + + def __init__( + self, + path: str, + mode: Literal["r", "rb", "w", "wb", "a", "ab"] = "r", + memory_pool: MemoryPool | None = None, + ) -> None: ... + + +class FixedSizeBufferWriter(NativeFile): + def __init__(self, buffer: Buffer) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: ... + + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + + def set_memcopy_threshold(self, threshold: int) -> None: ... + + +# ---------------------------------------------------------------------- +# Arrow buffers + +class Buffer(_Weakrefable): + def __len__(self) -> int: ... + + def _assert_cpu(self) -> None: ... + @property + def size(self) -> int: ... + + @property + def address(self) -> int: ... + + def hex(self) -> bytes: ... + + @property + def is_mutable(self) -> bool: ... + + @property + def is_cpu(self) -> bool: ... + + @property + def device(self) -> Device: ... + + @property + def memory_manager(self) -> MemoryManager: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def parent(self) -> Buffer | None: ... 
+ + def __getitem__(self, key: int | builtins.slice) -> int | Self: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self) -> bool: ... + + def __buffer__(self, flags: int) -> memoryview: ... + + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... + def to_pybytes(self) -> bytes: ... + + +class ResizableBuffer(Buffer): + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: ... + + +def allocate_buffer( + size: int, + memory_pool: MemoryPool | None = None, + resizable: Literal[False] | Literal[True] | None = None # noqa: Y030 +) -> Buffer | ResizableBuffer: ... + + +# ---------------------------------------------------------------------- +# Arrow Stream +class BufferOutputStream(NativeFile): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def getvalue(self) -> Buffer: ... + + +class MockOutputStream(NativeFile): + ... + + +class BufferReader(NativeFile): + def __init__(self, obj) -> None: ... + + +class CompressedInputStream(NativeFile): + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: str | None, + ) -> None: ... + + +class CompressedOutputStream(NativeFile): + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: str, + ) -> None: ... + + +class BufferedInputStream(NativeFile): + def __init__(self, stream: NativeFile, buffer_size: int, + memory_pool: MemoryPool | None = None) -> None: ... + + def detach(self) -> NativeFile: ... + + +class BufferedOutputStream(NativeFile): + def __init__(self, stream: NativeFile, buffer_size: int, + memory_pool: MemoryPool | None = None) -> None: ... + + def detach(self) -> NativeFile: ... + + +class TransformInputStream(NativeFile): + def __init__(self, stream: NativeFile, + transform_func: Callable[[Buffer], Any]) -> None: ... + + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: ... + + +def py_buffer(obj: SupportPyBuffer | np.ndarray) -> Buffer: ... + + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: ... + + +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... + +# --------------------------------------------------------------------- + + +class CacheOptions(_Weakrefable): + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... + + @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: ... + + +class Codec(_Weakrefable): + def __init__(self, compression: Compression | str | None, + compression_level: int | None = None) -> None: ... + + @classmethod + def detect(cls, path: StrPath) -> Self: ... + + @staticmethod + def is_available(compression: Compression | str) -> bool: ... + + @staticmethod + def supports_compression_level(compression: Compression) -> int: ... + + @staticmethod + def default_compression_level(compression: Compression) -> int: ... + + @staticmethod + def minimum_compression_level(compression: Compression) -> int: ... 
+ + @staticmethod + def maximum_compression_level(compression: Compression) -> int: ... + + @property + def name(self) -> Compression: ... + + @property + def compression_level(self) -> int: ... + + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, + ) -> Buffer | bytes: ... + + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, + ) -> Buffer | bytes: ... + + +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, +) -> Buffer | bytes: ... + + +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[False] | Literal[True] | None = None, # noqa: Y030 + memory_pool: MemoryPool | None = None, +) -> Buffer | bytes: ... + + +def input_stream( + source: StrPath | Buffer | NativeFile | IOBase | SupportPyBuffer, + compression: + Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] | None = "detect", + buffer_size: int | str | None = None, +) -> BufferReader: ... + + +def output_stream( + source: StrPath | Buffer | NativeFile | IOBase | SupportPyBuffer, + compression: + Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] | None = "detect", + buffer_size: int | None = None, +) -> NativeFile: ... + + +__all__ = [ + "have_libhdfs", + "io_thread_count", + "set_io_thread_count", + "NativeFile", + "PythonFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "OSFile", + "FixedSizeBufferWriter", + "Buffer", + "ResizableBuffer", + "allocate_buffer", + "BufferOutputStream", + "MockOutputStream", + "BufferReader", + "CompressedInputStream", + "CompressedOutputStream", + "BufferedInputStream", + "BufferedOutputStream", + "TransformInputStream", + "Transcoder", + "transcoding_input_stream", + "py_buffer", + "foreign_buffer", + "as_buffer", + "CacheOptions", + "Codec", + "compress", + "decompress", + "input_stream", + "output_stream", +] diff --git a/python/pyarrow-stubs/pyarrow/ipc.pyi b/python/pyarrow-stubs/pyarrow/ipc.pyi new file mode 100644 index 00000000000..d153ab0f46a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/ipc.pyi @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
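# Usage sketch for the buffer and compression helpers typed above; the payload
# is made up and assumes a pyarrow build with gzip support.
import pyarrow as pa

data = b"x" * 1024
buf = pa.py_buffer(data)                                  # zero-copy Buffer view
compressed = pa.compress(buf, codec="gzip", asbytes=True)
restored = pa.decompress(compressed, decompressed_size=len(data),
                         codec="gzip", asbytes=True)
assert restored == data

sink = pa.BufferOutputStream()
sink.write(b"hello")
reader = pa.BufferReader(sink.getvalue())                 # NativeFile over a Buffer
assert reader.read() == b"hello"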
+ +from io import IOBase +from typing import Any + +from _typeshed import StrPath +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + Alignment, + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... + + +def open_stream( + source: bytes | int | lib.Buffer | lib.NativeFile | IOBase, + *, + options: Any = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... + + +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + metadata: lib.KeyValueMetadata | dict[bytes, bytes] | None = None, +) -> RecordBatchFileWriter: ... + + +def open_file( + source: StrPath | bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: Any = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... + + +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... + + +def deserialize_pandas( + buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... + + +__all__ = [ + "Alignment", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/pyarrow-stubs/pyarrow/json.pyi b/python/pyarrow-stubs/pyarrow/json.pyi new file mode 100644 index 00000000000..67768db42e4 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/json.pyi @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/pyarrow-stubs/pyarrow/lib.pyi b/python/pyarrow-stubs/pyarrow/lib.pyi new file mode 100644 index 00000000000..f87f63d9d92 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/lib.pyi @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import NamedTuple + +from .array import * # noqa: F401, F403 +from .builder import * # noqa: F401, F403 +from .compat import * # noqa: F401, F403 +from .config import * # noqa: F401, F403 +from .device import * # noqa: F401, F403 +from .error import * # noqa: F401, F403 +from .io import * # noqa: F401, F403 +from ._ipc import * # noqa: F401, F403 +from .memory import * # noqa: F401, F403 +from .pandas_shim import * # noqa: F401, F403 +from .scalar import * # noqa: F401, F403 +from .table import * # noqa: F401, F403 +from .tensor import * # noqa: F401, F403 +from ._types import * # noqa: F401, F403 +from .memory import MemoryPool +from .array import Array +from ._types import DataType + + +class MonthDayNano(NamedTuple): + days: int + months: int + nanoseconds: int + + def __init__(self, *args, **kwargs) -> None: ... # type: ignore[misc] + + +def cpu_count() -> int: ... + + +def set_cpu_count(count: int) -> None: ... + + +def is_threading_enabled() -> bool: ... + + +def arange( + start: int, stop: int, step: int = 1, *, memory_pool: MemoryPool | None = None +) -> Array: ... + + +def is_boolean_value(obj: object) -> bool: ... + + +def is_integer_value(obj: object) -> bool: ... + + +def is_float_value(obj: object) -> bool: ... + + +def tzinfo_to_string(tz: object) -> str: ... + + +def string_to_tzinfo(tz: str) -> object: ... + + +def _ndarray_to_arrow_type(values: object, type_: object) -> object: ... + + +def _is_primitive(type_id: int) -> bool: ... + + +def ensure_type(ty: object) -> DataType: ... 
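# Usage sketch for the IPC stream reader/writer stubs above: write a record
# batch to an in-memory stream and read it back. Data values are illustrative.
import pyarrow as pa
import pyarrow.ipc as ipc

batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3]})
sink = pa.BufferOutputStream()
with ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)

reader = ipc.open_stream(sink.getvalue())
table = reader.read_all()                 # pyarrow.Table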
+ + +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL32: int +Type_DECIMAL64: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTHS: int +Type_INTERVAL_DAY_TIME: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/python/pyarrow-stubs/pyarrow/memory.pyi b/python/pyarrow-stubs/pyarrow/memory.pyi new file mode 100644 index 00000000000..f80e01ab21c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/memory.pyi @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.lib import _Weakrefable + + +class MemoryPool(_Weakrefable): + def release_unused(self) -> None: ... + + def bytes_allocated(self) -> int: ... + + def total_bytes_allocated(self) -> int: ... + + def max_memory(self) -> int | None: ... + + def num_allocations(self) -> int: ... + + def print_stats(self) -> None: ... + + @property + def backend_name(self) -> str: ... + + +class LoggingMemoryPool(MemoryPool): + ... + + +class ProxyMemoryPool(MemoryPool): + ... + + +def default_memory_pool() -> MemoryPool: ... + + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: ... + + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: ... + + +def system_memory_pool() -> MemoryPool: ... + + +def jemalloc_memory_pool() -> MemoryPool: ... + + +def mimalloc_memory_pool() -> MemoryPool: ... + + +def set_memory_pool(pool: MemoryPool) -> None: ... + + +def log_memory_allocations(enable: bool = True) -> None: ... + + +def total_allocated_bytes() -> int: ... + + +def jemalloc_set_decay_ms(decay_ms: int) -> None: ... + + +def supported_memory_backends() -> list[str]: ... 
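# Usage sketch for the memory-pool helpers typed above; which backends are
# available depends on how pyarrow was built.
import pyarrow as pa

pool = pa.default_memory_pool()
print(pool.backend_name, pool.bytes_allocated(), pool.max_memory())
print(pa.supported_memory_backends())

arr = pa.array(range(1_000), memory_pool=pa.system_memory_pool())
print(pa.total_allocated_bytes())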
+ + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/pyarrow-stubs/pyarrow/orc.pyi b/python/pyarrow-stubs/pyarrow/orc.pyi new file mode 100644 index 00000000000..f16350d0ffc --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/orc.pyi @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + + +class ORCFile: + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: ... + + @property + def schema(self) -> Schema: ... + + @property + def nrows(self) -> int: ... + + @property + def nstripes(self) -> int: ... + + @property + def file_version(self) -> str: ... + + @property + def software_version(self) -> str: ... + + @property + def compression(self) -> Literal["UNCOMPRESSED", + "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + + @property + def compression_size(self) -> int: ... + + @property + def writer(self) -> str: ... + + @property + def writer_version(self) -> str: ... + + @property + def row_index_stride(self) -> int: ... + + @property + def nstripe_statistics(self) -> int: ... + + @property + def content_length(self) -> int: ... + + @property + def stripe_statistics_length(self) -> int: ... + + @property + def file_footer_length(self) -> int: ... + + @property + def file_postscript_length(self) -> int: ... + + @property + def file_length(self) -> int: ... + + def read_stripe( + self, n: int, columns: list[str | int] | None = None + ) -> RecordBatch: ... + + def read(self, columns: list[str | int] | None = None) -> Table: ... + + +class ORCWriter: + writer: _orc.ORCWriter + is_open: bool + + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: Any = "0.12", + batch_size: Any = 1024, + stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011 + compression: Any = "UNCOMPRESSED", + compression_block_size: Any = 65536, + compression_strategy: Any = "SPEED", + row_index_stride: Any = 10000, + padding_tolerance: Any = 0.0, + dictionary_key_size_threshold: Any = 0.0, + bloom_filter_columns: Any = None, + bloom_filter_fpp: Any = 0.05, + ): ... + def __enter__(self) -> Self: ... 
+ def __exit__(self, *args, **kwargs) -> None: ... + def __getattr__(self, name: str) -> Any: ... + def write(self, table: Table) -> None: ... + + def close(self) -> None: ... + + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str | int] | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> Table: ... + + +# TODO: should not use Any here? +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: Any = "0.12", + batch_size: Any = 1024, + stripe_size: Any = 64 * 1024 * 1024, # noqa: Y011 + compression: Any = 'UNCOMPRESSED', + compression_block_size: Any = 65536, + compression_strategy: Any = 'SPEED', + row_index_stride: Any = 10000, + padding_tolerance: Any = 0.0, + dictionary_key_size_threshold: Any = 0.0, + bloom_filter_columns: Any = None, + bloom_filter_fpp: Any = 0.05, +) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/pandas_compat.pyi b/python/pyarrow-stubs/pyarrow/pandas_compat.pyi new file mode 100644 index 00000000000..4e614c58a3f --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/pandas_compat.pyi @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table, _pandas_api + +_T = TypeVar("_T") + + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... + + +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... + + +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... + + +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... 
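# Usage sketch for the ORC reader/writer stubs above; the file name and data
# are placeholders.
import pyarrow as pa
import pyarrow.orc as orc

table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
orc.write_table(table, "example.orc")

f = orc.ORCFile("example.orc")
print(f.nrows, f.nstripes, f.schema)
restored = orc.read_table("example.orc", columns=["a"])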
+ + +def table_to_dataframe( + options, + table: Table, + categories=None, + ignore_metadata: bool = False, + types_mapper=None) -> pd.DataFrame: ... + + +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... + + +__all__ = [ + "_pandas_api", +] diff --git a/python/pyarrow-stubs/pyarrow/pandas_shim.pyi b/python/pyarrow-stubs/pyarrow/pandas_shim.pyi new file mode 100644 index 00000000000..181d78e7a0c --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/pandas_shim.pyi @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import types as stdlib_types +from collections.abc import Iterable +from typing import Any, TypeGuard + +from pandas import Categorical, DatetimeTZDtype, Index, Series, DataFrame + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> Series: ... + def data_frame(self, *args, **kwargs) -> DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> stdlib_types.ModuleType: ... + @property + def pd(self) -> stdlib_types.ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + def uses_string_dtype(self) -> bool: ... + @property + def categorical_type(self) -> type[Categorical]: ... + @property + def datetimetz_type(self) -> type[DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + + def is_array_like( + self, obj: Any + ) -> TypeGuard[Series | Index | Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[Series]: ... + def is_index(self, obj: Any) -> TypeGuard[Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi b/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi new file mode 100644 index 00000000000..5329bd6c66a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/parquet/__init__.pyi @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .core import * # noqa: F401, F403 diff --git a/python/pyarrow-stubs/pyarrow/parquet/core.pyi b/python/pyarrow-stubs/pyarrow/parquet/core.pyi new file mode 100644 index 00000000000..3be44c2e58a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/parquet/core.pyi @@ -0,0 +1,371 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Callable, Iterator, Iterable, Sequence +from typing import IO, Literal + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning, PartitioningFactory +from pyarrow.lib import Buffer, NativeFile, RecordBatch, Schema, Table, ChunkedArray +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + + +def filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + + +@deprecated("use filters_to_expression") +def _filters_to_expression( + filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... 
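# Usage sketch for filters_to_expression as typed above: DNF filters
# (outer list = OR, inner list = AND) become a compute Expression. Column
# names are made up.
import pyarrow.parquet as pq

expr = pq.filters_to_expression(
    [[("year", "=", 2024), ("value", ">", 0)], [("year", "=", 2023)]]
)
print(expr)   # reusable as the filter/filters argument of parquet/dataset reads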
+ + +_Compression: TypeAlias = Literal["gzip", "bz2", + "brotli", "lz4", "zstd", "snappy", "none"] + + +class ParquetFile: + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | Buffer | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + + def read_row_group( + self, + i: int, + columns: Sequence[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def read_row_groups( + self, + row_groups: Sequence[int], + columns: Iterable[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def iter_batches( + self, + batch_size: int = 65536, + row_groups: Sequence[int] | None = None, + columns: Iterable[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: ... + + def read( + self, + columns: Sequence[str | int] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + + def scan_contents( + self, columns: Iterable[str | int] | None = None, batch_size: int = 65536 + ) -> int: ... + + +class ParquetWriter: + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: ... 
+ def write_batch(self, batch: RecordBatch, + row_group_size: int | None = None) -> None: ... + + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def close(self) -> None: ... + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: ... + + +class ParquetDataset: + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression + | FilterTuple + | list[FilterTuple] + | list[list[FilterTuple]] + | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str + | list[str] + | Partitioning + | PartitioningFactory + | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: ... + + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: ... + def read_pandas(self, **kwargs) -> Table: ... + @property + def fragments(self) -> list[ParquetFileFragment]: ... + @property + def files(self) -> list[str]: ... + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning: ... + + +def read_table( + source: SingleOrList[str] + | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO] | Buffer, + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | PartitioningFactory | None = "hive", + filesystem: SupportedFileSystem | str | None = None, + filters: Expression + | FilterTuple + | list[FilterTuple] + | Sequence[Sequence[tuple]] + | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: ... + + +def read_pandas( + source: str | Path | NativeFile | IO | Buffer, columns: list | None = None, **kwargs +) -> Table: ... 
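# Usage sketch for the ParquetFile / read_table stubs above; the file name and
# data are placeholders.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"year": [2023, 2024], "value": [1.5, 2.5]})
pq.write_table(table, "example.parquet", compression="zstd")

pf = pq.ParquetFile("example.parquet")
print(pf.metadata.num_rows, pf.schema_arrow)
for batch in pf.iter_batches(batch_size=1, columns=["value"]):
    print(batch.num_rows)

restored = pq.read_table("example.parquet", filters=[("year", "=", 2024)])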
+ + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | str | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: ... + + +def write_to_dataset( + table: Table | ChunkedArray, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: ... + + +def write_metadata( + schema: Schema, + where: str | NativeFile, + metadata_collector: list[FileMetaData] | None = None, + filesystem: SupportedFileSystem | None = None, + **kwargs, +) -> None: ... + + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> FileMetaData: ... + + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | str | None = None, +) -> Schema: ... diff --git a/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi b/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi new file mode 100644 index 00000000000..fe9a454e593 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/parquet/encryption.pyi @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
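# Usage sketch for write_to_dataset / read_metadata / read_schema as typed
# above; the paths are placeholders.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"year": [2023, 2023, 2024], "value": [1, 2, 3]})
pq.write_to_dataset(table, "dataset_root", partition_cols=["year"])

pq.write_table(table, "single.parquet")
print(pq.read_metadata("single.parquet").num_rows)
print(pq.read_schema("single.parquet"))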
+ +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/pyarrow-stubs/pyarrow/scalar.pyi b/python/pyarrow-stubs/pyarrow/scalar.pyi new file mode 100644 index 00000000000..70b2ea2b347 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/scalar.pyi @@ -0,0 +1,466 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import collections.abc +import datetime as dt +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from collections.abc import Iterator +from typing import Any, Generic, Literal + +import numpy as np + +from pyarrow._compute import CastOptions +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from pyarrow.table import ArrayOrChunkedArray +from typing_extensions import TypeVar + +from ._types import ( # noqa: F401 + DataType, + Decimal128Type, + Date32Type, + Date64Type, + Time32Type, + Time64Type, + TimestampType, + Decimal256Type, + NullType, + BoolType, + UInt8Type, + Int8Type, + DurationType, MonthDayNanoIntervalType, BinaryType, LargeBinaryType, + FixedSizeBinaryType, StringType, LargeStringType, BinaryViewType, StringViewType, + FixedSizeListType, + Float16Type, Float32Type, Float64Type, Decimal32Type, Decimal64Type, + LargeListType, + LargeListViewType, + ListType, + ListViewType, + OpaqueType, DictionaryType, MapType, _BasicDataType, + StructType, RunEndEncodedType, + UInt16Type, Int16Type, UInt32Type, Int32Type, UInt64Type, Int64Type, + UnionType, ExtensionType, BaseExtensionType, Bool8Type, UuidType, JsonType, + _BasicValueT, + _DataTypeT, + _IndexT, + _K, + _Precision, + _RunEndType, + _Scale, + _Size, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, + _ValueT, +) + +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=DataType, covariant=True) + + +class Scalar(_Weakrefable, Generic[_DataType_co]): + @property + def type(self) -> _DataType_co: ... + + @property + def is_valid(self) -> bool: ... + + def cast( + self, + target_type: None | _DataTypeT | str, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self | Scalar[_DataTypeT] | Scalar[Any]: ... + + def validate(self, *, full: bool = False) -> None: ... + + def equals(self, other: Scalar | ArrayOrChunkedArray) -> bool: ... + + def __hash__(self) -> int: ... + + def as_py(self: Scalar[Any], *, maps_as_pydicts: Literal["lossy", + "strict"] | None = None) -> Any: ... + + def as_buffer(self) -> Buffer | None: ... 
+ + # Buffer protocol support + def __buffer__(self, flags: int) -> memoryview: ... + + # Methods for structured types (StructScalar, MapScalar, ListScalar, etc.) + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[Any]: ... + + def __getitem__(self, key: int | str) -> Any: ... + + def __contains__(self, key: object) -> bool: ... + + def keys(self) -> Iterator[str]: ... + + def items(self) -> Iterator[tuple[str, Any]]: ... + + @property + def values(self) -> Any: ... + + # Methods for compatibility with array-like interface + def to_pylist(self) -> list: ... + def tolist(self) -> list: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> Any: ... + + +_NULL: NullScalar +NA: NullScalar + + +class NullScalar(Scalar[NullType]): + ... + + +class BooleanScalar(Scalar[BoolType]): + ... + + +class UInt8Scalar(Scalar[UInt8Type]): + ... + + +class Int8Scalar(Scalar[Int8Type]): + ... + + +class UInt16Scalar(Scalar[UInt16Type]): + ... + + +class Int16Scalar(Scalar[Int16Type]): + ... + + +class UInt32Scalar(Scalar[UInt32Type]): + ... + + +class Int32Scalar(Scalar[Int32Type]): + ... + + +class UInt64Scalar(Scalar[UInt64Type]): + ... + + +class Int64Scalar(Scalar[Int64Type]): + ... + + +class HalfFloatScalar(Scalar[Float16Type]): + ... + + +class FloatScalar(Scalar[Float32Type]): + ... + + +class DoubleScalar(Scalar[Float64Type]): + ... + + +class Decimal32Scalar(Scalar[Decimal32Type[_Precision, _Scale]]): + ... + + +class Decimal64Scalar(Scalar[Decimal64Type[_Precision, _Scale]]): + ... + + +class Decimal128Scalar(Scalar[Decimal128Type[_Precision, _Scale]]): + ... + + +class Decimal256Scalar(Scalar[Decimal256Type[_Precision, _Scale]]): + ... + + +class Date32Scalar(Scalar[Date32Type]): + ... + + +class Date64Scalar(Scalar[Date64Type]): + @property + def value(self) -> dt.date | None: ... + + +class Time32Scalar(Scalar[Time32Type[_Time32Unit]]): + @property + def value(self) -> dt.time | None: ... + + +class Time64Scalar(Scalar[Time64Type[_Time64Unit]]): + @property + def value(self) -> dt.time | None: ... + + +class TimestampScalar(Scalar[TimestampType[_Unit, _Tz]]): + @property + def value(self) -> int | None: ... + + +class DurationScalar(Scalar[DurationType[_Unit]]): + @property + def value(self) -> dt.timedelta | None: ... + + +class MonthDayNanoIntervalScalar(Scalar[MonthDayNanoIntervalType]): + @property + def value(self) -> MonthDayNano | None: ... + + +class BinaryScalar(Scalar[BinaryType]): + def as_buffer(self) -> Buffer: ... + + +class LargeBinaryScalar(Scalar[LargeBinaryType]): + def as_buffer(self) -> Buffer: ... + + +class FixedSizeBinaryScalar(Scalar[FixedSizeBinaryType]): + def as_buffer(self) -> Buffer: ... + + +class StringScalar(Scalar[StringType]): + def as_buffer(self) -> Buffer: ... + + +class LargeStringScalar(Scalar[LargeStringType]): + def as_buffer(self) -> Buffer: ... + + +class BinaryViewScalar(Scalar[BinaryViewType]): + def as_buffer(self) -> Buffer: ... + + +class StringViewScalar(Scalar[StringViewType]): + def as_buffer(self) -> Buffer: ... + + +class ListScalar(Scalar[ListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class FixedSizeListScalar(Scalar[FixedSizeListType[_DataTypeT, _Size]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... 
+ + def __iter__(self) -> Iterator[Array]: ... + + +class LargeListScalar(Scalar[LargeListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class ListViewScalar(Scalar[ListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class LargeListViewScalar(Scalar[LargeListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> Scalar[_DataTypeT]: ... + + def __iter__(self) -> Iterator[Array]: ... + + +class StructScalar(Scalar[StructType], collections.abc.Mapping[str, Scalar]): + def __len__(self) -> int: ... + + def __iter__(self) -> Iterator[str]: ... + + def __getitem__(self, key: int | str) -> Scalar[Any]: ... + + def keys(self) -> collections.abc.KeysView[str]: # type: ignore[override] + ... + + def items(self) -> collections.abc.ItemsView[str, Scalar[Any]]: # type: ignore[override] # noqa: E501 + ... + + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + + +class MapScalar(Scalar[MapType[_K, _ValueT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + + def __getitem__(self, i: int | str) -> ( + tuple[Scalar[_K], _ValueT, Any] | Scalar[Any]): ... + + def __iter__(self: Scalar[ + MapType[_BasicDataType[_AsPyTypeK], _BasicDataType[_AsPyTypeV]]] + | Scalar[MapType[Any, _BasicDataType[_AsPyTypeV]]] + | Scalar[MapType[_BasicDataType[_AsPyTypeK], Any]]) -> ( + Iterator[tuple[_AsPyTypeK, _AsPyTypeV]] + | Iterator[tuple[Any, _AsPyTypeV]] + | Iterator[tuple[_AsPyTypeK, Any]] + ): ... + + +class DictionaryScalar(Scalar[DictionaryType[_IndexT, _BasicValueT]]): + @property + def index(self) -> Scalar[_IndexT]: ... + + @property + def value(self) -> Scalar[_BasicValueT]: ... + + @property + def dictionary(self) -> Array: ... + + +class RunEndEncodedScalar(Scalar[RunEndEncodedType[_RunEndType, _BasicValueT]]): + @property + def value(self) -> tuple[int, _BasicValueT] | None: ... + + +class UnionScalar(Scalar[UnionType]): + @property + def value(self) -> Any | None: ... + + @property + def type_code(self) -> str: ... + + +class ExtensionScalar(Scalar[ExtensionType]): + @property + def value(self) -> Any | None: ... + + @staticmethod + def from_storage(typ: BaseExtensionType, value) -> ExtensionScalar: ... + + +class Bool8Scalar(Scalar[Bool8Type]): + ... + + +class UuidScalar(Scalar[UuidType]): + ... + + +class JsonScalar(Scalar[JsonType]): + ... + + +class OpaqueScalar(Scalar[OpaqueType]): + ... + + +class FixedShapeTensorScalar(ExtensionScalar): + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> ( + np.ndarray): ... # type: ignore[override] + + def to_tensor(self) -> Tensor: ... + + +def scalar( + value: Any, + type: _DataTypeT | str | None = None, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT] | Scalar[Any]: ... 
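# Usage sketch for the scalar classes typed above; values are illustrative.
import datetime as dt
import pyarrow as pa

i = pa.scalar(42)                        # Int64Scalar
f = pa.scalar(1.5, type=pa.float32())    # FloatScalar
d = pa.scalar(dt.date(2024, 1, 1))       # Date32Scalar
lst = pa.scalar([1, 2, 3])               # ListScalar
print(i.as_py(), f.type, d.cast(pa.date64()).as_py())
print(lst.values, lst[0].as_py())        # underlying Array and element Scalar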
+ + +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "scalar", +] diff --git a/python/pyarrow-stubs/pyarrow/substrait.pyi b/python/pyarrow-stubs/pyarrow/substrait.pyi new file mode 100644 index 00000000000..b78bbd8aebd --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/substrait.pyi @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] diff --git a/python/pyarrow-stubs/pyarrow/table.pyi b/python/pyarrow-stubs/pyarrow/table.pyi new file mode 100644 index 00000000000..6dd61674d40 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/table.pyi @@ -0,0 +1,686 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
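# Usage sketch for the Substrait stubs above, assuming a pyarrow build with
# Substrait support; this only queries which functions the local engine
# advertises, the simplest call in that module.
import pyarrow.substrait as substrait

funcs = substrait.get_supported_functions()
print(len(funcs), funcs[:3])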
+ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from collections.abc import ( + Collection, Generator, Iterable, Iterator, Sequence, Mapping) +from typing import Any, Generic, Literal, TypeVar +import builtins + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, MemoryManager, MemoryPool, Schema +from pyarrow.lib import Field as _Field + +from .array import Array, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from ._ipc import RecordBatchReader +from .scalar import BooleanScalar, Int64Scalar, Scalar, StructScalar +from .tensor import Tensor +from ._stubs_typing import NullableCollection +from ._types import DataType, _AsPyType, _BasicDataType, _DataTypeT + +Field: TypeAlias = _Field[DataType] +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +ArrayOrChunkedArray: TypeAlias = Array[_Scalar_co] | ChunkedArray[_Scalar_co] + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed | str +AggregateOptions: TypeAlias = (ScalarAggregateOptions | CountOptions + | TDigestOptions | VarianceOptions | FunctionOptions) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + + def as_py(self) -> list[Any]: ... + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: ... + + # Private attribute used internally for column names + _name: str | None + + def length(self) -> int: ... + + __len__ = length + + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: ... + + format = to_string + def validate(self, *, full: bool = False) -> None: ... + + @property + def null_count(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... 
+ + def __getitem__( + self, key: int | np.integer | builtins.slice) -> _Scalar_co | Self: ... + + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[BooleanScalar]: ... + + def is_nan(self) -> ChunkedArray[BooleanScalar]: ... + + def is_valid(self) -> ChunkedArray[BooleanScalar]: ... + + def cast( + self, target_type: _CastAs | str | None, safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None + ) -> Self | ChunkedArray[Scalar[_CastAs]]: ... + + def fill_null(self, fill_value: Scalar[_DataTypeT] | Any) -> Self: ... + + def equals(self, other: Self | Any) -> bool: ... + + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: ... + + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: ... + + def flatten(self, memory_pool: MemoryPool | + None = None) -> list[ChunkedArray[Any]]: ... + + def combine_chunks(self, memory_pool: MemoryPool | + None = None) -> Array[_Scalar_co]: ... + + def unique(self) -> ChunkedArray[_Scalar_co]: ... + + def value_counts(self) -> StructArray: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def filter(self, mask: Mask, + null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... + + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + + def take(self, indices: Indices) -> Self: ... + + def drop_null(self) -> Self: ... + + def sort(self, order: Order = "ascending", **kwargs) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + @property + def num_chunks(self) -> int: ... + + def chunk(self, i: int) -> Array[_Scalar_co]: ... + + @property + def chunks(self) -> list[Array[_Scalar_co]]: ... + + def iterchunks( + self: ArrayOrChunkedArray[_ScalarT], + ) -> Generator[Array, None, None]: ... + + def __iter__(self) -> Iterator[_Scalar_co]: ... + + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: ... + + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: ... + + @property + def is_cpu(self) -> bool: ... + + +def chunked_array( + arrays: Iterable[NullableCollection[Any]] + | Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray] + | Iterable[Array[_ScalarT]] | Array[_ScalarT] + | SupportArrowArray | SupportArrowStream, + type: DataType | str | None = None, +) -> ChunkedArray[Scalar[Any]] | ChunkedArray[_ScalarT]: ... + + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, + copy: bool | None = None) -> np.ndarray: ... + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: ... + + def __getitem__(self, key: int | str | slice) -> _ColumnT | Self: ... + + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: ... + + @property + def column_names(self) -> list[str]: ... + + @property + def columns(self) -> list[_ColumnT]: ... 
+ + def drop_null(self) -> Self: ... + + def field(self, i: int | str) -> Field: ... + + @classmethod + def from_pydict( + cls, + mapping: + Mapping[Any, ArrayOrChunkedArray[Any] | list[Any] | np.ndarray | range], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + ) -> Self: ... + + def itercolumns(self) -> Generator[_ColumnT, None, None]: ... + + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: ... + + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: ... + + def take(self, indices: Indices) -> Self: ... + + def filter( + self, + mask: Mask | Expression, + null_selection_behavior: NullSelectionBehavior = "drop") -> Self: ... + + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list[Any]]: ... + + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: ... + + def to_string(self, *, show_metadata: bool = False, + preview_cols: int = 0) -> str: ... + + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: ... + + def add_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def append_column( + self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list[list[Any]] + ) -> Self: ... + + +class RecordBatch(_Tabular[Array]): + def validate(self, *, full: bool = False) -> None: ... + + def replace_schema_metadata( + self, + metadata: dict[str, str] + | dict[bytes, bytes] + | dict[bytes, str] + | dict[str, bytes] + | None = None + ) -> Self: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def schema(self) -> Schema: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ... + + def select(self, columns: Iterable[str] | + Iterable[int] | NDArray[np.str_]) -> Self: ... + + def cast(self, target_schema: Schema, safe: bool | None = None, + options: CastOptions | None = None) -> Self: ... + + @classmethod + def from_arrays( + cls, + arrays: Iterable[Any], + names: list[str] | tuple[str, ...] | None = None, + schema: Schema | None = None, + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] + | None = None, + ) -> Self: ... 
+ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: Sequence[str | int] | None = None, + ) -> Self: ... + + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: ... + + def to_struct_array(self) -> StructArray: ... + + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: ... + + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): ... + + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_array__(self, requested_schema=None): ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: ... + + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: ... + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): ... + + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + + @property + def device_type(self) -> DeviceAllocationType: ... + + @property + def is_cpu(self) -> bool: ... + + def copy_to(self, destination: MemoryManager | Device) -> Self: ... + + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + + +class Table(_Tabular[ChunkedArray[Any]]): + def validate(self, *, full: bool = False) -> None: ... + + def slice(self, offset: int = 0, length: int | None = None) -> Self: ... + + def select(self, columns: Iterable[str] | + Iterable[int] | NDArray[np.str_]) -> Self: ... + + def replace_schema_metadata( + self, metadata: dict[str, str] + | dict[bytes, bytes] + | dict[bytes, str] + | dict[str, bytes] + | None = None + ) -> Self: ... + + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: ... + + def equals(self, other: Self | Any, check_metadata: bool = False) -> bool: ... + + def cast(self, target_schema: Schema, safe: bool | None = None, + options: CastOptions | None = None) -> Self: ... + + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: Sequence[str | int] | None = None, + safe: bool = True, + ) -> Self: ... + + @classmethod + def from_arrays( + cls, + arrays: + Collection[ArrayOrChunkedArray[Any] | Collection[NDArray[Any]] | list[Any]], + names: list[str] | tuple[str, ...] | None = None, + schema: Schema | None = None, + metadata: Mapping[bytes, bytes] + | Mapping[str, str] + | Mapping[bytes, str] + | Mapping[str, bytes] | None = None, + ) -> Self: ... + + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[StructScalar] + ) -> Self: ... + + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[StructScalar]: ... + + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], + schema: Schema | None = None) -> Self: ... 
+ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: ... + + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: ... + + @property + def schema(self) -> Schema: ... + + @property + def num_columns(self) -> int: ... + + @property + def num_rows(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def get_total_buffer_size(self) -> int: ... + + def __sizeof__(self) -> int: ... + + def add_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def remove_column(self, i: int) -> Self: ... + + def set_column(self, i: int, field_: str | Field, + column: ArrayOrChunkedArray[Any] | list[list[Any]]) -> Self: ... + + def rename_columns(self, names: list[str] | dict[str, str]) -> Self: ... + + def drop(self, columns: str | list[str]) -> Self: ... + + def group_by(self, keys: str | list[str], + use_threads: bool = True) -> TableGroupBy: ... + + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: ... + + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: ... + + def __arrow_c_stream__(self, requested_schema=None): ... + + @property + def is_cpu(self) -> bool: ... + + +def record_batch( + data: Mapping[str, list[Any] | Array[Any]] + | Collection[Array[Any] | ChunkedArray[Any] | list[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | Schema | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, +) -> RecordBatch: ... + + +def table( + data: Collection[ArrayOrChunkedArray[Any] | list[Any] | range | str] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray + | Mapping[str, list[Any] | Array[Any] | ChunkedArray[Any] | range] + | Mapping[str, Any], + names: list[str] | Schema | None = None, + schema: Schema | None = None, + metadata: Mapping[str | bytes, str | bytes] | None = None, + nthreads: int | None = None, +) -> Table: ... + + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: ... + + +class TableGroupBy: + + keys: str | list[str] + + def __init__(self, table: Table, keys: str | + list[str], use_threads: bool = True): ... + + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: ... + + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: ... 
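# Usage sketch for the grouping API typed above: aggregate() takes
# (ColumnSelector, Aggregation) pairs, optionally with AggregateOptions
# (the column names here are illustrative only).
import pyarrow as pa
import pyarrow.compute as pc

t = pa.table({"key": ["a", "a", "b"], "value": [1, None, 3]})
result = t.group_by("key").aggregate([
    ("value", "sum"),                                        # plain aggregation
    ("value", "count", pc.CountOptions(mode="only_valid")),  # with options
])
# result has columns value_sum, value_count and the grouping key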
+ + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", + "Aggregation", + "AggregateOptions", +] diff --git a/python/pyarrow-stubs/pyarrow/tensor.pyi b/python/pyarrow-stubs/pyarrow/tensor.pyi new file mode 100644 index 00000000000..ba40c7b299d --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/tensor.pyi @@ -0,0 +1,268 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from collections.abc import Sequence +import numpy as np + +from pyarrow.lib import _Weakrefable +from pyarrow._types import DataType +from scipy.sparse import coo_matrix, csr_matrix +from sparse import COO # type: ignore[import-untyped, import-not-found] + + +class Tensor(_Weakrefable): + @classmethod + def from_numpy(cls, obj: np.ndarray, + dim_names: Sequence[str] | None = None) -> Self: ... + + def to_numpy(self) -> np.ndarray: ... + + def equals(self, other: Tensor) -> bool: ... + + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... + + @property + def is_mutable(self) -> bool: ... + + @property + def is_contiguous(self) -> bool: ... + + @property + def ndim(self) -> int: ... + + @property + def size(self) -> str: ... + + @property + def shape(self) -> tuple[int, ...]: ... + + @property + def strides(self) -> tuple[int, ...]: ... + + @property + def type(self) -> DataType: ... + + +class SparseCOOTensor(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, + dim_names: list[str] | None = None) -> Self: ... + + @classmethod + def from_numpy( + cls, + data: np.ndarray, + coords: np.ndarray, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + ) -> Self: ... + + @classmethod + def from_scipy(cls, obj: csr_matrix, + dim_names: Sequence[str] | None = None) -> Self: ... + + @classmethod + def from_pydata_sparse( + cls, obj: COO, dim_names: Sequence[str] | None = None) -> Self: ... + + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: ... + + def to_scipy(self) -> coo_matrix: ... + + def to_pydata_sparse(self) -> COO: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... 
+ @property + def has_canonical_format(self) -> bool: ... + @property + def type(self) -> DataType: ... + + +class SparseCSRMatrix(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, + dim_names: list[str] | None = None) -> Self: ... + + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + ) -> Self: ... + + @classmethod + def from_scipy(cls, obj: csr_matrix, + dim_names: Sequence[str] | None = None) -> Self: ... + + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + + def to_scipy(self) -> csr_matrix: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + @property + def type(self) -> DataType: ... + + +class SparseCSCMatrix(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, + dim_names: list[str] | None = None) -> Self: ... + + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: ... + + @classmethod + def from_scipy(cls, obj: csr_matrix, + dim_names: list[str] | None = None) -> Self: ... + + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + + def to_scipy(self) -> csr_matrix: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + + +class SparseCSFTensor(_Weakrefable): + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, + dim_names: Sequence[str] | None = None) -> Self: ... + + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: Sequence[np.ndarray], + indices: Sequence[np.ndarray], + shape: tuple[int, ...], + axis_order: Sequence[int] | None = None, + dim_names: Sequence[str] | None = None, + ) -> Self: ... + + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: ... + + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ... + + def to_tensor(self) -> Tensor: ... + + def equals(self, other: Self) -> bool: ... + + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: ... + + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + @property + def type(self) -> DataType: ... 
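# Usage sketch for the Tensor interface typed above: a zero-copy round trip
# from NumPy (the dim_names are illustrative only).
import numpy as np
import pyarrow as pa

data = np.arange(6, dtype=np.int64).reshape(2, 3)
tensor = pa.Tensor.from_numpy(data, dim_names=["row", "col"])
assert tensor.shape == (2, 3)
assert tensor.dim_name(0) == "row"
assert np.array_equal(tensor.to_numpy(), data)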
+ + +__all__ = [ + "Tensor", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", +] diff --git a/python/pyarrow-stubs/pyarrow/tests/util.pyi b/python/pyarrow-stubs/pyarrow/tests/util.pyi new file mode 100644 index 00000000000..5ceb784588a --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/tests/util.pyi @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable +from contextlib import AbstractContextManager +from decimal import Decimal +from os import PathLike +from typing import Any, Literal +import socket + +import pyarrow.fs + + +def randsign() -> int: ... +def random_seed(seed: int) -> AbstractContextManager[None]: ... +def randdecimal(precision: int, scale: int) -> Decimal: ... +def random_ascii(length: int) -> bytes: ... +def rands(nchars: int) -> str: ... +def get_modified_env_with_pythonpath() -> dict[str, str]: ... +def invoke_script(script_name: str, *args: str) -> None: ... +def changed_environ(name: str, value: str) -> AbstractContextManager[None]: ... +def change_cwd(path: str | PathLike[str]) -> AbstractContextManager[None]: ... +def disabled_gc() -> AbstractContextManager[None]: ... +def _filesystem_uri(path: str) -> str: ... + + +def memory_leak_check( + f: Callable[[], Any], + metric: Literal['rss', 'vms', 'shared'] = 'rss', + threshold: int = 131072, + iterations: int = 10, + check_interval: int = 1 +) -> None: ... + + +class FSProtocolClass: + def __init__(self, path: str | PathLike[str]) -> None: ... + def __fspath__(self) -> str: ... + + +class ProxyHandler(pyarrow.fs.FileSystemHandler): + _fs: pyarrow.fs.FileSystem + def __init__(self, fs: pyarrow.fs.FileSystem) -> None: ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... + def get_type_name(self) -> str: ... + def normalize_path(self, path: str) -> str: ... + def get_file_info(self, paths: list[str]) -> list[pyarrow.fs.FileInfo]: ... + def get_file_info_selector( + self, selector: pyarrow.fs.FileSelector) -> list[pyarrow.fs.FileInfo]: ... + + def create_dir(self, path: str, recursive: bool) -> None: ... + def delete_dir(self, path: str) -> None: ... + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: ... + def delete_root_dir_contents(self) -> None: ... + def delete_file(self, path: str) -> None: ... + def move(self, src: str, dest: str) -> None: ... + def copy_file(self, src: str, dest: str) -> None: ... + def open_input_stream(self, path: str) -> Any: ... + def open_input_file(self, path: str) -> Any: ... + def open_output_stream(self, path: str, metadata: dict[str, str]) -> Any: ... + def open_append_stream(self, path: str, metadata: dict[str, str]) -> Any: ... 
+ + +def _ensure_minio_component_version(component: str, minimum_year: int) -> bool: ... +def _run_mc_command(mcdir: str, *args: str) -> None: ... +def windows_has_tzdata() -> bool: ... +def running_on_musllinux() -> bool: ... + + +def signal_wakeup_fd( + *, warn_on_full_buffer: bool = False) -> AbstractContextManager[socket.socket]: ... + + +def _configure_s3_limited_user( + s3_server: dict[str, Any], policy: str, username: str, password: str) -> None: ... + + +def _wait_for_minio_startup( + mcdir: str, address: str, access_key: str, secret_key: str) -> None: ... diff --git a/python/pyarrow-stubs/pyarrow/types.pyi b/python/pyarrow-stubs/pyarrow/types.pyi new file mode 100644 index 00000000000..9e5a0568db0 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/types.pyi @@ -0,0 +1,227 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +from enum import IntEnum + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +import pyarrow.lib as lib + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + UInt32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | UInt32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = (TimestampType[Any, Any] + | DurationType[Any] | _Time | _Date | _Interval) +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | _Union +) + + +def 
is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[UInt32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... +def is_boolean_value(obj: Any) -> bool: ... +def is_integer_value(obj: Any) -> bool: ... +def is_float_value(obj: Any) -> bool: ... 
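# Usage sketch for the predicates above: annotating them with TypeIs lets a
# type checker narrow the DataType argument after the check (the narrowing
# noted in the comments assumes these stub annotations).
import pyarrow as pa
import pyarrow.types as pat

def describe(t: pa.DataType) -> str:
    if pat.is_timestamp(t):
        # narrowed to TimestampType: .unit and .tz now type-check
        return f"timestamp[{t.unit}] tz={t.tz}"
    if pat.is_decimal(t):
        # narrowed to a Decimal*Type: .precision and .scale are available
        return f"decimal({t.precision}, {t.scale})"
    return str(t)

assert describe(pa.timestamp("ms", tz="UTC")) == "timestamp[ms] tz=UTC"
assert describe(pa.decimal128(10, 2)) == "decimal(10, 2)"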
+ + +__all__ = [ + "lib", + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] + + +class TypesEnum(IntEnum): + INTERVAL_MONTHS = 0 + INTERVAL_DAY_TIME = 1 + INTERVAL_MONTH_DAY_NANO = 2 diff --git a/python/pyarrow-stubs/pyarrow/util.pyi b/python/pyarrow-stubs/pyarrow/util.pyi new file mode 100644 index 00000000000..c3317960c81 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/util.pyi @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Callable, Sequence +from os import PathLike +from typing import Any, Protocol, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... + + +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... +def _break_traceback_cycle_from_frame(frame) -> None: ... 
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index da2fe966475..46e092ad8f7 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -29,19 +29,17 @@ For more information see the official page at https://arrow.apache.org """ -import gc as _gc import importlib as _importlib import os as _os import platform as _platform import sys as _sys -import warnings as _warnings try: - from ._generated_version import version as __version__ + from ._generated_version import version as __version__ # type: ignore[import-untyped] # noqa: E501 except ImportError: # Package is not installed, parse git tag at runtime try: - import setuptools_scm + import setuptools_scm # type: ignore[import-not-found, import-untyped] # Code duplicated from setup.py to avoid a dependency on each other def parse_git(root, **kwargs): @@ -49,14 +47,14 @@ def parse_git(root, **kwargs): Parse function for setuptools_scm that ignores tags for non-C++ subprojects, e.g. apache-arrow-js-XXX tags. """ - from setuptools_scm.git import parse + from setuptools_scm.git import parse # type: ignore[import-not-found, import-untyped] # noqa: E501 kwargs['describe_command'] = \ "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" return parse(root, **kwargs) __version__ = setuptools_scm.get_version('../', parse=parse_git) except ImportError: - __version__ = None + __version__ = None # type: ignore[assignment] import pyarrow.lib as _lib from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, @@ -153,6 +151,8 @@ def print_entry(label, value): print(f" {codec: <20}: {status: <8}") +from pyarrow.lib import ( + DataType, Array, MemoryPool) # type: ignore[reportAttributeAccessIssue] from pyarrow.lib import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, @@ -170,7 +170,7 @@ def print_entry(label, value): bool8, fixed_shape_tensor, json_, opaque, uuid, field, type_for_alias, - DataType, DictionaryType, StructType, + DictionaryType, StructType, ListType, LargeListType, FixedSizeListType, ListViewType, LargeListViewType, MapType, UnionType, SparseUnionType, DenseUnionType, @@ -187,8 +187,7 @@ def print_entry(label, value): Field, Schema, schema, - unify_schemas, - Array, Tensor, + unify_schemas, Tensor, array, chunked_array, record_batch, nulls, repeat, SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix, SparseCSFTensor, @@ -243,7 +242,7 @@ def print_entry(label, value): from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, Codec, compress, decompress, allocate_buffer) -from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, +from pyarrow.lib import (LoggingMemoryPool, ProxyMemoryPool, total_allocated_bytes, set_memory_pool, default_memory_pool, system_memory_pool, jemalloc_memory_pool, mimalloc_memory_pool, @@ -365,7 +364,7 @@ def create_library_symlinks(): if _sys.platform == 'linux': bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*')) - def get_symlink_path(hard_path): + def get_symlink_path(hard_path): # type: ignore[reportRedeclaration] return hard_path.rsplit('.', 1)[0] else: bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib')) diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py index e475e8db5c2..cd99a1bbc53 100644 --- a/python/pyarrow/acero.py +++ b/python/pyarrow/acero.py @@ -22,7 +22,7 @@ # distutils: language = c++ # cython: language_level = 3 -from pyarrow.lib import Table, RecordBatch, array +from pyarrow.lib import Table, RecordBatch, array, Schema from 
pyarrow.compute import Expression, field try: @@ -49,11 +49,14 @@ except ImportError: class DatasetModuleStub: class Dataset: - pass + @property + def schema(self): + return Schema() class InMemoryDataset: - pass - ds = DatasetModuleStub + def __init__(self, source): + pass + ds = DatasetModuleStub # type: ignore[assignment] def _dataset_to_decl(dataset, use_threads=True, implicit_ordering=False): @@ -306,7 +309,7 @@ def _perform_join_asof(left_operand, left_on, left_by, # AsofJoin does not return on or by columns for right_operand. right_columns = [ col for col in right_operand.schema.names - if col not in [right_on] + right_by + if col not in [right_on] + right_by # type: ignore[reportOperatorIssue] ] columns_collisions = set(left_operand.schema.names) & set(right_columns) if columns_collisions: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index bf5beab589d..109d8ebe597 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3634,7 +3634,7 @@ cdef class FixedSizeListArray(BaseListArray): Or create from a values array, list size and matching type: >>> typ = pa.list_(pa.field("values", pa.int64()), 2) - >>> arr = pa.FixedSizeListArray.from_arrays(values,type=typ) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) >>> arr [ diff --git a/python/pyarrow/benchmark.py b/python/pyarrow/benchmark.py index 25ee1141f08..0ee9063a9a7 100644 --- a/python/pyarrow/benchmark.py +++ b/python/pyarrow/benchmark.py @@ -18,4 +18,4 @@ # flake8: noqa -from pyarrow.lib import benchmark_PandasObjectIsNull +from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[attr-defined] diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py index 1da1a916914..e5a1c9c1d07 100644 --- a/python/pyarrow/cffi.py +++ b/python/pyarrow/cffi.py @@ -16,8 +16,15 @@ # under the License. 
from __future__ import absolute_import +from typing import TYPE_CHECKING -import cffi +if TYPE_CHECKING: + import cffi +else: + try: + import cffi + except ImportError: + pass c_source = """ struct ArrowSchema { diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index fe0afdb0a87..259dd5eb94d 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -106,7 +106,7 @@ import warnings import pyarrow as pa -from pyarrow import _compute_docstrings +from pyarrow import _compute_docstrings # type: ignore[reportAttributeAccessIssue] from pyarrow.vendored import docscrape @@ -241,7 +241,7 @@ def _handle_options(name, options_class, options, args, kwargs): def _make_generic_wrapper(func_name, func, options_class, arity): if options_class is None: - def wrapper(*args, memory_pool=None): + def wrapper(*args, memory_pool=None): # type: ignore[misc] if arity is not Ellipsis and len(args) != arity: raise TypeError( f"{func_name} takes {arity} positional argument(s), " @@ -251,7 +251,8 @@ def wrapper(*args, memory_pool=None): return Expression._call(func_name, list(args)) return func.call(args, None, memory_pool) else: - def wrapper(*args, memory_pool=None, options=None, **kwargs): + def wrapper( # type: ignore[misc] + *args, memory_pool=None, options=None, **kwargs): if arity is not Ellipsis: if len(args) < arity: raise TypeError( @@ -608,7 +609,7 @@ def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None): sort_keys.append(("dummy", "descending")) else: sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys) - options = SelectKOptions(k, sort_keys) + options = SelectKOptions(k, sort_keys) # type: ignore[reportArgumentType] return call_function("select_k_unstable", [values], options, memory_pool) @@ -655,7 +656,7 @@ def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None): sort_keys.append(("dummy", "ascending")) else: sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys) - options = SelectKOptions(k, sort_keys) + options = SelectKOptions(k, sort_keys) # type: ignore[reportArgumentType] return call_function("select_k_unstable", [values], options, memory_pool) @@ -681,7 +682,8 @@ def random(n, *, initializer='system', options=None, memory_pool=None): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. 
""" - options = RandomOptions(initializer=initializer) + options = RandomOptions( + initializer=initializer) # type: ignore[reportArgumentType] return call_function("random", [], options, memory_pool, length=n) @@ -723,7 +725,7 @@ def field(*name_or_index): if isinstance(name_or_index[0], (str, int)): return Expression._field(name_or_index[0]) elif isinstance(name_or_index[0], tuple): - return Expression._nested_field(name_or_index[0]) + return Expression._nested_field(name_or_index[0]) # type: ignore else: raise TypeError( "field reference should be str, multiple str, tuple or " @@ -731,7 +733,7 @@ def field(*name_or_index): ) # In case of multiple strings not supplied in a tuple else: - return Expression._nested_field(name_or_index) + return Expression._nested_field(name_or_index) # type: ignore def scalar(value): diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 41beaa14041..0e8ef66485e 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -114,13 +114,13 @@ defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo") try: - import cython # noqa + import cython # type: ignore[import-untyped, import-not-found] # noqa defaults['cython'] = True except ImportError: pass try: - import fastparquet # noqa + import fastparquet # type: ignore[import-untyped, import-not-found] # noqa defaults['fastparquet'] = True except ImportError: pass @@ -347,7 +347,7 @@ def func(ctx, x): pc.register_aggregate_function(func, func_name, - func_doc, + func_doc, # type: ignore { "x": pa.float64(), }, diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py index 18c530d4afe..eeb637f0ab4 100644 --- a/python/pyarrow/cuda.py +++ b/python/pyarrow/cuda.py @@ -18,7 +18,7 @@ # flake8: noqa -from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, +from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, # type: ignore[reportMissingModuleSource] HostBuffer, BufferReader, BufferWriter, new_host_buffer, serialize_record_batch, read_message, diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 039da8c0d56..967c4b475dd 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -54,6 +54,9 @@ get_partition_keys as _get_partition_keys, # keep for backwards compatibility _filesystemdataset_write, ) + from pyarrow.fs import FileInfo + + except ImportError as exc: raise ImportError( f"The pyarrow installation is not built with support for 'dataset' ({str(exc)})" @@ -70,7 +73,8 @@ ) try: - from pyarrow._dataset_orc import OrcFileFormat + from pyarrow._dataset_orc import ( # type: ignore[import-not-found] + OrcFileFormat) _orc_available = True except ImportError: pass @@ -371,6 +375,7 @@ def _ensure_multiple_sources(paths, filesystem=None): # possible improvement is to group the file_infos by type and raise for # multiple paths per error category if is_local: + # type: ignore[reportGeneralTypeIssues] for info in filesystem.get_file_info(paths): file_type = info.type if file_type == FileType.File: @@ -422,16 +427,18 @@ def _ensure_single_source(path, filesystem=None): filesystem, path = _resolve_filesystem_and_path(path, filesystem) # ensure that the path is normalized before passing to dataset discovery + assert isinstance(path, str) path = filesystem.normalize_path(path) # retrieve the file descriptor file_info = filesystem.get_file_info(path) + assert isinstance(file_info, FileInfo) # depending on the path type either return with a recursive # directory selector or as a list containing a single file - if file_info.type == 
FileType.Directory: + if file_info.type == FileType.Directory: # type: ignore[reportAttributeAccessIssue] paths_or_selector = FileSelector(path, recursive=True) - elif file_info.type == FileType.File: + elif file_info.type == FileType.File: # type: ignore[reportAttributeAccessIssue] paths_or_selector = [path] else: raise FileNotFoundError(path) @@ -1035,6 +1042,7 @@ def file_visitor(written_file): _filesystemdataset_write( scanner, base_dir, basename_template, filesystem, partitioning, preserve_order, file_options, max_partitions, file_visitor, - existing_data_behavior, max_open_files, max_rows_per_file, - min_rows_per_group, max_rows_per_group, create_dir + existing_data_behavior, # type: ignore[reportArgumentType] + max_open_files, max_rows_per_file, min_rows_per_group, + max_rows_per_group, create_dir ) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 241c27706a6..4b0ecb9f18e 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -183,6 +183,7 @@ def write_feather(df, dest, compression=None, compression_level=None, f'one of {_FEATHER_SUPPORTED_CODECS}') try: + assert version in (1, 2) _feather.write_feather(table, dest, compression=compression, compression_level=compression_level, chunksize=chunksize, version=version) @@ -269,7 +270,7 @@ def read_table(source, columns=None, memory_map=False, use_threads=True): f"Got columns {columns} of types {column_type_names}") # Feather v1 already respects the column selection - if reader.version < 3: + if int(reader.version) < 3: return table # Feather v2 reads with sorted / deduplicated selection elif sorted(set(columns)) == columns: diff --git a/python/pyarrow/flight.py b/python/pyarrow/flight.py index b1836907c67..ba5008c9ecf 100644 --- a/python/pyarrow/flight.py +++ b/python/pyarrow/flight.py @@ -16,7 +16,7 @@ # under the License. 
try: - from pyarrow._flight import ( # noqa:F401 + from pyarrow._flight import ( # noqa:F401 # type: ignore[import-not-found] connect, Action, ActionType, diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 670ccaaf245..e1aa9090d2d 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -40,7 +40,7 @@ _not_imported = [] try: - from pyarrow._azurefs import AzureFileSystem # noqa + from pyarrow._azurefs import AzureFileSystem # noqa # type: ignore[reportMissingModuleSource] except ImportError: _not_imported.append("AzureFileSystem") @@ -50,12 +50,12 @@ _not_imported.append("HadoopFileSystem") try: - from pyarrow._gcsfs import GcsFileSystem # noqa + from pyarrow._gcsfs import GcsFileSystem # noqa # type: ignore[reportMissingModuleSource] except ImportError: _not_imported.append("GcsFileSystem") try: - from pyarrow._s3fs import ( # noqa + from pyarrow._s3fs import ( # noqa # type: ignore[reportMissingModuleSource] AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized, finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region) @@ -111,7 +111,7 @@ def _ensure_filesystem(filesystem, *, use_mmap=False): else: # handle fsspec-compatible filesystems try: - import fsspec + import fsspec # type: ignore[import-untyped] except ImportError: pass else: @@ -165,6 +165,7 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False): file_info = None exists_locally = False else: + assert isinstance(file_info, FileInfo) exists_locally = (file_info.type != FileType.NotFound) # if the file or directory doesn't exists locally, then assume that @@ -250,7 +251,9 @@ def copy_files(source, destination, destination, destination_filesystem ) + assert isinstance(source_fs, FileSystem) file_info = source_fs.get_file_info(source_path) + assert isinstance(file_info, FileInfo) if file_info.type == FileType.Directory: source_sel = FileSelector(source_path, recursive=True) _copy_files_selector(source_fs, source_sel, diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py index 4e0d66ec665..222c289c879 100644 --- a/python/pyarrow/orc.py +++ b/python/pyarrow/orc.py @@ -20,7 +20,7 @@ import warnings from pyarrow.lib import Table -import pyarrow._orc as _orc +import pyarrow._orc as _orc # type: ignore[reportMissingModuleSource] from pyarrow.fs import _resolve_filesystem_and_path @@ -255,9 +255,11 @@ def __init__(self, where, *, file_version=file_version, batch_size=batch_size, stripe_size=stripe_size, - compression=compression, + compression=compression, # type: ignore[reportArgumentType] compression_block_size=compression_block_size, - compression_strategy=compression_strategy, + compression_strategy=( + compression_strategy # type: ignore[reportArgumentType] + ), row_index_stride=row_index_stride, padding_tolerance=padding_tolerance, dictionary_key_size_threshold=dictionary_key_size_threshold, diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 300a14a3918..3f129198f8e 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -33,18 +33,18 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa -_logical_type_map = {} -_numpy_logical_type_map = {} -_pandas_logical_type_map = {} +_logical_type_map: dict[int, str] = {} +_numpy_logical_type_map: dict[int, str] = {} +_pandas_logical_type_map: dict[int, str] = {} def get_logical_type_map(): - global 
_logical_type_map + global _logical_type_map # noqa: F824 if not _logical_type_map: _logical_type_map.update({ @@ -90,9 +90,9 @@ def get_logical_type(arrow_type): def get_numpy_logical_type_map(): - global _numpy_logical_type_map + global _numpy_logical_type_map # noqa: F824 if not _numpy_logical_type_map: - _numpy_logical_type_map.update({ + _numpy_logical_type_map.update({ # type: ignore[reportCallIssue] np.bool_: 'bool', np.int8: 'int8', np.int16: 'int16', @@ -693,7 +693,7 @@ def get_datetimetz_type(values, dtype, type_): # If no user type passed, construct a tz-aware timestamp type tz = dtype.tz unit = dtype.unit - type_ = pa.timestamp(unit, tz) + type_ = pa.timestamp(unit, tz) # type: ignore[reportArgumentType] elif type_ is None: # Trust the NumPy dtype type_ = pa.from_numpy_dtype(values.dtype) @@ -732,7 +732,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= pandas Block """ - import pandas.core.internals as _int + import pandas.core.internals as _int # type: ignore[import-not-found] block_arr = item.get('block', None) placement = item['placement'] @@ -758,6 +758,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= # create ExtensionBlock arr = item['py_array'] assert len(placement) == 1 + assert isinstance(columns, list) + assert isinstance(extension_columns, dict) name = columns[placement[0]] pandas_dtype = extension_columns[name] if not hasattr(pandas_dtype, '__from_arrow__'): @@ -777,7 +779,7 @@ def make_datetimetz(unit, tz): if _pandas_api.is_v1(): unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns] tz = pa.lib.string_to_tzinfo(tz) - return _pandas_api.datetimetz_type(unit, tz=tz) + return _pandas_api.datetimetz_type(unit, tz=tz) # type: ignore[reportArgumentType] def table_to_dataframe( @@ -811,7 +813,8 @@ def table_to_dataframe( result = pa.lib.table_to_blocks(options, table, categories, list(ext_columns_dtypes.keys())) if _pandas_api.is_ge_v3(): - from pandas.api.internals import create_dataframe_from_blocks + from pandas.api.internals import ( # type: ignore[import-not-found] + create_dataframe_from_blocks) blocks = [ _reconstruct_block( @@ -823,7 +826,8 @@ def table_to_dataframe( return df else: - from pandas.core.internals import BlockManager + from pandas.core.internals import ( # type: ignore[reportMissingImports] + BlockManager) from pandas import DataFrame blocks = [ @@ -833,7 +837,8 @@ def table_to_dataframe( axes = [columns, index] mgr = BlockManager(blocks, axes) if _pandas_api.is_ge_v21(): - df = DataFrame._from_mgr(mgr, mgr.axes) + df = DataFrame._from_mgr( # type: ignore[reportAttributeAccessIssue] + mgr, mgr.axes) else: df = DataFrame(mgr) @@ -1081,10 +1086,10 @@ def _is_generated_index_name(name): def get_pandas_logical_type_map(): - global _pandas_logical_type_map + global _pandas_logical_type_map # noqa: F824 if not _pandas_logical_type_map: - _pandas_logical_type_map.update({ + _pandas_logical_type_map.update({ # type: ignore[reportCallIssue] 'date': 'datetime64[D]', 'datetime': 'datetime64[ns]', 'datetimetz': 'datetime64[ns]', @@ -1151,12 +1156,14 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): labels = getattr(columns, 'codes', None) or [None] # Convert each level to the dtype provided in the metadata - levels_dtypes = [ - (level, col_index.get('pandas_type', str(level.dtype)), - col_index.get('numpy_type', None)) + levels_dtypes = [(level, col_index.get( + 'pandas_type', + str(level.dtype) # type: ignore[reportAttributeAccessIssue] + ), + 
col_index.get('numpy_type', None)) for level, col_index in zip_longest( levels, column_indexes, fillvalue={} - ) + ) ] new_levels = [] @@ -1168,7 +1175,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # bytes into unicode strings when json.loads-ing them. We need to # convert them back to bytes to preserve metadata. if dtype == np.bytes_: - level = level.map(encoder) + level = level.map(encoder) # type: ignore[reportAttributeAccessIssue] # ARROW-13756: if index is timezone aware DataTimeIndex elif pandas_dtype == "datetimetz": tz = pa.lib.string_to_tzinfo( @@ -1177,12 +1184,14 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): if _pandas_api.is_ge_v3(): # with pandas 3+, to_datetime returns a unit depending on the string # data, so we restore it to the original unit from the metadata - level = level.as_unit(np.datetime_data(dtype)[0]) + level = level.as_unit(np.datetime_data( + dtype)[0]) # type: ignore[reportArgumentType] # GH-41503: if the column index was decimal, restore to decimal elif pandas_dtype == "decimal": level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level]) elif ( - level.dtype == "str" and numpy_dtype == "object" + level.dtype == "str" # type: ignore[reportAttributeAccessIssue] + and numpy_dtype == "object" and ("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"]) ): # the metadata indicate that the original dataframe used object dtype, @@ -1195,11 +1204,12 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # for pandas >= 3 we want to use the default string dtype for .columns new_levels.append(level) continue - elif level.dtype != dtype: - level = level.astype(dtype) + elif level.dtype != dtype: # type: ignore[reportAttributeAccessIssue] + level = level.astype(dtype) # type: ignore[reportAttributeAccessIssue] # ARROW-9096: if original DataFrame was upcast we keep that if level.dtype != numpy_dtype and pandas_dtype != "datetimetz": - level = level.astype(numpy_dtype) + level = level.astype( # type: ignore[reportAttributeAccessIssue] + numpy_dtype) new_levels.append(level) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..186c2c92787 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -45,7 +45,7 @@ FileDecryptionProperties, SortingColumn) from pyarrow.fs import (LocalFileSystem, FileType, _resolve_filesystem_and_path, - _ensure_filesystem) + _ensure_filesystem, FileInfo) from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api @@ -1407,6 +1407,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, path_or_paths, filesystem, memory_map=memory_map ) finfo = filesystem.get_file_info(path_or_paths) + assert isinstance(finfo, FileInfo) if finfo.type == FileType.Directory: self._base_dir = path_or_paths else: @@ -1567,6 +1568,7 @@ def _get_common_pandas_metadata(self): for name in ["_common_metadata", "_metadata"]: metadata_path = os.path.join(str(self._base_dir), name) finfo = self.filesystem.get_file_info(metadata_path) + assert isinstance(finfo, FileInfo) if finfo.is_file: pq_meta = read_metadata( metadata_path, filesystem=self.filesystem) @@ -1665,6 +1667,7 @@ def files(self): >>> dataset.files ['dataset_v2_files/year=2019/...-0.parquet', ... """ + assert isinstance(self._dataset, pa.dataset.FileSystemDataset) return self._dataset.files @property @@ -1672,6 +1675,7 @@ def filesystem(self): """ The filesystem type of the Dataset source. 
""" + assert isinstance(self._dataset, pa.dataset.FileSystemDataset) return self._dataset.filesystem @property @@ -1679,6 +1683,7 @@ def partitioning(self): """ The partitioning of the Dataset source, if discovered. """ + assert isinstance(self._dataset, pa.dataset.FileSystemDataset) return self._dataset.partitioning @@ -2060,7 +2065,8 @@ def write_table(table, where, row_group_size=None, version='2.6', def write_to_dataset(table, root_path, partition_cols=None, filesystem=None, schema=None, partitioning=None, basename_template=None, use_threads=None, - file_visitor=None, existing_data_behavior=None, + file_visitor=None, # type: ignore[reportRedeclaration] + existing_data_behavior=None, **kwargs): """Wrapper around dataset.write_dataset for writing a Table to Parquet format by partitions. @@ -2289,7 +2295,7 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None, filesystem, where = _resolve_filesystem_and_path(where, filesystem) if hasattr(where, "seek"): # file-like - cursor_position = where.tell() + cursor_position = where.tell() # type: ignore[reportAttributeAccessIssue] writer = ParquetWriter(where, schema, filesystem, **kwargs) writer.close() @@ -2298,8 +2304,8 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None, # ParquetWriter doesn't expose the metadata until it's written. Write # it and read it again. metadata = read_metadata(where, filesystem=filesystem) - if hasattr(where, "seek"): - where.seek(cursor_position) # file-like, set cursor back. + if hasattr(where, "seek"): # file-like, set cursor back. + where.seek(cursor_position) # type: ignore[reportAttributeAccessIssue] for m in metadata_collector: metadata.append_row_groups(m) diff --git a/python/pyarrow/py.typed b/python/pyarrow/py.typed new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 83cabcf447d..16fed344e4d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1036,7 +1036,7 @@ cdef class StructScalar(Scalar, Mapping): Parameters ---------- - index : Union[int, str] + key : Union[int, str] Index / position or name of the field. 
Returns diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 575444c1cfc..3f227d3101c 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -64,7 +64,8 @@ if os.environ.get('TZDIR', None) is None: from importlib import resources try: - os.environ['TZDIR'] = os.path.join(resources.files('tzdata'), 'zoneinfo') + tzdata_path = resources.files('tzdata') + os.environ['TZDIR'] = os.path.join(str(tzdata_path), 'zoneinfo') except ModuleNotFoundError: print( 'Package "tzdata" not found. Not setting TZDIR environment variable.' @@ -191,6 +192,7 @@ def decorate(func): def wrapper(*args, **kwargs): remaining_attempts = attempts curr_delay = delay + last_exception = None while remaining_attempts > 0: try: return func(*args, **kwargs) @@ -201,6 +203,9 @@ def wrapper(*args, **kwargs): if max_delay: curr_delay = min(curr_delay, max_delay) time.sleep(curr_delay) + # At this point, we've exhausted all attempts and last_exception must be set + # (since we must have caught at least one exception to exit the loop) + assert last_exception is not None, "No attempts were made" raise last_exception return wrapper return decorate diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py index 50da6693aff..62da25f0af3 100644 --- a/python/pyarrow/tests/interchange/test_conversion.py +++ b/python/pyarrow/tests/interchange/test_conversion.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow.interchange as pi from pyarrow.interchange.column import ( @@ -163,8 +163,8 @@ def test_pandas_roundtrip_string(): result = pi.from_dataframe(pandas_df) assert result["a"].to_pylist() == table["a"].to_pylist() - assert pa.types.is_string(table["a"].type) - assert pa.types.is_large_string(result["a"].type) + assert pa.types.is_string(table.column("a").type) + assert pa.types.is_large_string(result.column("a").type) table_protocol = table.__dataframe__() result_protocol = result.__dataframe__() @@ -193,8 +193,8 @@ def test_pandas_roundtrip_large_string(): result = pi.from_dataframe(pandas_df) assert result["a_large"].to_pylist() == table["a_large"].to_pylist() - assert pa.types.is_large_string(table["a_large"].type) - assert pa.types.is_large_string(result["a_large"].type) + assert pa.types.is_large_string(table.column("a_large").type) + assert pa.types.is_large_string(result.column("a_large").type) table_protocol = table.__dataframe__() result_protocol = result.__dataframe__() @@ -231,12 +231,12 @@ def test_pandas_roundtrip_string_with_missing(): result = pi.from_dataframe(pandas_df) assert result["a"].to_pylist() == table["a"].to_pylist() - assert pa.types.is_string(table["a"].type) - assert pa.types.is_large_string(result["a"].type) + assert pa.types.is_string(table.column("a").type) + assert pa.types.is_large_string(result.column("a").type) assert result["a_large"].to_pylist() == table["a_large"].to_pylist() - assert pa.types.is_large_string(table["a_large"].type) - assert pa.types.is_large_string(result["a_large"].type) + assert pa.types.is_large_string(table.column("a_large").type) + assert pa.types.is_large_string(result.column("a_large").type) else: # older versions of pandas do not have bitmask support # https://github.com/pandas-dev/pandas/issues/49888 @@ -261,12 +261,16 @@ def test_pandas_roundtrip_categorical(): result = pi.from_dataframe(pandas_df) assert result["weekday"].to_pylist() == table["weekday"].to_pylist() - assert 
pa.types.is_dictionary(table["weekday"].type) - assert pa.types.is_dictionary(result["weekday"].type) - assert pa.types.is_string(table["weekday"].chunk(0).dictionary.type) - assert pa.types.is_large_string(result["weekday"].chunk(0).dictionary.type) - assert pa.types.is_int32(table["weekday"].chunk(0).indices.type) - assert pa.types.is_int8(result["weekday"].chunk(0).indices.type) + assert pa.types.is_dictionary(table.column("weekday").type) + assert pa.types.is_dictionary(result.column("weekday").type) + table_chunk_0 = table.column("weekday").chunk(0) + result_chunk_0 = result.column("weekday").chunk(0) + assert isinstance(table_chunk_0, pa.DictionaryArray) + assert isinstance(result_chunk_0, pa.DictionaryArray) + assert pa.types.is_string(table_chunk_0.dictionary.type) + assert pa.types.is_large_string(result_chunk_0.dictionary.type) + assert pa.types.is_int32(table_chunk_0.indices.type) + assert pa.types.is_int8(result_chunk_0.indices.type) table_protocol = table.__dataframe__() result_protocol = result.__dataframe__() @@ -289,6 +293,7 @@ def test_pandas_roundtrip_categorical(): assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"] assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"] + assert desc_cat_result["categories"] is not None assert isinstance(desc_cat_result["categories"]._col, pa.Array) @@ -450,6 +455,7 @@ def test_pyarrow_roundtrip_categorical(offset, length): assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"] assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"] + assert desc_cat_result["categories"] is not None assert isinstance(desc_cat_result["categories"]._col, pa.Array) @@ -464,8 +470,8 @@ def test_pyarrow_roundtrip_large_string(): col = result.__dataframe__().get_column(0) assert col.size() == 3*1024**2 - assert pa.types.is_large_string(table[0].type) - assert pa.types.is_large_string(result[0].type) + assert pa.types.is_large_string(table.column(0).type) + assert pa.types.is_large_string(result.column(0).type) assert table.equals(result) diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index cea694d1c1e..3208b56c42d 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 5390a24b90d..435f93c92a9 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -16,11 +16,12 @@ # under the License. 
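# Illustrative sketch, not part of the patch: the assert-isinstance narrowing
# used in the categorical round-trip test above. chunk(0) is presumably
# annotated as returning a plain Array in the stubs (an assumption here), so
# the value is narrowed before DictionaryArray-only attributes are touched;
# mypy and pyright both honour this kind of assert.
import pyarrow as pa

chunked = pa.chunked_array([pa.array(["a", "b", "a"]).dictionary_encode()])
chunk = chunked.chunk(0)
assert isinstance(chunk, pa.DictionaryArray)  # narrows the static type
print(chunk.dictionary.type, chunk.indices.type)  # now fine for the checker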
import io +from typing import cast try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.tests import util @@ -137,7 +138,7 @@ def make_sample_file(table_or_df): else: a_table = pa.Table.from_pandas(table_or_df) - buf = io.BytesIO() + buf = io.BytesIO() # type: ignore[attr-defined] _write_table(a_table, buf, compression='SNAPPY', version='2.6') buf.seek(0) @@ -175,5 +176,6 @@ def alltypes_sample(size=10000, seed=0, categorical=False): 'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)], } if categorical: - arrays['str_category'] = arrays['str'].astype('category') + import pandas as pd + arrays['str_category'] = cast(pd.Series, arrays['str']).astype('category') return pd.DataFrame(arrays) diff --git a/python/pyarrow/tests/parquet/encryption.py b/python/pyarrow/tests/parquet/encryption.py index d07f8ae2735..de811dadac0 100644 --- a/python/pyarrow/tests/parquet/encryption.py +++ b/python/pyarrow/tests/parquet/encryption.py @@ -29,7 +29,7 @@ def __init__(self, config): pe.KmsClient.__init__(self) self.master_keys_map = config.custom_kms_conf - def wrap_key(self, key_bytes, master_key_identifier): + def wrap_key(self, key_bytes, master_key_identifier): # type: ignore[override] """Not a secure cipher - the wrapped key is just the master key concatenated with key bytes""" master_key_bytes = self.master_keys_map[master_key_identifier].encode( @@ -38,7 +38,7 @@ def wrap_key(self, key_bytes, master_key_identifier): result = base64.b64encode(wrapped_key) return result - def unwrap_key(self, wrapped_key, master_key_identifier): + def unwrap_key(self, wrapped_key, master_key_identifier): # type: ignore[override] """Not a secure cipher - just extract the key from the wrapped key""" expected_master_key = self.master_keys_map[master_key_identifier] diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 591bcffc1ac..42fca7eb8fc 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -34,7 +34,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -44,12 +44,12 @@ from pyarrow.tests.pandas_examples import dataframe_with_lists from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pass try: import numpy as np except ImportError: - np = None + pass # Marks all of the tests in this module # Ignore these with pytest ... -m 'not parquet' @@ -161,10 +161,10 @@ def test_invalid_source(): # Test that we provide an helpful error message pointing out # that None wasn't expected when trying to open a Parquet None file. 
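# Illustrative sketch, not part of the patch: typing.cast as used above in
# parquet/common.py and the pandas round-trip tests. cast() does nothing at
# runtime; it only tells mypy/pyright to treat the value as the stated type,
# which helps when DataFrame/Series indexing is inferred more loosely than the
# test actually needs.
from typing import cast

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
sub = cast(pd.DataFrame, df[["a"]])               # no runtime effect
cat = cast(pd.Series, df["b"]).astype("category")
print(sub.shape, cat.dtype)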
with pytest.raises(TypeError, match="None"): - pq.read_table(None) + pq.read_table(None) # type: ignore[arg-type] with pytest.raises(TypeError, match="None"): - pq.ParquetFile(None) + pq.ParquetFile(None) # type: ignore[arg-type] def test_read_table_without_dataset(tempdir): @@ -750,7 +750,7 @@ def test_fastparquet_cross_compatibility(tempdir): # Arrow -> fastparquet file_arrow = str(tempdir / "cross_compat_arrow.parquet") - pq.write_table(table, file_arrow, compression=None) + pq.write_table(table, file_arrow, compression=None) # type: ignore[arg-type] fp_file = fp.ParquetFile(file_arrow) df_fp = fp_file.to_pandas() @@ -791,7 +791,7 @@ def test_buffer_contents( for col in table.columns: [chunk] = col.chunks buf = chunk.buffers()[1] - assert buf.to_pybytes() == buf.size * b"\0" + assert buf.to_pybytes() == buf.size * b"\0" # type: ignore[union-attr] def test_parquet_compression_roundtrip(tempdir): @@ -801,7 +801,7 @@ def test_parquet_compression_roundtrip(tempdir): # the stream due to auto-detecting the extension in the filename table = pa.table([pa.array(range(4))], names=["ints"]) path = tempdir / "arrow-10480.pyarrow.gz" - pq.write_table(table, path, compression="GZIP") + pq.write_table(table, path, compression="GZIP") # type: ignore[arg-type] result = pq.read_table(path) assert result.equals(table) @@ -826,7 +826,7 @@ def test_empty_row_groups(tempdir): def test_reads_over_batch(tempdir): data = [None] * (1 << 20) - data.append([1]) + data.append([1]) # type: ignore[reportArgumentType] # Large list with mostly nones and one final # value. This should force batched reads when # reading back. diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index 2345855a332..af418812be8 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -24,15 +24,14 @@ from pyarrow.tests.parquet.common import (_read_table, _check_roundtrip) except ImportError: - pq = None + pass try: import pandas as pd - import pandas.testing as tm from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index c546bc1532a..6d16da4f772 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -18,11 +18,12 @@ import decimal import io import random +from typing import cast try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -33,7 +34,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -44,7 +45,7 @@ dataframe_with_lists) from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -142,7 +143,7 @@ def test_direct_read_dictionary(): read_dictionary=['f0']) # Compute dictionary-encoded subfield - expected = pa.table([table[0].dictionary_encode()], names=['f0']) + expected = pa.table([table.column(0).dictionary_encode()], names=['f0']) assert result.equals(expected) @@ -174,7 +175,7 @@ def test_direct_read_dictionary_subfield(): expected = pa.table([expected_arr], names=['f0']) assert result.equals(expected) - assert result[0].num_chunks == 1 + assert result.column(0).num_chunks == 1 
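# Illustrative sketch, not part of the patch: the narrow "# type: ignore[...]"
# pattern used in the negative tests above. Passing None is deliberate (the
# test asserts the resulting TypeError), so only the arg-type diagnostic is
# silenced, leaving any other error on the same line visible.
import pytest

import pyarrow.parquet as pq


def test_read_table_rejects_none():
    with pytest.raises(TypeError, match="None"):
        pq.read_table(None)  # type: ignore[arg-type]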
@pytest.mark.numpy @@ -260,8 +261,8 @@ def test_single_pylist_column_roundtrip(tempdir, dtype,): _write_table(table, filename) table_read = _read_table(filename) for i in range(table.num_columns): - col_written = table[i] - col_read = table_read[i] + col_written = table.column(i) + col_read = table_read.column(i) assert table.field(i).name == table_read.field(i).name assert col_read.num_chunks == 1 data_written = col_written.chunk(0) @@ -390,7 +391,7 @@ def test_parquet_nested_convenience(tempdir): read = pq.read_table( path, columns=['a']) - tm.assert_frame_equal(read.to_pandas(), df[['a']]) + tm.assert_frame_equal(read.to_pandas(), cast(pd.DataFrame, df[['a']])) read = pq.read_table( path, columns=['a', 'b']) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index d3e9cda7301..d1b834c0909 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -20,35 +20,41 @@ import os import pathlib import sys +from typing import TYPE_CHECKING try: import numpy as np except ImportError: - np = None + pass import pytest import unittest.mock as mock import pyarrow as pa import pyarrow.compute as pc -from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem, +from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem, FileInfo, FileType, PyFileSystem, SubTreeFileSystem, FSSpecHandler) from pyarrow.tests import util from pyarrow.util import guid -try: +if TYPE_CHECKING: + import pandas as pd + import pandas.testing as tm import pyarrow.parquet as pq from pyarrow.tests.parquet.common import ( _read_table, _test_dataframe, _test_table, _write_table) -except ImportError: - pq = None - - -try: - import pandas as pd - import pandas.testing as tm +else: + try: + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import ( + _read_table, _test_dataframe, _test_table, _write_table) + except ImportError: + pass -except ImportError: - pd = tm = None + try: + import pandas as pd + import pandas.testing as tm + except ImportError: + pass # Marks all of the tests in this module @@ -70,8 +76,8 @@ def test_filesystem_uri(tempdir): assert result.equals(table) # filesystem URI - result = pq.read_table( - "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir)) + result = pq.read_table("data_dir/data.parquet", + filesystem=util._filesystem_uri(tempdir)) assert result.equals(table) @@ -553,7 +559,7 @@ def _generate_partition_directories(fs, base_dir, partition_spec, df): # ['bar', ['a', 'b', 'c']] # part_table : a pyarrow.Table to write to each partition if not isinstance(fs, FileSystem): - fs = PyFileSystem(FSSpecHandler(fs)) + fs = PyFileSystem(FSSpecHandler(fs)) # type: ignore[abstract] DEPTH = len(partition_spec) @@ -572,15 +578,15 @@ def _visit_level(base_dir, level, part_keys): if level == DEPTH - 1: # Generate example data - from pyarrow.fs import FileType - file_path = pathsep.join([level_dir, guid()]) filtered_df = _filter_partition(df, this_part_keys) part_table = pa.Table.from_pandas(filtered_df) with fs.open_output_stream(file_path) as f: _write_table(part_table, f) - assert fs.get_file_info(file_path).type != FileType.NotFound - assert fs.get_file_info(file_path).type == FileType.File + file_info = fs.get_file_info(file_path) + assert isinstance(file_info, FileInfo) + assert file_info.type != FileType.NotFound + assert file_info.type == FileType.File file_success = pathsep.join([level_dir, '_SUCCESS']) with fs.open_output_stream(file_success) as f: @@ -717,8 +723,8 
@@ def test_dataset_read_pandas(tempdir): paths = [] for i in range(nfiles): df = _test_dataframe(size, seed=i) - df.index = np.arange(i * size, (i + 1) * size) - df.index.name = 'index' + df.index = np.arange(i * size, (i + 1) * size) # type: ignore[assignment] + df.index.name = 'index' # type: ignore[attr-defined] path = dirpath / f'{i}.parquet' @@ -997,7 +1003,7 @@ def _test_write_to_dataset_no_partitions(base_path, if filesystem is None: filesystem = LocalFileSystem() elif not isinstance(filesystem, FileSystem): - filesystem = PyFileSystem(FSSpecHandler(filesystem)) + filesystem = PyFileSystem(FSSpecHandler(filesystem)) # type: ignore[abstract] # Without partitions, append files to root_path n = 5 @@ -1009,8 +1015,10 @@ def _test_write_to_dataset_no_partitions(base_path, recursive=True) infos = filesystem.get_file_info(selector) - output_files = [info for info in infos if info.path.endswith(".parquet")] - assert len(output_files) == n + if isinstance(infos, list): + assert all(isinstance(info, FileInfo) for info in infos) + output_files = [info for info in infos if info.path.endswith(".parquet")] + assert len(output_files) == n # Deduplicated incoming DataFrame should match # original outgoing Dataframe @@ -1168,11 +1176,11 @@ def test_dataset_read_dictionary(tempdir): path, read_dictionary=['f0']).read() # The order of the chunks is non-deterministic - ex_chunks = [t1[0].chunk(0).dictionary_encode(), - t2[0].chunk(0).dictionary_encode()] + ex_chunks = [t1.column(0).chunk(0).dictionary_encode(), + t2.column(0).chunk(0).dictionary_encode()] - assert result[0].num_chunks == 2 - c0, c1 = result[0].chunk(0), result[0].chunk(1) + assert result.column(0).num_chunks == 2 + c0, c1 = result.column(0).chunk(0), result.column(0).chunk(1) if c0.equals(ex_chunks[0]): assert c1.equals(ex_chunks[1]) else: diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index b89fd97cb91..a7652a01e64 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -32,7 +32,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -41,7 +41,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -56,7 +56,7 @@ def test_pandas_parquet_datetime_tz(): # coerce to [ns] due to lack of non-[ns] support. 
s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]') s = s.dt.tz_localize('utc') - s.index = s + s.index = s # type: ignore[assignment] # Both a column and an index to hit both use cases df = pd.DataFrame({'tz_aware': s, @@ -287,7 +287,8 @@ def test_coerce_int96_timestamp_unit(unit): # For either Parquet version, coercing to nanoseconds is allowed # if Int96 storage is used - expected = pa.Table.from_arrays([arrays.get(unit)]*4, names) + array_for_unit = arrays.get(unit, a_ns) + expected = pa.Table.from_arrays([array_for_unit] * 4, names) read_table_kwargs = {"coerce_int96_timestamp_unit": unit} _check_roundtrip(table, expected, read_table_kwargs=read_table_kwargs, @@ -323,6 +324,7 @@ def get_table(pq_reader_method, filename, **kwargs): # with the default resolution of ns, we get wrong values for INT96 # that are out of bounds for nanosecond range tab_error = get_table(pq_reader_method, filename) + assert tab_error is not None with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Discarding nonzero nanoseconds in conversion", @@ -333,6 +335,7 @@ def get_table(pq_reader_method, filename, **kwargs): tab_correct = get_table( pq_reader_method, filename, coerce_int96_timestamp_unit="s" ) + assert tab_correct is not None df_correct = tab_correct.to_pandas(timestamp_as_object=True) df["a"] = df["a"].astype(object) tm.assert_frame_equal(df, df_correct) diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index a11a4935a1c..dcdf90c4b1d 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -22,8 +22,7 @@ import pyarrow.parquet as pq import pyarrow.parquet.encryption as pe except ImportError: - pq = None - pe = None + pass else: from pyarrow.tests.parquet.encryption import ( InMemoryKmsClient, verify_file_encrypted) @@ -118,7 +117,7 @@ def test_encrypted_parquet_write_read(tempdir, data_table): encryption_algorithm="AES_GCM_V1", cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - assert encryption_config.uniform_encryption is False + assert encryption_config.uniform_encryption is False # type: ignore[attr-defined] kms_connection_config, crypto_factory = write_encrypted_file( path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, @@ -141,11 +140,11 @@ def test_uniform_encrypted_parquet_write_read(tempdir, data_table): # Encrypt the footer and all columns with the footer key, encryption_config = pe.EncryptionConfiguration( footer_key=FOOTER_KEY_NAME, - uniform_encryption=True, + uniform_encryption=True, # type: ignore[call-arg] encryption_algorithm="AES_GCM_V1", cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - assert encryption_config.uniform_encryption is True + assert encryption_config.uniform_encryption is True # type: ignore[attr-defined] kms_connection_config, crypto_factory = write_encrypted_file( path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, b"", @@ -280,7 +279,7 @@ def test_encrypted_parquet_write_col_key_and_uniform_encryption(tempdir, data_ta column_keys={ COL_KEY_NAME: ["a", "b"], }, - uniform_encryption=True) + uniform_encryption=True) # type: ignore[call-arg] with pytest.raises(OSError, match=r"Cannot set both column_keys and uniform_encryption"): @@ -392,7 +391,7 @@ def unwrap_key(self, wrapped_key, master_key_identifier): def kms_factory(kms_connection_configuration): return WrongTypeKmsClient(kms_connection_configuration) - crypto_factory = pe.CryptoFactory(kms_factory) + 
crypto_factory = pe.CryptoFactory(kms_factory) # type: ignore[arg-type] with pytest.raises(TypeError): # Write with encryption properties write_encrypted_parquet(path, data_table, encryption_config, diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 148bfebaa67..646873b3d4f 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -19,11 +19,7 @@ import decimal from collections import OrderedDict import io - -try: - import numpy as np -except ImportError: - np = None +from typing import TYPE_CHECKING import pytest import pyarrow as pa @@ -31,20 +27,25 @@ from pyarrow.fs import LocalFileSystem from pyarrow.tests import util -try: - import pyarrow.parquet as pq - from pyarrow.tests.parquet.common import _write_table -except ImportError: - pq = None - - -try: +if TYPE_CHECKING: + import numpy as np import pandas as pd - import pandas.testing as tm - - from pyarrow.tests.parquet.common import alltypes_sample -except ImportError: - pd = tm = None + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import alltypes_sample, _write_table +else: + try: + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import _write_table, alltypes_sample + except ImportError: + pass + try: + import pandas as pd + except ImportError: + pass + try: + import numpy as np + except ImportError: + pass # Marks all of the tests in this module @@ -56,7 +57,7 @@ def test_parquet_metadata_api(): df = alltypes_sample(size=10000) df = df.reindex(columns=sorted(df.columns)) - df.index = np.random.randint(0, 1000000, size=len(df)) + df.index = np.random.randint(0, 1000000, size=len(df)) # type: ignore[assignment] fileh = make_sample_file(df) ncols = len(df.columns) @@ -80,15 +81,15 @@ def test_parquet_metadata_api(): col = schema[0] repr(col) - assert col.name == df.columns[0] - assert col.max_definition_level == 1 - assert col.max_repetition_level == 0 - assert col.max_repetition_level == 0 - assert col.physical_type == 'BOOLEAN' - assert col.converted_type == 'NONE' + assert col.name == df.columns[0] # type: ignore[attr-defined] + assert col.max_definition_level == 1 # type: ignore[attr-defined] + assert col.max_repetition_level == 0 # type: ignore[attr-defined] + assert col.max_repetition_level == 0 # type: ignore[attr-defined] + assert col.physical_type == 'BOOLEAN' # type: ignore[attr-defined] + assert col.converted_type == 'NONE' # type: ignore[attr-defined] col_float16 = schema[5] - assert col_float16.logical_type.type == 'FLOAT16' + assert col_float16.logical_type.type == 'FLOAT16' # type: ignore[attr-defined] with pytest.raises(IndexError): schema[ncols + 1] # +1 for index @@ -210,15 +211,16 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, col_meta = rg_meta.column(0) stat = col_meta.statistics - assert stat.has_min_max - assert _close(type, stat.min, min_value) - assert _close(type, stat.max, max_value) - assert stat.null_count == null_count - assert stat.num_values == num_values + assert stat is not None + assert stat.has_min_max # type: ignore[attr-defined] + assert _close(type, stat.min, min_value) # type: ignore[attr-defined] + assert _close(type, stat.max, max_value) # type: ignore[attr-defined] + assert stat.null_count == null_count # type: ignore[attr-defined] + assert stat.num_values == num_values # type: ignore[attr-defined] # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount # method, missing distinct_count is 
represented as zero instead of None - assert stat.distinct_count == distinct_count - assert stat.physical_type == physical_type + assert stat.distinct_count == distinct_count # type: ignore[attr-defined] + assert stat.physical_type == physical_type # type: ignore[attr-defined] def _close(type, left, right): @@ -236,8 +238,10 @@ def test_parquet_raise_on_unset_statistics(): df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")}) meta = make_sample_file(pa.Table.from_pandas(df)).metadata - assert not meta.row_group(0).column(0).statistics.has_min_max - assert meta.row_group(0).column(0).statistics.max is None + stat = meta.row_group(0).column(0).statistics + assert stat is not None + assert not stat.has_min_max + assert stat.max is None def test_statistics_convert_logical_types(tempdir): @@ -271,8 +275,9 @@ def test_statistics_convert_logical_types(tempdir): pq.write_table(t, path, version='2.6') pf = pq.ParquetFile(path) stats = pf.metadata.row_group(0).column(0).statistics - assert stats.min == min_val - assert stats.max == max_val + assert stats is not None + assert stats.min == min_val # type: ignore[attr-defined] + assert stats.max == max_val # type: ignore[attr-defined] def test_parquet_write_disable_statistics(tempdir): @@ -429,29 +434,36 @@ def test_field_id_metadata(): pf = pq.ParquetFile(pa.BufferReader(contents)) schema = pf.schema_arrow - assert schema[0].metadata[field_id] == b'1' - assert schema[0].metadata[b'other'] == b'abc' + assert schema[0].metadata is not None + assert schema[0].metadata[field_id] == b'1' # type: ignore[index] + assert schema[0].metadata[b'other'] == b'abc' # type: ignore[index] list_field = schema[1] - assert list_field.metadata[field_id] == b'11' + assert list_field.metadata is not None + assert list_field.metadata[field_id] == b'11' # type: ignore[index] list_item_field = list_field.type.value_field - assert list_item_field.metadata[field_id] == b'10' + assert list_item_field.metadata is not None + assert list_item_field.metadata[field_id] == b'10' # type: ignore[index] struct_field = schema[2] - assert struct_field.metadata[field_id] == b'102' + assert struct_field.metadata is not None + assert struct_field.metadata[field_id] == b'102' # type: ignore[index] struct_middle_field = struct_field.type[0] - assert struct_middle_field.metadata[field_id] == b'101' + assert struct_middle_field.metadata is not None + assert struct_middle_field.metadata[field_id] == b'101' # type: ignore[index] struct_inner_field = struct_middle_field.type[0] - assert struct_inner_field.metadata[field_id] == b'100' + assert struct_inner_field.metadata is not None + assert struct_inner_field.metadata[field_id] == b'100' # type: ignore[index] assert schema[3].metadata is None # Invalid input is passed through (ok) but does not # have field_id in parquet (not tested) - assert schema[4].metadata[field_id] == b'xyz' - assert schema[5].metadata[field_id] == b'-1000' + assert schema[4].metadata is not None + assert schema[4].metadata[field_id] == b'xyz' # type: ignore[index] + assert schema[5].metadata[field_id] == b'-1000' # type: ignore[index] def test_parquet_file_page_index(): @@ -495,13 +507,14 @@ def test_multi_dataset_metadata(tempdir): _meta.append_row_groups(meta[0]) # Write merged metadata-only file + assert _meta is not None with open(metapath, "wb") as f: - _meta.write_metadata_file(f) + _meta.write_metadata_file(f) # type: ignore[union-attr] # Read back the metadata meta = pq.read_metadata(metapath) md = meta.to_dict() - _md = _meta.to_dict() + _md = 
_meta.to_dict() # type: ignore[union-attr] for key in _md: if key != 'serialized_size': assert _md[key] == md[key] @@ -695,13 +708,14 @@ def test_metadata_schema_filesystem(tempdir): assert pq.read_metadata( file_path, filesystem=LocalFileSystem()).equals(metadata) assert pq.read_metadata( + # type: ignore[arg-type] fname, filesystem=f'file:///{tempdir}').equals(metadata) assert pq.read_schema(file_uri).equals(schema) assert pq.read_schema( file_path, filesystem=LocalFileSystem()).equals(schema) assert pq.read_schema( - fname, filesystem=f'file:///{tempdir}').equals(schema) + fname, filesystem=f'file:///{tempdir}').equals(schema) # type: ignore[arg-type] with util.change_cwd(tempdir): # Pass `filesystem` arg @@ -721,7 +735,7 @@ def test_metadata_equals(): original_metadata = pq.read_metadata(pa.BufferReader(buf)) match = "Argument 'other' has incorrect type" with pytest.raises(TypeError, match=match): - original_metadata.equals(None) + original_metadata.equals(None) # type: ignore[arg-type] @pytest.mark.parametrize("t1,t2,expected_error", ( @@ -810,7 +824,7 @@ def msg(c): pq.ColumnChunkMetaData() with pytest.raises(TypeError, match=msg("RowGroupMetaData")): - pq.RowGroupMetaData() + pq.RowGroupMetaData() # type: ignore[call-arg] with pytest.raises(TypeError, match=msg("FileMetaData")): - pq.FileMetaData() + pq.FileMetaData() # type: ignore[call-arg] diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 53864ff15ea..ec0989c8bcb 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -17,11 +17,12 @@ import io import json +from typing import TYPE_CHECKING, cast try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -29,22 +30,29 @@ from pyarrow.util import guid from pyarrow.vendored.version import Version -try: - import pyarrow.parquet as pq - from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, - _write_table) -except ImportError: - pq = None - - -try: +if TYPE_CHECKING: import pandas as pd import pandas.testing as tm + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import ( + _read_table, _roundtrip_pandas_dataframe, _test_dataframe, + _write_table, alltypes_sample + ) +else: + try: + import pyarrow.parquet as pq + from pyarrow.tests.parquet.common import ( + _read_table, _test_dataframe, _write_table, alltypes_sample, + _roundtrip_pandas_dataframe + ) - from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe, - alltypes_sample) -except ImportError: - pd = tm = None + except ImportError: + pass + try: + import pandas as pd + import pandas.testing as tm + except ImportError: + pass # Marks all of the tests in this module @@ -58,11 +66,14 @@ def test_pandas_parquet_custom_metadata(tempdir): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) + assert arrow_table.schema.metadata is not None assert b'pandas' in arrow_table.schema.metadata _write_table(arrow_table, filename) - metadata = pq.read_metadata(filename).metadata + file_metadata = pq.read_metadata(filename) + metadata = file_metadata.metadata + assert metadata is not None assert b'pandas' in metadata js = json.loads(metadata[b'pandas'].decode('utf8')) @@ -117,10 +128,13 @@ def test_attributes_metadata_persistence(tempdir): } table = pa.Table.from_pandas(df) + assert table.schema.metadata is not None assert b'attributes' in table.schema.metadata[b'pandas'] _write_table(table, filename) - metadata = 
pq.read_metadata(filename).metadata + file_metadata = pq.read_metadata(filename) + metadata = file_metadata.metadata + assert metadata is not None js = json.loads(metadata[b'pandas'].decode('utf8')) assert 'attributes' in js assert js['attributes'] == df.attrs @@ -297,8 +311,8 @@ def test_pandas_parquet_configuration_options(tempdir): @pytest.mark.pandas def test_spark_flavor_preserves_pandas_metadata(): df = _test_dataframe(size=100) - df.index = np.arange(0, 10 * len(df), 10) - df.index.name = 'foo' + df.index = np.arange(0, 10 * len(df), 10) # type: ignore[assignment] + df.index.name = 'foo' # type: ignore[attr-defined] result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'}) tm.assert_frame_equal(result, df) @@ -450,7 +464,9 @@ def test_backwards_compatible_column_metadata_handling(datadir): table = _read_table( path, columns=['a']) result = table.to_pandas() - tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) + expected_df = expected[['a']].reset_index(drop=True) + assert isinstance(expected_df, pd.DataFrame) + tm.assert_frame_equal(result, expected_df) @pytest.mark.pandas @@ -510,7 +526,7 @@ def test_pandas_categorical_roundtrip(): codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32') categories = ['foo', 'bar', 'baz'] df = pd.DataFrame({'x': pd.Categorical.from_codes( - codes, categories=categories)}) + codes, categories=categories)}) # type: ignore[arg-type] buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) @@ -555,15 +571,15 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal(result[["col"]], cast(pd.DataFrame, df[["col"]])) pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal(result[["col"]], cast(pd.DataFrame, df[["col"]])) pq.write_table(table, str(tempdir / "data.parquet")) result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal(result[["col"]], cast(pd.DataFrame, df[["col"]])) @pytest.mark.pandas @@ -580,7 +596,7 @@ def test_write_to_dataset_pandas_preserve_index(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result, df_cat) + tm.assert_frame_equal(result, cast(pd.DataFrame, df_cat)) pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 8eab9bc1740..b93e1a2d5b2 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -30,15 +30,14 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _write_table except ImportError: - pq = None + pass try: - import pandas as pd import pandas.testing as tm from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -172,7 +171,7 @@ def test_scan_contents(): pf = pq.ParquetFile(buf) assert pf.scan_contents() == 10000 - assert pf.scan_contents(df.columns[:4]) == 10000 + assert pf.scan_contents(list(df.columns[:4])) == 10000 def 
test_parquet_file_pass_directory_instead_of_file(tempdir): @@ -215,7 +214,7 @@ def test_iter_batches_columns_reader(tempdir, batch_size): chunk_size=chunk_size) file_ = pq.ParquetFile(filename) - for columns in [df.columns[:10], df.columns[10:]]: + for columns in [list(df.columns[:10]), list(df.columns[10:])]: batches = file_.iter_batches(batch_size=batch_size, columns=columns) batch_starts = range(0, total_size+batch_size, batch_size) for batch, start in zip(batches, batch_starts): @@ -346,6 +345,7 @@ def test_read_statistics(): buf.seek(0) statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics + assert statistics is not None assert statistics.null_count == 1 assert statistics.distinct_count is None # TODO: add tests for is_distinct_count_exact == None and True @@ -388,7 +388,8 @@ def test_parquet_file_fsspec_support(): def test_parquet_file_fsspec_support_through_filesystem_argument(): try: - from fsspec.implementations.memory import MemoryFileSystem + from fsspec.implementations.memory import ( # type: ignore[import-untyped] + MemoryFileSystem) except ImportError: pytest.skip("fsspec is not installed, skipping test") @@ -411,7 +412,7 @@ def test_parquet_file_hugginface_support(): pytest.skip("fsspec is not installed, skipping Hugging Face test") fake_hf_module = types.ModuleType("huggingface_hub") - fake_hf_module.HfFileSystem = MemoryFileSystem + fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[attr-defined] with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}): uri = "hf://datasets/apache/arrow/test.parquet" table = pa.table({"a": range(10)}) @@ -423,7 +424,7 @@ def test_parquet_file_hugginface_support(): def test_fsspec_uri_raises_if_fsspec_is_not_available(): # sadly cannot patch sys.modules because cython will still be able to import fsspec try: - import fsspec # noqa: F401 + import fsspec # type: ignore[import-untyped] # noqa: F401 except ImportError: pass else: diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 3e7352428c9..00a35292b6f 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -23,9 +23,10 @@ try: import pyarrow.parquet as pq from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, + # type: ignore[attr-defined] _test_table, _range_integers) except ImportError: - pq = None + pass try: @@ -33,7 +34,7 @@ import pandas.testing as tm except ImportError: - pd = tm = None + pass # Marks all of the tests in this module @@ -94,10 +95,10 @@ def test_parquet_invalid_writer(tempdir): # avoid segfaults with invalid construction with pytest.raises(TypeError): some_schema = pa.schema([pa.field("x", pa.int32())]) - pq.ParquetWriter(None, some_schema) + pq.ParquetWriter(None, some_schema) # type: ignore[arg-type] with pytest.raises(TypeError): - pq.ParquetWriter(tempdir / "some_path", None) + pq.ParquetWriter(tempdir / "some_path", None) # type: ignore[arg-type] @pytest.mark.pandas @@ -335,6 +336,7 @@ def test_parquet_writer_store_schema(tempdir): writer.write_table(table) meta = pq.read_metadata(path1) + assert meta.metadata is not None assert b'ARROW:schema' in meta.metadata assert meta.metadata[b'ARROW:schema'] @@ -357,6 +359,7 @@ def test_parquet_writer_append_key_value_metadata(tempdir): writer.add_key_value_metadata({'key2': '2', 'key3': '3'}) reader = pq.ParquetFile(path) metadata = reader.metadata.metadata + assert metadata is not None assert metadata[b'key1'] == b'1' 
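# Illustrative sketch, not part of the patch: the "assert ... is not None"
# narrowing added around the metadata tests above. FileMetaData.metadata is
# presumably Optional in the stubs (an assumption here), so the assert both
# documents the expectation and lets the checker allow the lookups that follow.
import pyarrow as pa
import pyarrow.parquet as pq

sink = pa.BufferOutputStream()
pq.write_table(pa.table({"a": [1, 2, 3]}), sink)
meta = pq.read_metadata(pa.BufferReader(sink.getvalue()))
key_value_metadata = meta.metadata
assert key_value_metadata is not None   # narrows Optional for mypy/pyright
print(sorted(key_value_metadata))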
assert metadata[b'key2'] == b'2' assert metadata[b'key3'] == b'3' diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 450cce74f1d..db3765f3aae 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -17,31 +17,32 @@ import datetime import sys +from typing import Any -import pytest -import hypothesis as h -import hypothesis.strategies as st +import pytest # type: ignore[import-not-found] +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] try: - import hypothesis.extra.numpy as npst + import hypothesis.extra.numpy as npst # type: ignore[import-not-found] except ImportError: - npst = None + npst = None # type: ignore[assignment] try: - import hypothesis.extra.pytz as tzst + import hypothesis.extra.pytz as tzst # type: ignore[import-not-found] except ImportError: - tzst = None + tzst = None # type: ignore[assignment] try: import zoneinfo except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] if sys.platform == 'win32': try: - import tzdata # noqa:F401 + import tzdata # type: ignore[import-not-found] # noqa:F401 except ImportError: zoneinfo = None try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa @@ -134,12 +135,12 @@ timezones = st.one_of(st.none(), st.timezones()) else: timezones = st.none() -timestamp_types = st.builds( +timestamp_types: Any = st.builds( pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=timezones ) -duration_types = st.builds( +duration_types: Any = st.builds( pa.duration, st.sampled_from(['s', 'ms', 'us', 'ns']) ) @@ -234,13 +235,13 @@ def schemas(type_strategy=primitive_types, max_fields=None): all_types = st.deferred( lambda: ( - primitive_types | - list_types() | - struct_types() | - dictionary_types() | - map_types() | - list_types(all_types) | - struct_types(all_types) + primitive_types + | list_types() + | struct_types() + | dictionary_types() + | map_types() + | list_types(all_types) # type: ignore[has-type] + | struct_types(all_types) # type: ignore[has-type] ) ) all_fields = fields(all_types) @@ -280,6 +281,7 @@ def arrays(draw, type, size=None, nullable=True): elif not isinstance(size, int): raise TypeError('Size must be an integer') + assert npst is not None if pa.types.is_null(ty): h.assume(nullable) value = st.none() @@ -292,6 +294,7 @@ def arrays(draw, type, size=None, nullable=True): values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,))) # Workaround ARROW-4952: no easy way to assert array equality # in a NaN-tolerant way. 
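# Illustrative sketch, not part of the patch: the optional-dependency handling
# used in strategies.py above. Re-binding the module name to None changes its
# inferred type, hence the targeted ignore; call sites then narrow again with
# an assert before use (the helper below is hypothetical, for illustration only).
try:
    import hypothesis.extra.numpy as npst
except ImportError:
    npst = None  # type: ignore[assignment]


def int64_arrays():
    assert npst is not None   # narrows "module | None" for the checker
    return npst.arrays("int64", shape=(3,))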
+ assert np is not None values[np.isnan(values)] = -42.0 return pa.array(values, type=ty) elif pa.types.is_decimal(ty): @@ -317,9 +320,11 @@ def arrays(draw, type, size=None, nullable=True): offset = ty.tz.split(":") offset_hours = int(offset[0]) offset_min = int(offset[1]) - tz = datetime.timedelta(hours=offset_hours, minutes=offset_min) + tz = datetime.timezone( + datetime.timedelta(hours=offset_hours, minutes=offset_min) + ) except ValueError: - tz = zoneinfo.ZoneInfo(ty.tz) + tz = zoneinfo.ZoneInfo(str(ty.tz)) value = st.datetimes(timezones=st.just(tz), min_value=min_datetime, max_value=max_datetime) elif pa.types.is_duration(ty): @@ -467,7 +472,9 @@ def pandas_compatible_list_types( dictionary_types( value_strategy=pandas_compatible_dictionary_value_types ), - pandas_compatible_list_types(pandas_compatible_types), - struct_types(pandas_compatible_types) + pandas_compatible_list_types( + pandas_compatible_types # type: ignore[has-type] + ), + struct_types(pandas_compatible_types) # type: ignore[has-type] ) ) diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index cb97e3849fd..48944c25f3c 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -37,9 +37,10 @@ try: import pyarrow.dataset as ds - from pyarrow.acero import ScanNodeOptions + from pyarrow._dataset import ScanNodeOptions except ImportError: - ds = None + ds = None # type: ignore[assignment] + ScanNodeOptions = None # type: ignore[assignment, misc] pytestmark = pytest.mark.acero @@ -53,7 +54,6 @@ def table_source(): def test_declaration(): - table = pa.table({'a': [1, 2, 3], 'b': [4, 5, 6]}) table_opts = TableSourceNodeOptions(table) filter_opts = FilterNodeOptions(field('a') > 1) @@ -89,7 +89,8 @@ def test_declaration_to_reader(table_source): def test_table_source(): with pytest.raises(TypeError): - TableSourceNodeOptions(pa.record_batch([pa.array([1, 2, 3])], ["a"])) + TableSourceNodeOptions(pa.record_batch( + [pa.array([1, 2, 3])], ["a"])) table_source = TableSourceNodeOptions(None) decl = Declaration("table_source", table_source) @@ -110,9 +111,9 @@ def test_filter(table_source): # requires a pyarrow Expression with pytest.raises(TypeError): - FilterNodeOptions(pa.array([True, False, True])) + FilterNodeOptions(pa.array([True, False, True])) # type: ignore[arg-type] with pytest.raises(TypeError): - FilterNodeOptions(None) + FilterNodeOptions(None) # type: ignore[arg-type] @pytest.mark.parametrize('source', [ @@ -294,10 +295,12 @@ def test_order_by(): _ = decl.to_table() with pytest.raises(ValueError, match="\"decreasing\" is not a valid sort order"): - _ = OrderByNodeOptions([("b", "decreasing")]) + _ = OrderByNodeOptions([("b", "decreasing")]) # type: ignore[arg-type] with pytest.raises(ValueError, match="\"start\" is not a valid null placement"): - _ = OrderByNodeOptions([("b", "ascending")], null_placement="start") + _ = OrderByNodeOptions( + [("b", "ascending")], null_placement="start" # type: ignore[arg-type] + ) def test_hash_join(): @@ -382,7 +385,9 @@ def test_hash_join_with_residual_filter(): # test filter expression referencing columns from both side join_opts = HashJoinNodeOptions( "left outer", left_keys="key", right_keys="key", - filter_expression=pc.equal(pc.field("a"), 5) | pc.equal(pc.field("b"), 10) + filter_expression=( + pc.equal(pc.field("a"), 5) + | pc.equal(pc.field("b"), 10)) # type: ignore[reportOperatorIssue] ) joined = Declaration( "hashjoin", options=join_opts, inputs=[left_source, right_source]) @@ -462,6 +467,8 @@ def 
test_asof_join(): @pytest.mark.dataset def test_scan(tempdir): + assert ds is not None + assert ScanNodeOptions is not None table = pa.table({'a': [1, 2, 3], 'b': [4, 5, 6]}) ds.write_dataset(table, tempdir / "dataset", format="parquet") dataset = ds.dataset(tempdir / "dataset", format="parquet") @@ -486,11 +493,10 @@ def test_scan(tempdir): assert decl.to_table().num_rows == 0 # projection scan option - scan_opts = ScanNodeOptions(dataset, columns={"a2": pc.multiply(field("a"), 2)}) decl = Declaration("scan", scan_opts) result = decl.to_table() # "a" is included in the result (needed later on for the actual projection) assert result["a"].to_pylist() == [1, 2, 3] # "b" is still included, but without data as it will be removed by the projection - assert pc.all(result["b"].is_null()).as_py() + assert pc.all(result.column("b").is_null()).as_py() diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py index 76a766984da..9f61bc7ddfe 100644 --- a/python/pyarrow/tests/test_adhoc_memory_leak.py +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ec361159c5f..150893617b9 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -18,19 +18,23 @@ from collections.abc import Iterable import datetime import decimal -import hypothesis as h -import hypothesis.strategies as st +import hypothesis as h # type: ignore[import-not-found] +import hypothesis.strategies as st # type: ignore[import-not-found] import itertools -import pytest +import pytest # type: ignore[import-not-found] import struct import subprocess import sys import weakref +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + np = None import pyarrow as pa import pyarrow.tests.strategies as past @@ -71,7 +75,7 @@ def test_constructor_raises(): # This could happen by wrong capitalization. 
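# Illustrative sketch, not part of the patch: the TYPE_CHECKING split used in
# several test modules above. The type checker always sees the real imports
# (so np/pd are never inferred as None), while at runtime the optional
# packages still fall back gracefully when they are not installed.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import numpy as np
    import pandas as pd
else:
    try:
        import numpy as np
    except ImportError:
        np = None
    try:
        import pandas as pd
    except ImportError:
        pd = None

print("numpy available:", np is not None, "pandas available:", pd is not None)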
# ARROW-2638: prevent calling extension class constructors directly with pytest.raises(TypeError): - pa.Array([1, 2]) + pa.Array([1, 2]) # type: ignore[reportCallIssue] def test_list_format(): @@ -321,11 +325,11 @@ def test_asarray(): arr = pa.array(range(4)) - # The iterator interface gives back an array of Int64Value's + # The iterator interface gives back an array of Int64Type's np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert isinstance(np_arr[0], pa.lib.Int64Value) + assert isinstance(np_arr[0], pa.lib.Int64Type) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) @@ -649,8 +653,8 @@ def test_array_eq(): @pytest.mark.numpy def test_array_from_buffers(): - values_buf = pa.py_buffer(np.int16([4, 5, 6, 7])) - nulls_buf = pa.py_buffer(np.uint8([0b00001101])) + values_buf = pa.py_buffer(np.array([4, 5, 6, 7], dtype=np.int16())) + nulls_buf = pa.py_buffer(np.array([0b00001101], dtype=np.uint8())) arr = pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf]) assert arr.type == pa.int16() assert arr.to_pylist() == [4, None, 6, 7] @@ -665,7 +669,9 @@ def test_array_from_buffers(): assert arr.to_pylist() == [None, 6, 7] with pytest.raises(TypeError): - pa.Array.from_buffers(pa.int16(), 3, ['', ''], offset=1) + pa.Array.from_buffers( + pa.int16(), 3, ['', ''], offset=1 # type: ignore[reportArgumentType] + ) def test_string_binary_from_buffers(): @@ -859,7 +865,8 @@ def test_struct_array_from_chunked(): chunked_arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]]) with pytest.raises(TypeError, match="Expected Array"): - pa.StructArray.from_arrays([chunked_arr], ["foo"]) + pa.StructArray.from_arrays( + [chunked_arr], ["foo"]) # type: ignore[reportArgumentType] @pytest.mark.parametrize("offset", (0, 1)) @@ -2861,7 +2868,7 @@ def test_buffers_primitive(): # Slicing does not affect the buffers but the offset a_sliced = a[1:] buffers = a_sliced.buffers() - a_sliced.offset == 1 + assert a_sliced.offset == 1 assert len(buffers) == 2 null_bitmap = buffers[0].to_pybytes() assert 1 <= len(null_bitmap) <= 64 # XXX this is varying @@ -2869,7 +2876,7 @@ def test_buffers_primitive(): assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4) - a = pa.array(np.int8([4, 5, 6])) + a = pa.array(np.array([4, 5, 6], dtype=np.int8)) buffers = a.buffers() assert len(buffers) == 2 # No null bitmap from Numpy int array @@ -2955,7 +2962,7 @@ def test_nbytes_size(): def test_invalid_tensor_constructor_repr(): # ARROW-2638: prevent calling extension class constructors directly with pytest.raises(TypeError): - repr(pa.Tensor([1])) + repr(pa.Tensor([1])) # type: ignore[reportCallIssue] def test_invalid_tensor_construction(): @@ -3473,7 +3480,7 @@ def test_array_supported_masks(): with pytest.raises(pa.ArrowTypeError): arr = pa.array([4, None, 4, 3], - mask=[1.0, 2.0, 3.0, 4.0]) + mask=[1.0, 2.0, 3.0, 4.0]) # type: ignore[reportArgumentType] with pytest.raises(pa.ArrowTypeError): arr = pa.array([4, None, 4, 3], @@ -3760,11 +3767,11 @@ def test_concat_array_invalid_type(): # ARROW-9920 - do not segfault on non-array input with pytest.raises(TypeError, match="should contain Array objects"): - pa.concat_arrays([None]) + pa.concat_arrays([None]) # type: ignore[reportArgumentType] arr = pa.chunked_array([[0, 1], [3, 4]]) with pytest.raises(TypeError, match="should contain Array objects"): - pa.concat_arrays(arr) + pa.concat_arrays(arr) # type: ignore[reportArgumentType] @pytest.mark.pandas @@ -4293,7 +4300,7 
@@ def test_non_cpu_array(): with pytest.raises(NotImplementedError): [i for i in iter(arr)] with pytest.raises(NotImplementedError): - arr == arr2 + _ = arr == arr2 with pytest.raises(NotImplementedError): arr.is_null() with pytest.raises(NotImplementedError): diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 84290a6b880..a6bd48208d9 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -24,7 +24,7 @@ try: from pyarrow.cffi import ffi except ImportError: - ffi = None + ffi = None # type: ignore[assignment] import pytest @@ -32,7 +32,7 @@ import pandas as pd import pandas.testing as tm except ImportError: - pd = tm = None + pd = None # type: ignore[assignment] needs_cffi = pytest.mark.skipif(ffi is None, @@ -148,7 +148,7 @@ def test_export_import_type(): # Invalid format string pa.int32()._export_to_c(ptr_schema) bad_format = ffi.new("char[]", b"zzz") - c_schema.format = bad_format + c_schema.format = bad_format # type: ignore[attr-defined] with pytest.raises(ValueError, match="Invalid or unsupported format string"): pa.DataType._import_from_c(ptr_schema) @@ -248,9 +248,9 @@ def test_export_import_device_array(): arr = pa.array([[1], [2, 42]], type=pa.list_(pa.int32())) arr._export_to_c_device(ptr_array) - assert c_array.device_type == 1 # ARROW_DEVICE_CPU 1 - assert c_array.device_id == -1 - assert c_array.array.length == 2 + assert c_array.device_type == 1 # type: ignore[attr-defined] # ARROW_DEVICE_CPU 1 + assert c_array.device_id == -1 # type: ignore[attr-defined] + assert c_array.array.length == 2 # type: ignore[attr-defined] def check_export_import_schema(schema_factory, expected_schema_factory=None): @@ -310,9 +310,10 @@ def test_export_import_schema_float_pointer(): match = "Passing a pointer value as a float is unsafe" with pytest.warns(UserWarning, match=match): - make_schema()._export_to_c(float(ptr_schema)) + make_schema()._export_to_c(float(ptr_schema)) # type: ignore[arg-type] with pytest.warns(UserWarning, match=match): - schema_new = pa.Schema._import_from_c(float(ptr_schema)) + schema_new = pa.Schema._import_from_c( + float(ptr_schema)) # type: ignore[arg-type] assert schema_new == make_schema() @@ -405,9 +406,9 @@ def test_export_import_device_batch(): ptr_array = int(ffi.cast("uintptr_t", c_array)) batch = make_batch() batch._export_to_c_device(ptr_array) - assert c_array.device_type == 1 # ARROW_DEVICE_CPU 1 - assert c_array.device_id == -1 - assert c_array.array.length == 2 + assert c_array.device_type == 1 # type: ignore[attr-defined] # ARROW_DEVICE_CPU 1 + assert c_array.device_id == -1 # type: ignore[attr-defined] + assert c_array.array.length == 2 # type: ignore[attr-defined] def _export_import_batch_reader(ptr_stream, reader_factory): @@ -765,7 +766,7 @@ def test_import_device_no_cuda(): # patch the device type of the struct, this results in an invalid ArrowDeviceArray # but this is just to test we raise am error before actually importing buffers - c_array.device_type = 2 # ARROW_DEVICE_CUDA + c_array.device_type = 2 # type: ignore[attr-defined] # ARROW_DEVICE_CUDA with pytest.raises(ImportError, match="Trying to import data on a CUDA device"): pa.Array._import_from_c_device(ptr_array, arr.type) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index ca0df36cff2..a1d2aac1630 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -31,12 +31,12 @@ try: import numpy as np except ImportError: - np = None + pass try: import pandas as pd except 
ImportError: - pd = None + pass import pyarrow as pa import pyarrow.compute as pc @@ -45,7 +45,7 @@ try: import pyarrow.substrait as pas except ImportError: - pas = None + pas = None # type: ignore[assignment] exported_functions = [ func for (name, func) in sorted(pc.__dict__.items()) @@ -329,9 +329,11 @@ def test_function_attributes(): def test_input_type_conversion(): # Automatic array conversion from Python arr = pc.add([1, 2], [4, None]) + assert isinstance(arr, pa.Array) assert arr.to_pylist() == [5, None] # Automatic scalar conversion from Python arr = pc.add([1, 2], 4) + assert isinstance(arr, pa.Array) assert arr.to_pylist() == [5, 6] # Other scalar type assert pc.equal(["foo", "bar", None], @@ -779,9 +781,11 @@ def test_min_max(): assert s.as_py() == {'min': 1, 'max': 6} s = pc.min_max(data, options=pc.ScalarAggregateOptions()) assert s.as_py() == {'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) + s = pc.min_max(data, options=pc.ScalarAggregateOptions( + skip_nulls=True)) assert s.as_py() == {'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) + s = pc.min_max(data, options=pc.ScalarAggregateOptions( + skip_nulls=False)) assert s.as_py() == {'min': None, 'max': None} # Options as dict of kwargs @@ -799,11 +803,11 @@ def test_min_max(): # Wrong options type options = pc.TakeOptions() with pytest.raises(TypeError): - s = pc.min_max(data, options=options) + s = pc.min_max(data, options=options) # type: ignore[arg-type] # Missing argument with pytest.raises(TypeError, match="min_max takes 1 positional"): - s = pc.min_max() + s = pc.min_max() # type: ignore[call-arg] def test_any(): @@ -844,12 +848,12 @@ def test_all(): assert pc.all(a, options=options).as_py() is None a = pa.chunked_array([[True], [True, None]]) - assert pc.all(a).as_py() is True - assert pc.all(a, options=options).as_py() is None + assert pc.all(a).as_py() is True # type: ignore[arg-type] + assert pc.all(a, options=options).as_py() is None # type: ignore[arg-type] a = pa.chunked_array([[True], [False]]) - assert pc.all(a).as_py() is False - assert pc.all(a, options=options).as_py() is False + assert pc.all(a).as_py() is False # type: ignore[arg-type] + assert pc.all(a, options=options).as_py() is False # type: ignore[arg-type] def test_is_valid(): @@ -858,7 +862,7 @@ def test_is_valid(): assert pc.is_valid(data).to_pylist() == [True, True, False] with pytest.raises(TypeError): - pc.is_valid(data, options=None) + pc.is_valid(data, options=None) # type: ignore[call-arg] def test_generated_docstrings(): @@ -1037,21 +1041,6 @@ def find_new_unicode_codepoints(): 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } -# utf8proc does not store if a codepoint is numeric -numeric_info_missing = { - 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, - 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, - 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, - 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, - 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, - 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, - 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, - 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, - 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, - 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, - 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, - 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, - 0x10fcb, } # utf8proc has no no digit/numeric information 
digit_info_missing = { 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, @@ -1070,6 +1059,7 @@ def find_new_unicode_codepoints(): 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, } +# utf8proc does not store if a codepoint is numeric numeric_info_missing = { 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, @@ -1104,7 +1094,7 @@ def test_string_py_compat_boolean(function_name, variant): py_name = function_name.replace('_', '') ignore = codepoints_ignore.get(function_name, set()) | \ find_new_unicode_codepoints() - for i in range(128 if ascii else 0x11000): + for i in range(128 if ascii else 0x11000): # type: ignore[truthy-function] if i in range(0xD800, 0xE000): continue # bug? pyarrow doesn't allow utf16 surrogates # the issues we know of, we skip @@ -1593,10 +1583,10 @@ def test_filter_null_type(): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_array(typ): if typ == "array": - def con(values): + def con(values): # type: ignore[no-redef] return pa.array(values) else: - def con(values): + def con(values): # type: ignore[no-redef] return pa.chunked_array([values]) arr1 = con([1, 2, 3, 4, None]) @@ -1624,10 +1614,10 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_string_scalar(typ): if typ == "array": - def con(values): + def con(values): # type: ignore[no-redef] return pa.array(values) else: - def con(values): + def con(values): # type: ignore[no-redef] return pa.chunked_array([values]) arr = con(['a', 'b', 'c', None]) @@ -1661,10 +1651,10 @@ def con(values): @pytest.mark.parametrize("typ", ["array", "chunked_array"]) def test_compare_scalar(typ): if typ == "array": - def con(values): + def con(values): # type: ignore[no-redef] return pa.array(values) else: - def con(values): + def con(values): # type: ignore[no-redef] return pa.chunked_array([values]) arr = con([1, 2, 3, None]) @@ -1757,8 +1747,9 @@ def test_round_to_integer(ty): "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None], } for round_mode, expected in rmode_and_expected.items(): - options = RoundOptions(round_mode=round_mode) - result = round(values, options=options) + options = RoundOptions( + round_mode=round_mode) # type: ignore[arg-type] + result = round(values, options=options) # type: ignore[arg-type] expected_array = pa.array(expected, type=pa.float64()) assert expected_array.equals(result) @@ -1776,7 +1767,9 @@ def test_round(): for ndigits, expected in ndigits_and_expected.items(): options = pc.RoundOptions(ndigits, "half_towards_infinity") result = pc.round(values, options=options) - np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) + assert isinstance(result, pa.Array) + np.testing.assert_allclose( + result, pa.array(expected), equal_nan=True) assert pc.round(values, ndigits, round_mode="half_towards_infinity") == result assert pc.round(values, ndigits, "half_towards_infinity") == result @@ -1796,6 +1789,7 @@ def test_round_to_multiple(): for multiple, expected in multiple_and_expected.items(): options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity") result = pc.round_to_multiple(values, options=options) + assert isinstance(result, pa.Array) np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) assert pc.round_to_multiple(values, multiple, "half_towards_infinity") == result @@ -1992,7 +1986,8 @@ def test_logical(): def test_dictionary_decode(): array = 
pa.array(["a", "a", "b", "c", "b"]) dictionary_array = array.dictionary_encode() - dictionary_array_decode = pc.dictionary_decode(dictionary_array) + dictionary_array_decode = pc.dictionary_decode( + dictionary_array) assert array != dictionary_array @@ -2172,7 +2167,7 @@ def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx, # Round `expected` to `scale` digits after the decimal point expected = expected.quantize(decimal.Decimal(1).scaleb(-decimal_ty.scale)) s = pa.scalar(float_val, type=float_ty) - actual = pc.cast(s, decimal_ty).as_py() + actual = pc.cast(s, decimal_ty).as_py() # type: ignore[union-attr] if actual != expected: # Allow the last digit to vary. The tolerance is higher for # very high precisions as rounding errors can accumulate in @@ -2264,8 +2259,9 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): expected = decimal.Decimal(mantissa) / 2**-float_exp expected_as_int = round(expected.scaleb(scale)) actual = pc.cast( - pa.scalar(float_val, type=float_ty), decimal_ty).as_py() - actual_as_int = round(actual.scaleb(scale)) + pa.scalar(float_val, type=float_ty), decimal_ty + ).as_py() # type: ignore[union-attr] + actual_as_int = round(actual.scaleb(scale)) # type: ignore[union-attr] # We allow for a minor rounding error between expected and actual assert abs(actual_as_int - expected_as_int) <= 1 @@ -2490,10 +2486,11 @@ def test_extract_datetime_components(request): def test_offset_timezone(): - arr = pc.strptime(["2012-12-12T12:12:12"], format="%Y-%m-%dT%H:%M:%S", unit="s") + arr = pc.strptime(pa.array(["2012-12-12T12:12:12"]), + format="%Y-%m-%dT%H:%M:%S", unit="s") zoned_arr = arr.cast(pa.timestamp("s", tz="+05:30")) - assert pc.hour(zoned_arr)[0].as_py() == 17 - assert pc.minute(zoned_arr)[0].as_py() == 42 + assert pc.hour(zoned_arr)[0].as_py() == 17 # type: ignore[index,arg-type] + assert pc.minute(zoned_arr)[0].as_py() == 42 # type: ignore[index,arg-type] @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) @@ -2590,12 +2587,14 @@ def test_assume_timezone(): f"timezone '{timezone}'"): pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) - expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) + expected = ambiguous.tz_localize( + timezone, ambiguous=np.array([True, True, True])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) - expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) + expected = ambiguous.tz_localize( + timezone, ambiguous=np.array([False, False, False])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) @@ -2684,7 +2683,9 @@ def _check_temporal_rounding(ts, values, unit): expected = np.where( expected == ts, - expected + pd.Timedelta(value, unit_shorthand[unit]), + expected + pd.Timedelta( + value, unit_shorthand[unit] # type: ignore[arg-type] + ), expected) np.testing.assert_array_equal(result, expected) @@ -2746,7 +2747,7 @@ def test_count(): with pytest.raises(ValueError, match='"something else" is not a valid count mode'): - pc.count(arr, 'something else') + pc.count(arr, 'something else') # type: ignore[arg-type] def test_index(): @@ -2796,7 +2797,7 @@ def test_partition_nth(): with pytest.raises( ValueError, match="'partition_nth_indices' cannot be called without options"): - pc.partition_nth_indices(data) + pc.partition_nth_indices(data) # type: ignore[call-arg] def test_partition_nth_null_placement(): @@ -2918,7 +2919,7 
@@ def test_array_sort_indices(): assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="not a valid sort order"): - pc.array_sort_indices(arr, order="nonscending") + pc.array_sort_indices(arr, order="nonscending") # type: ignore[arg-type] def test_sort_indices_array(): @@ -2981,23 +2982,29 @@ def test_sort_indices_table(): pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) with pytest.raises(ValueError, match="not a valid sort order"): - pc.sort_indices(table, sort_keys=[("a", "nonscending")]) + pc.sort_indices( + table, sort_keys=[("a", "nonscending")] # type: ignore[list-item] + ) def test_is_in(): arr = pa.array([1, 2, None, 1, 2, 3]) result = pc.is_in(arr, value_set=pa.array([1, 3, None])) - assert result.to_pylist() == [True, False, True, True, False, True] + assert result.to_pylist() == [True, False, True, True, + False, True] result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) - assert result.to_pylist() == [True, False, False, True, False, True] + assert result.to_pylist() == [True, False, False, True, + False, True] result = pc.is_in(arr, value_set=pa.array([1, 3])) - assert result.to_pylist() == [True, False, False, True, False, True] + assert result.to_pylist() == [True, False, False, True, + False, True] result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) - assert result.to_pylist() == [True, False, False, True, False, True] + assert result.to_pylist() == [True, False, False, True, + False, True] def test_index_in(): @@ -3061,7 +3068,7 @@ def test_quantile(): with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): pc.quantile(arr, q=1.1) with pytest.raises(ValueError, match="not a valid quantile interpolation"): - pc.quantile(arr, interpolation='zzz') + pc.quantile(arr, interpolation='zzz') # type: ignore[arg-type] def test_tdigest(): @@ -3170,6 +3177,7 @@ def test_cumulative_sum(start, skip_nulls): # Add `start` offset to expected array before comparing expected = pc.add(expected_arrays[i], strt if strt is not None else 0) + assert isinstance(expected, pa.Array) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) @@ -3225,6 +3233,7 @@ def test_cumulative_prod(start, skip_nulls): # Multiply `start` offset to expected array before comparing expected = pc.multiply(expected_arrays[i], strt if strt is not None else 1) + assert isinstance(expected, pa.Array) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) @@ -3283,8 +3292,10 @@ def test_cumulative_max(start, skip_nulls): expected = pc.max_element_wise( expected_arrays[i], strt if strt is not None else -1e9, skip_nulls=False) - np.testing.assert_array_almost_equal(result.to_numpy( - zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + np.testing.assert_array_almost_equal( + result.to_numpy(zero_copy_only=False), + expected.to_numpy(zero_copy_only=False) + ) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): @@ -3341,8 +3352,10 @@ def test_cumulative_min(start, skip_nulls): expected = pc.min_element_wise( expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) - np.testing.assert_array_almost_equal(result.to_numpy( - zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + np.testing.assert_array_almost_equal( + result.to_numpy(zero_copy_only=False), + expected.to_numpy(zero_copy_only=False) + ) for strt in ['a', pa.scalar('arrow'), 1.1]: with 
pytest.raises(pa.ArrowInvalid): @@ -3472,7 +3485,7 @@ def test_utf8_normalize(): with pytest.raises( ValueError, match='"NFZ" is not a valid Unicode normalization form'): - pc.utf8_normalize(arr, form="NFZ") + pc.utf8_normalize(arr, form="NFZ") # type: ignore[arg-type] def test_random(): @@ -3499,7 +3512,7 @@ def test_random(): with pytest.raises(TypeError, match=r"initializer should be 'system', an integer, " r"or a hashable object; got \[\]"): - pc.random(100, initializer=[]) + pc.random(100, initializer=[]) # type: ignore[arg-type] @pytest.mark.parametrize( @@ -3549,7 +3562,7 @@ def test_rank_options(): match=r'"NonExisting" is not a valid tiebreaker'): pc.RankOptions(sort_keys="descending", null_placement="at_end", - tiebreaker="NonExisting") + tiebreaker="NonExisting") # type: ignore[arg-type] def test_rank_quantile_options(): @@ -3579,7 +3592,7 @@ def test_rank_quantile_options(): assert result.equals(expected_descending) with pytest.raises(ValueError, match="not a valid sort order"): - pc.rank_quantile(arr, sort_keys="XXX") + pc.rank_quantile(arr, sort_keys="XXX") # type: ignore[arg-type] def test_rank_normal_options(): @@ -3765,21 +3778,21 @@ def test_expression_construction(): nested_field = pc.field(("nested", "field")) nested_field2 = pc.field("nested", "field") - zero | one == string - ~true == false + _ = zero | one == string + _ = ~true == false for typ in ("bool", pa.bool_()): - field.cast(typ) == true + _ = field.cast(typ) == true - field.isin([1, 2]) - nested_mixed_types.isin(["foo", "bar"]) + _ = field.isin([1, 2]) + _ = nested_mixed_types.isin(["foo", "bar"]) nested_field.isin(["foo", "bar"]) nested_field2.isin(["foo", "bar"]) with pytest.raises(TypeError): - field.isin(1) + field.isin(1) # type: ignore[arg-type] with pytest.raises(pa.ArrowInvalid): - field != object() + _ = field != object() def test_expression_boolean_operators(): @@ -3788,16 +3801,16 @@ def test_expression_boolean_operators(): false = pc.scalar(False) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - true and false + _ = true and false with pytest.raises(ValueError, match="cannot be evaluated to python True"): - true or false + _ = true or false with pytest.raises(ValueError, match="cannot be evaluated to python True"): bool(true) with pytest.raises(ValueError, match="cannot be evaluated to python True"): - not true + _ = not true def test_expression_call_function(): @@ -3826,7 +3839,7 @@ def test_cast_table_raises(): table = pa.table({'a': [1, 2]}) with pytest.raises(pa.lib.ArrowTypeError): - pc.cast(table, pa.int64()) + pc.cast(table, pa.int64()) # type: ignore[arg-type] @pytest.mark.parametrize("start,stop,expected", ( diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 07286125c4c..b5a472e3225 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -21,13 +21,18 @@ import itertools import math import re +from typing import TYPE_CHECKING, cast import hypothesis as h import pytest -try: + +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + np = None from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa @@ -66,7 +71,7 @@ def __int__(self): class MyBrokenInt: def __int__(self): - 1/0 # MARKER + _ = 1/0 # MARKER def check_struct_type(ty, expected): @@ -145,7 +150,7 @@ def test_object_with_getitem(): # https://github.com/apache/arrow/issues/34944 # considered as 
sequence because of __getitem__, but has no length with pytest.raises(TypeError, match="has no len()"): - pa.array(ObjectWithOnlyGetitem()) + pa.array(ObjectWithOnlyGetitem()) # type: ignore[arg-type] def _as_list(xs): @@ -853,7 +858,7 @@ def test_large_binary_value(ty): assert isinstance(arr, pa.Array) assert arr.type == ty assert len(arr) == 4 - buf = arr[1].as_buffer() + buf = cast(pa.FixedSizeBinaryScalar, arr[1]).as_buffer() assert len(buf) == len(s) * nrepeats @@ -1099,11 +1104,11 @@ def expected_datetime_value(dt): ), ] utcdata = [ - pytz.utc.localize(data[0]), + pytz.utc.localize(cast(datetime.datetime, data[0])), data[1], None, - data[3].astimezone(pytz.utc), - data[4].astimezone(pytz.utc), + cast(datetime.datetime, data[3]).astimezone(pytz.utc), + cast(datetime.datetime, data[4]).astimezone(pytz.utc), ] ty = pa.timestamp(unit, tz=timezone) @@ -1231,9 +1236,9 @@ def test_sequence_timestamp_from_mixed_builtin_and_pandas_datetimes(): None, ] utcdata = [ - data[0].astimezone(pytz.utc), - pytz.utc.localize(data[1]), - data[2].astimezone(pytz.utc), + cast(datetime.datetime, data[0]).astimezone(pytz.utc), + pytz.utc.localize(cast(datetime.datetime, data[1])), + cast(datetime.datetime, data[2]).astimezone(pytz.utc), None, ] @@ -2062,8 +2067,8 @@ def test_map_from_dicts(): assert arr.to_pylist() == expected # With omitted values - data[1] = None - expected[1] = None + data[1] = None # type: ignore[call-overload] + expected[1] = None # type: ignore[call-overload] arr = pa.array(expected, type=pa.map_(pa.binary(), pa.int32())) @@ -2388,6 +2393,7 @@ def test_nested_auto_chunking(ty, char): } +@pytest.mark.numpy @pytest.mark.large_memory def test_array_from_pylist_data_overflow(): # Regression test for ARROW-12983 @@ -2410,6 +2416,7 @@ def test_array_from_pylist_data_overflow(): assert len(arr.chunks) > 1 +@pytest.mark.numpy @pytest.mark.slow @pytest.mark.large_memory def test_array_from_pylist_offset_overflow(): @@ -2434,6 +2441,7 @@ def test_array_from_pylist_offset_overflow(): assert len(arr.chunks) > 1 +@pytest.mark.numpy @parametrize_with_collections_types @pytest.mark.parametrize(('data', 'scalar_data', 'value_type'), [ ([True, False, None], [pa.scalar(True), pa.scalar(False), None], pa.bool_()), @@ -2471,8 +2479,10 @@ def test_array_from_pylist_offset_overflow(): pa.timestamp('us') ), ( - [pa.MonthDayNano([1, -1, -10100])], - [pa.scalar(pa.MonthDayNano([1, -1, -10100]))], + [pa.MonthDayNano([1, -1, -10100])], # type: ignore[call-arg, arg-type] + [pa.scalar( + pa.MonthDayNano([1, -1, -10100]) # type: ignore[call-arg, arg-type] + )], pa.month_day_nano_interval() ), (["a", "b"], [pa.scalar("a"), pa.scalar("b")], pa.string()), diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py index 7508d8f0b98..7d652acf62f 100644 --- a/python/pyarrow/tests/test_cpp_internals.py +++ b/python/pyarrow/tests/test_cpp_internals.py @@ -20,7 +20,8 @@ import pytest -from pyarrow._pyarrow_cpp_tests import get_cpp_tests +from pyarrow._pyarrow_cpp_tests import ( # type: ignore[import-not-found, import-untyped] # noqa: E501 + get_cpp_tests) def inject_cpp_tests(ns): diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index f510c6dbe23..f26625a391a 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -178,6 +178,7 @@ def test_read_options(pickle_module): encoding='utf16', skip_rows_after_names=27) + assert opts.block_size is not None assert opts.block_size > 0 opts.block_size = 12345 assert opts.block_size 
== 12345 @@ -302,6 +303,7 @@ def test_convert_options(pickle_module): with pytest.raises(ValueError): opts.decimal_point = '..' + assert opts.auto_dict_max_cardinality is not None assert opts.auto_dict_max_cardinality > 0 opts.auto_dict_max_cardinality = 99999 assert opts.auto_dict_max_cardinality == 99999 @@ -323,7 +325,7 @@ def test_convert_options(pickle_module): with pytest.raises(TypeError, match='DataType expected'): opts.column_types = {'a': None} with pytest.raises(TypeError): - opts.column_types = 0 + opts.column_types = 0 # type: ignore[reportAttributeAccessIssue] assert isinstance(opts.null_values, list) assert '' in opts.null_values @@ -1158,10 +1160,14 @@ def test_auto_dict_encode(self): table = self.read_bytes(rows, convert_options=opts, validate_full=False) assert table.schema == schema - dict_values = table['a'].chunk(0).dictionary + column_chunk = table.column('a').chunk(0) + assert isinstance(column_chunk, pa.DictionaryArray) + dict_values = column_chunk.dictionary assert len(dict_values) == 2 assert dict_values[0].as_py() == "ab" - assert dict_values[1].as_buffer() == b"cd\xff" + dict_value = dict_values[1] + assert isinstance(dict_value, pa.StringScalar) + assert dict_value.as_buffer() == b"cd\xff" # With invalid UTF8, checked opts.check_utf8 = True @@ -1502,7 +1508,7 @@ def signal_from_thread(): # Interruption should have arrived timely assert last_duration <= 2.0 - e = exc_info.__context__ + e = exc_info.__context__ # type: ignore[possibly-missing-attribute, misc] assert isinstance(e, pa.ArrowCancelled) assert e.signum == signal.SIGINT @@ -1866,6 +1872,9 @@ def use_threads(self): class BaseTestCompressedCSVRead: + def write_file(self, path, content): + pass + csv_filename = "" def setUp(self): self.tmpdir = tempfile.mkdtemp(prefix='arrow-csv-test-') diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index e06f479987c..9d03a3bbff2 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -103,6 +103,7 @@ def make_random_buffer(size, target='host'): assert size >= 0 buf = pa.allocate_buffer(size) assert buf.size == size + assert isinstance(buf, pa.Buffer) arr = np.frombuffer(buf, dtype=np.uint8) assert arr.size == size arr[:] = np.random.randint(low=1, high=255, size=size, dtype=np.uint8) @@ -194,12 +195,14 @@ def test_context_device_buffer(size): np.testing.assert_equal(arr[soffset:soffset + ssize], arr2) # Creating a device buffer from a slice of an array - cudabuf = global_context.buffer_from_data(arr, offset=soffset, size=ssize) + cudabuf = global_context.buffer_from_data( + arr, offset=soffset, size=ssize) assert cudabuf.size == ssize arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8) np.testing.assert_equal(arr[soffset:soffset + ssize], arr2) - cudabuf = global_context.buffer_from_data(arr[soffset:soffset+ssize]) + cudabuf = global_context.buffer_from_data( + arr[soffset:soffset+ssize]) assert cudabuf.size == ssize arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8) np.testing.assert_equal(arr[soffset:soffset + ssize], arr2) @@ -235,7 +238,8 @@ def test_context_device_buffer(size): # Creating device buffer from HostBuffer slice - cudabuf = global_context.buffer_from_data(buf, offset=soffset, size=ssize) + cudabuf = global_context.buffer_from_data( + buf, offset=soffset, size=ssize) assert cudabuf.size == ssize arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8) np.testing.assert_equal(arr[soffset:soffset+ssize], arr2) @@ -384,7 +388,8 @@ def test_copy_from_to_host(size): 
device_buffer.copy_from_host(buf, position=0, nbytes=nbytes) # Copy back to host and compare contents - buf2 = device_buffer.copy_to_host(position=0, nbytes=nbytes) + buf2 = device_buffer.copy_to_host( + position=0, nbytes=nbytes) arr2 = np.frombuffer(buf2, dtype=dt) np.testing.assert_equal(arr, arr2) @@ -395,7 +400,8 @@ def test_copy_to_host(size): buf = dbuf.copy_to_host() assert buf.is_cpu - np.testing.assert_equal(arr, np.frombuffer(buf, dtype=np.uint8)) + np.testing.assert_equal(arr, np.frombuffer( + buf, dtype=np.uint8)) buf = dbuf.copy_to_host(position=size//4) assert buf.is_cpu @@ -437,11 +443,13 @@ def test_copy_to_host(size): np.frombuffer(buf, dtype=np.uint8)) dbuf.copy_to_host(buf=buf, nbytes=12) - np.testing.assert_equal(arr[:12], np.frombuffer(buf, dtype=np.uint8)[:12]) + np.testing.assert_equal(arr[:12], np.frombuffer( + buf, dtype=np.uint8)[:12]) dbuf.copy_to_host(buf=buf, nbytes=12, position=6) - np.testing.assert_equal(arr[6:6+12], - np.frombuffer(buf, dtype=np.uint8)[:12]) + np.testing.assert_equal( + arr[6:6+12], np.frombuffer(buf, dtype=np.uint8)[:12] + ) for (position, nbytes) in [ (0, size+10), (10, size-5), @@ -450,7 +458,8 @@ def test_copy_to_host(size): with pytest.raises(ValueError, match=('requested copy does not ' 'fit into host buffer')): - dbuf.copy_to_host(buf=buf, position=position, nbytes=nbytes) + dbuf.copy_to_host( + buf=buf, position=position, nbytes=nbytes) @pytest.mark.parametrize("dest_ctx", ['same', 'another']) @@ -460,7 +469,9 @@ def test_copy_from_device(dest_ctx, size): lst = arr.tolist() if dest_ctx == 'another': dest_ctx = global_context1 - if buf.context.device_number == dest_ctx.device_number: + if ( + buf.context.device_number == dest_ctx.device_number + ): pytest.skip("not a multi-GPU system") else: dest_ctx = buf.context @@ -563,7 +574,10 @@ def test_buffer_device(): _, buf = make_random_buffer(size=10, target='device') assert buf.device_type == pa.DeviceAllocationType.CUDA assert isinstance(buf.device, pa.Device) - assert buf.device == global_context.memory_manager.device + assert ( + buf.device == + global_context.memory_manager.device + ) assert isinstance(buf.memory_manager, pa.MemoryManager) assert not buf.is_cpu assert not buf.device.is_cpu @@ -807,8 +821,9 @@ def test_create_table_with_device_buffers(): def other_process_for_test_IPC(handle_buffer, expected_arr): - other_context = pa.cuda.Context(0) - ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer) + other_context = cuda.Context(0) + ipc_handle = cuda.IpcMemHandle.from_buffer( + handle_buffer) ipc_buf = other_context.open_ipc_buffer(ipc_handle) ipc_buf.context.synchronize() buf = ipc_buf.copy_to_host() @@ -848,7 +863,8 @@ def test_copy_to(): batch = pa.record_batch({"col": arr}) batch_cuda = batch.copy_to(dest) - buf_cuda = batch_cuda["col"].buffers()[1] + buf_cuda = batch_cuda.column("col").buffers()[1] + assert buf_cuda is not None assert not buf_cuda.is_cpu assert buf_cuda.device_type == pa.DeviceAllocationType.CUDA assert buf_cuda.device == mm_cuda.device @@ -949,7 +965,8 @@ def test_device_interface_batch_array(): cbatch._export_to_c_device(ptr_array, ptr_schema) # Delete and recreate C++ objects from exported pointers del cbatch - cbatch_new = pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) + cbatch_new = pa.RecordBatch._import_from_c_device( + ptr_array, ptr_schema) assert cbatch_new.schema == schema batch_new = cbatch_new.copy_to(pa.default_cpu_memory_manager()) assert batch_new.equals(batch) @@ -957,13 +974,15 @@ def 
test_device_interface_batch_array(): del cbatch_new # Now released with pytest.raises(ValueError, match="Cannot import released ArrowSchema"): - pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) + pa.RecordBatch._import_from_c_device( + ptr_array, ptr_schema) # Not a struct type pa.int32()._export_to_c(ptr_schema) with pytest.raises(ValueError, match="ArrowSchema describes non-struct type"): - pa.RecordBatch._import_from_c_device(ptr_array, ptr_schema) + pa.RecordBatch._import_from_c_device( + ptr_array, ptr_schema) def test_print_array(): diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index 876f3c7f761..4a5bc797533 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -28,7 +28,6 @@ from numba.cuda.cudadrv.devicearray import DeviceNDArray # noqa: E402 - context_choices = None context_choice_ids = ['pyarrow.cuda', 'numba.cuda'] @@ -62,17 +61,19 @@ def test_context(c): def make_random_buffer(size, target='host', dtype='uint8', ctx=None): """Return a host or device buffer with random data. """ - dtype = np.dtype(dtype) + assert np is not None + dtype_obj = np.dtype(dtype) if target == 'host': assert size >= 0 - buf = pa.allocate_buffer(size*dtype.itemsize) - arr = np.frombuffer(buf, dtype=dtype) + buf = pa.allocate_buffer(size*dtype_obj.itemsize) + arr = np.frombuffer(buf, dtype=dtype_obj) arr[:] = np.random.randint(low=0, high=255, size=size, dtype=np.uint8) return arr, buf elif target == 'device': arr, buf = make_random_buffer(size, target='host', dtype=dtype) - dbuf = ctx.new_buffer(size * dtype.itemsize) + assert ctx is not None + dbuf = ctx.new_buffer(size * dtype_obj.itemsize) dbuf.copy_from_host(buf, position=0, nbytes=buf.size) return arr, dbuf raise ValueError('invalid target value') @@ -161,8 +162,8 @@ def __cuda_array_interface__(self): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_memalloc(c, dtype): + assert np is not None ctx, nb_ctx = context_choices[c] - dtype = np.dtype(dtype) # Allocate memory using numba context # Warning: this will not be reflected in pyarrow context manager # (e.g bytes_allocated does not change) @@ -198,6 +199,7 @@ def test_pyarrow_memalloc(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_context(c, dtype): + assert np is not None ctx, nb_ctx = context_choices[c] size = 10 with nb_cuda.gpus[0]: @@ -209,7 +211,10 @@ def test_numba_context(c, dtype): np.testing.assert_equal(darr.copy_to_host(), arr) darr[0] = 99 cbuf.context.synchronize() - arr2 = np.frombuffer(cbuf.copy_to_host(), dtype=dtype) + arr2 = np.frombuffer( + cbuf.copy_to_host(), + dtype=np.dtype(dtype) + ) assert arr2[0] == 99 @@ -217,6 +222,7 @@ def test_numba_context(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_pyarrow_jit(c, dtype): + assert np is not None ctx, nb_ctx = context_choices[c] @nb_cuda.jit @@ -234,5 +240,8 @@ def increment_by_one(an_array): darr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=mem) increment_by_one[blockspergrid, threadsperblock](darr) cbuf.context.synchronize() - arr1 = np.frombuffer(cbuf.copy_to_host(), dtype=arr.dtype) + arr1 = np.frombuffer( + cbuf.copy_to_host(), + dtype=arr.dtype + ) np.testing.assert_equal(arr1, arr + 1) diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index e0116a4bb76..9f050a5e3f8 100644 --- 
a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -89,7 +89,7 @@ def test_cython_api(tmpdir): Basic test for the Cython API. """ # Fail early if cython is not found - import cython # noqa + import cython # type: ignore[import-untyped, import-not-found] # noqa with tmpdir.as_cwd(): # Set up temporary workspace diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 32bcebb28de..58f8262e4a5 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -32,7 +32,7 @@ try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa @@ -40,6 +40,7 @@ import pyarrow.csv import pyarrow.feather import pyarrow.fs as fs +from pyarrow.fs import FileInfo import pyarrow.json from pyarrow.lib import is_threading_enabled from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, @@ -49,17 +50,17 @@ try: import pandas as pd except ImportError: - pd = None + pass try: import pyarrow.dataset as ds except ImportError: - ds = None + pass try: import pyarrow.parquet as pq except ImportError: - pq = None + pass # Marks all of the tests in this module # Ignore these with pytest ... -m 'not dataset' @@ -395,14 +396,16 @@ def test_filesystem_dataset(mockfs): # validation of required arguments with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset(fragments, file_format, schema) + ds.FileSystemDataset(fragments, file_format, schema) # type: ignore[arg-type] # validation of root_partition with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset(fragments, schema=schema, - format=file_format, root_partition=1) + ds.FileSystemDataset( + fragments, schema=schema, format=file_format, + root_partition=1) # type: ignore[arg-type] # missing required argument in from_paths with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset.from_paths(fragments, format=file_format) + ds.FileSystemDataset.from_paths( + fragments, format=file_format) # type: ignore[arg-type] def test_filesystem_dataset_no_filesystem_interaction(dataset_reader): @@ -820,7 +823,8 @@ def test_partitioning(): load_back = None with pytest.raises(ValueError, match="Expected Partitioning or PartitioningFactory"): - load_back = ds.dataset(tempdir, format='ipc', partitioning=int(0)) + load_back = ds.dataset( + tempdir, format='ipc', partitioning=int(0)) # type: ignore[arg-type] assert load_back is None @@ -852,8 +856,8 @@ def test_partitioning_pickling(pickle_module): ) def test_dataset_partitioning_format( flavor: str, - expected_defined_partition: tuple, - expected_undefined_partition: tuple, + expected_defined_partition: tuple[str], + expected_undefined_partition: tuple[str], ): partitioning_schema = pa.schema([("foo", pa.string()), ("bar", pa.string())]) @@ -1208,6 +1212,7 @@ def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() dataset = ds.dataset('/plain', filesystem=multisourcefs, format=parquet_format) + assert isinstance(dataset, ds.FileSystemDataset) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -1245,7 +1250,9 @@ def test_make_fragment_with_size(s3_example_simple): assert tbl.equals(table) # true sizes -> works - sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files] + dataset_file_info = [dataset.filesystem.get_file_info(x) for x in dataset.files] + sizes_true = [x.size if isinstance( + x, FileInfo) else None for x in dataset_file_info] 
fragments_with_size = [file_format.make_fragment(path, fs, file_size=size) for path, size in zip(paths, sizes_true)] dataset_with_size = ds.FileSystemDataset( @@ -1936,6 +1943,7 @@ def test_fragments_repr(tempdir, dataset): # single-file parquet dataset (no partition information in repr) table, path = _create_single_file(tempdir) dataset = ds.dataset(path, format="parquet") + assert isinstance(dataset, ds.FileSystemDataset) fragment = list(dataset.get_fragments())[0] assert ( repr(fragment) == @@ -1947,6 +1955,7 @@ def test_fragments_repr(tempdir, dataset): path = tempdir / "data.feather" pa.feather.write_feather(table, path) dataset = ds.dataset(path, format="feather") + assert isinstance(dataset, ds.FileSystemDataset) fragment = list(dataset.get_fragments())[0] assert ( repr(fragment) == @@ -2058,7 +2067,7 @@ def test_partitioning_factory_segment_encoding(pickled, pickle_module): actual = factory.finish().to_table(columns={ "date_int": ds.field("date").cast(pa.int64()), }) - assert actual[0][0].as_py() == 1620086400 + assert actual.column(0).chunk(0)[0].as_py() == 1620086400 partitioning_factory = ds.DirectoryPartitioning.discover( ["date", "string"], segment_encoding="none") @@ -2098,7 +2107,7 @@ def test_partitioning_factory_segment_encoding(pickled, pickle_module): actual = factory.finish().to_table(columns={ "date_int": ds.field("date").cast(pa.int64()), }) - assert actual[0][0].as_py() == 1620086400 + assert actual.column(0).chunk(0)[0].as_py() == 1620086400 partitioning_factory = ds.HivePartitioning.discover( segment_encoding="none") @@ -2166,7 +2175,7 @@ def test_partitioning_factory_hive_segment_encoding_key_encoded(pickled, pickle_ actual = factory.finish().to_table(columns={ "date_int": ds.field("test'; date").cast(pa.int64()), }) - assert actual[0][0].as_py() == 1620086400 + assert actual.column(0).chunk(0)[0].as_py() == 1620086400 partitioning_factory = ds.HivePartitioning.discover( segment_encoding="uri") @@ -2224,7 +2233,7 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir): def test_positional_keywords_raises(tempdir): table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']}) with pytest.raises(TypeError): - ds.write_dataset(table, tempdir, "basename-{i}.arrow") + ds.write_dataset(table, tempdir, "basename-{i}.arrow") # type: ignore[arg-type] @pytest.mark.parquet @@ -2238,20 +2247,20 @@ def test_read_partition_keys_only(tempdir): 'key': pa.repeat(0, BATCH_SIZE + 1), 'value': np.arange(BATCH_SIZE + 1)}) pq.write_to_dataset( - table[:BATCH_SIZE], + table[:BATCH_SIZE], # type: ignore[arg-type] tempdir / 'one', partition_cols=['key']) pq.write_to_dataset( - table[:BATCH_SIZE + 1], + table[:BATCH_SIZE + 1], # type: ignore[arg-type] tempdir / 'two', partition_cols=['key']) table = pq.read_table(tempdir / 'one', columns=['key']) - assert table['key'].num_chunks == 1 + assert table.column('key').num_chunks == 1 table = pq.read_table(tempdir / 'two', columns=['key', 'value']) - assert table['key'].num_chunks == 2 + assert table.column('key').num_chunks == 2 table = pq.read_table(tempdir / 'two', columns=['key']) - assert table['key'].num_chunks == 2 + assert table.column('key').num_chunks == 2 def _has_subdirs(basedir): @@ -2312,9 +2321,9 @@ def test_partitioning_function(): with pytest.raises(ValueError): ds.partitioning() with pytest.raises(ValueError, match="Expected list"): - ds.partitioning(field_names=schema) + ds.partitioning(field_names=schema) # type: ignore[arg-type] with pytest.raises(ValueError, match="Cannot specify both"): - ds.partitioning(schema, 
field_names=schema) + ds.partitioning(schema, field_names=schema) # type: ignore[call-overload] # Hive partitioning part = ds.partitioning(schema, flavor="hive") @@ -2325,13 +2334,13 @@ def test_partitioning_function(): assert isinstance(part, ds.PartitioningFactory) # cannot pass list of names with pytest.raises(ValueError): - ds.partitioning(names, flavor="hive") + ds.partitioning(names, flavor="hive") # type: ignore[arg-type] with pytest.raises(ValueError, match="Cannot specify 'field_names'"): ds.partitioning(field_names=names, flavor="hive") # unsupported flavor with pytest.raises(ValueError): - ds.partitioning(schema, flavor="unsupported") + ds.partitioning(schema, flavor="unsupported") # type: ignore[arg-type] @pytest.mark.parquet @@ -2346,6 +2355,8 @@ def test_directory_partitioning_dictionary_key(mockfs): dataset = ds.dataset( "subdir", format="parquet", filesystem=mockfs, partitioning=part ) + assert isinstance(dataset, ds.FileSystemDataset) + assert dataset.partitioning is not None assert dataset.partitioning.schema == schema table = dataset.to_table() @@ -2366,6 +2377,8 @@ def test_hive_partitioning_dictionary_key(multisourcefs): dataset = ds.dataset( "hive", format="parquet", filesystem=multisourcefs, partitioning=part ) + assert isinstance(dataset, ds.FileSystemDataset) + assert dataset.partitioning is not None assert dataset.partitioning.schema == schema table = dataset.to_table() @@ -2373,11 +2386,13 @@ def test_hive_partitioning_dictionary_key(multisourcefs): month_dictionary = list(range(1, 13)) assert table.column('year').type.equals(schema.types[0]) for chunk in table.column('year').chunks: + assert isinstance(chunk, pa.DictionaryArray) actual = chunk.dictionary.to_pylist() actual.sort() assert actual == year_dictionary assert table.column('month').type.equals(schema.types[1]) for chunk in table.column('month').chunks: + assert isinstance(chunk, pa.DictionaryArray) actual = chunk.dictionary.to_pylist() actual.sort() assert actual == month_dictionary @@ -2567,6 +2582,8 @@ def test_construct_from_mixed_child_datasets(mockfs): 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) b = ds.dataset('subdir', filesystem=mockfs) + assert isinstance(a, ds.FileSystemDataset) + assert isinstance(b, ds.FileSystemDataset) dataset = ds.dataset([a, b]) assert isinstance(dataset, ds.UnionDataset) @@ -2578,8 +2595,8 @@ def test_construct_from_mixed_child_datasets(mockfs): assert len(dataset.children) == 2 for child in dataset.children: - assert child.files == ['subdir/1/xxx/file0.parquet', - 'subdir/2/yyy/file1.parquet'] + assert child.files == [ # type: ignore[attr-defined] + 'subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] def test_construct_empty_dataset(): @@ -2613,7 +2630,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): batch2 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["b"]) with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): - ds.dataset([child1, child2]) + ds.dataset([child1, child2]) # type: ignore[arg-type] expected = ( "Expected a list of path-like or dataset objects, or a list " @@ -2621,14 +2638,14 @@ def test_construct_from_invalid_sources_raise(multisourcefs): "types: int" ) with pytest.raises(TypeError, match=expected): - ds.dataset([1, 2, 3]) + ds.dataset([1, 2, 3]) # type: ignore[arg-type] expected = ( "Expected a path-like, list of path-likes or a list of Datasets " "instead of the given type: NoneType" ) with pytest.raises(TypeError, match=expected): - ds.dataset(None) + ds.dataset(None) # type: 
ignore[arg-type] expected = ( "Expected a path-like, list of path-likes or a list of Datasets " @@ -2655,7 +2672,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): "batches or tables. The given list contains the following types:" ) with pytest.raises(TypeError, match=expected): - ds.dataset([batch1, 0]) + ds.dataset([batch1, 0]) # type: ignore[arg-type] expected = ( "Expected a list of tables or batches. The given list contains a int" @@ -2745,7 +2762,7 @@ def test_open_dataset_partitioned_directory(tempdir, dataset_reader, pickle_modu dataset = ds.dataset( str(path), partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive")) + schema=pa.schema([("part", pa.int8())]), flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -2790,7 +2807,7 @@ def test_open_union_dataset(tempdir, dataset_reader, pickle_module): _, path = _create_single_file(tempdir) dataset = ds.dataset(path) - union = ds.dataset([dataset, dataset]) + union = ds.dataset([dataset, dataset]) # type: ignore[arg-type] assert isinstance(union, ds.UnionDataset) pickled = pickle_module.loads(pickle_module.dumps(union)) @@ -2800,7 +2817,7 @@ def test_open_union_dataset(tempdir, dataset_reader, pickle_module): def test_open_union_dataset_with_additional_kwargs(multisourcefs): child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') with pytest.raises(ValueError, match="cannot pass any additional"): - ds.dataset([child], format="parquet") + ds.dataset([child], format="parquet") # type: ignore[arg-type] def test_open_dataset_non_existing_file(): @@ -2887,7 +2904,7 @@ def expected_type(key): def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module): # https://issues.apache.org/jira/browse/ARROW-11400 table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)}) - part = ds.partitioning(table.select(['part']).schema, flavor="hive") + part = ds.partitioning(schema=table.select(['part']).schema, flavor="hive") ds.write_dataset(table, tempdir, partitioning=part, format="feather") dataset = ds.dataset( @@ -2895,7 +2912,7 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module) partitioning=ds.HivePartitioning.discover(infer_dictionary=True) ) expected = pa.table( - {'col': table['col'], 'part': table['part'].dictionary_encode()} + {'col': table.column('col'), 'part': table.column('part').dictionary_encode()} ) assert dataset.to_table().equals(expected) fragment = list(dataset.get_fragments())[0] @@ -2980,7 +2997,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): assert dataset.to_table().equals(table) # directly passing the fsspec-handler - fs = PyFileSystem(FSSpecHandler(fs)) + fs = PyFileSystem(FSSpecHandler(fs)) # type: ignore[abstract] dataset = ds.dataset(path, format="parquet", filesystem=fs) assert dataset.to_table().equals(table) @@ -3082,7 +3099,7 @@ def test_file_format_inspect_fsspec(tempdir): format = ds.ParquetFileFormat() # manually creating a PyFileSystem instead of using fs._ensure_filesystem # which would convert an fsspec local filesystem to a native one - filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) + filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) # type: ignore[abstract] schema = format.inspect(path, filesystem) assert schema.equals(table.schema) @@ -3100,11 +3117,11 @@ def test_filter_timestamp(tempdir, dataset_reader): "id": range(10)}) # write dataset partitioned on dates (as strings) - part = 
ds.partitioning(table.select(['dates']).schema, flavor="hive") + part = ds.partitioning(schema=table.select(['dates']).schema, flavor="hive") ds.write_dataset(table, path, partitioning=part, format="feather") # read dataset partitioned on dates (as timestamps) - part = ds.partitioning(pa.schema([("dates", pa.timestamp("s"))]), + part = ds.partitioning(schema=pa.schema([("dates", pa.timestamp("s"))]), flavor="hive") dataset = ds.dataset(path, format="feather", partitioning=part) @@ -3155,7 +3172,7 @@ def test_filter_compute_expression(tempdir, dataset_reader): filter_ = pc.is_in(ds.field('A'), pa.array(["a", "b"])) assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 3 - filter_ = pc.hour(ds.field('B')) >= 3 + filter_ = pc.hour(ds.field('B')) >= 3 # type: ignore[operator] assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 2 days = pc.days_between(ds.field('B'), ds.field("C")) @@ -3187,12 +3204,12 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): assert child1.schema != child2.schema != child3.schema - assembled = ds.dataset([child1, child2, child3]) + assembled = ds.dataset([child1, child2, child3]) # type: ignore[arg-type] assert isinstance(assembled, ds.UnionDataset) msg = 'cannot pass any additional arguments' with pytest.raises(ValueError, match=msg): - ds.dataset([child1, child2], filesystem=multisourcefs) + ds.dataset([child1, child2], filesystem=multisourcefs) # type: ignore[arg-type] expected_schema = pa.schema([ ('date', pa.date32()), @@ -3206,7 +3223,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) - assembled = ds.dataset([child1, child3]) + assembled = ds.dataset([child1, child3]) # type: ignore[arg-type] expected_schema = pa.schema([ ('date', pa.date32()), ('index', pa.int64()), @@ -3223,6 +3240,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): ('color', pa.string()), ('date', pa.date32()), ]) + # type: ignore[arg-type] assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) @@ -3231,6 +3249,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): ('color', pa.string()), ('unknown', pa.string()) # fill with nulls ]) + # type: ignore[arg-type] assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) @@ -3241,7 +3260,7 @@ def test_union_dataset_from_other_datasets(tempdir, multisourcefs): child4 = ds.dataset(path) with pytest.raises(pa.ArrowTypeError, match='Unable to merge'): - ds.dataset([child1, child4]) + ds.dataset([child1, child4]) # type: ignore[arg-type] def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): @@ -3252,7 +3271,7 @@ def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): def test_union_dataset_filesystem_datasets(multisourcefs): # without partitioning - dataset = ds.dataset([ + dataset = ds.dataset([ # type: ignore[arg-type] ds.dataset('/plain', filesystem=multisourcefs), ds.dataset('/schema', filesystem=multisourcefs), ds.dataset('/hive', filesystem=multisourcefs), @@ -3266,7 +3285,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs): assert dataset.schema.equals(expected_schema) # with hive partitioning for two hive sources - dataset = ds.dataset([ + dataset = ds.dataset([ # type: ignore[arg-type] ds.dataset('/plain', filesystem=multisourcefs), 
ds.dataset('/schema', filesystem=multisourcefs), ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive') @@ -3326,7 +3345,7 @@ def _check_dataset(schema, expected, expected_schema=None): # Specifying with differing field types schema = pa.schema([('a', 'int32'), ('b', 'float64')]) dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema) - expected = pa.table([table['a'].cast('int32'), + expected = pa.table([table['a'].cast('int32'), # type: ignore[arg-type] table['b']], names=['a', 'b']) _check_dataset(schema, expected) @@ -3827,7 +3846,7 @@ def test_parquet_dataset_factory_fsspec(tempdir): fsspec_fs = fsspec.filesystem("file") # manually creating a PyFileSystem, because passing the local fsspec # filesystem would internally be converted to native LocalFileSystem - filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) + filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) # type: ignore[abstract] dataset = ds.parquet_dataset(metadata_path, filesystem=filesystem) assert dataset.schema.equals(table.schema) assert len(dataset.files) == 4 @@ -4035,12 +4054,14 @@ def test_filter_mismatching_schema(tempdir, dataset_reader): # filtering on a column with such type mismatch should implicitly # cast the column filtered = dataset_reader.to_table(dataset, filter=ds.field("col") > 2) - assert filtered["col"].equals(table["col"].cast('int64').slice(2)) + assert filtered["col"].equals(table["col"].cast( + 'int64').slice(2)) # type: ignore[arg-type] fragment = list(dataset.get_fragments())[0] filtered = dataset_reader.to_table( fragment, filter=ds.field("col") > 2, schema=schema) - assert filtered["col"].equals(table["col"].cast('int64').slice(2)) + assert filtered["col"].equals(table["col"].cast( + 'int64').slice(2)) # type: ignore[arg-type] @pytest.mark.parquet @@ -4105,6 +4126,7 @@ def test_dataset_preserved_partitioning(tempdir): # through discovery, but without partitioning _, path = _create_single_file(tempdir) dataset = ds.dataset(path) + assert isinstance(dataset, ds.FileSystemDataset) assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # TODO(GH-34884) partitioning attribute not preserved in pickling # dataset_ = ds.dataset(path) @@ -4114,10 +4136,12 @@ def test_dataset_preserved_partitioning(tempdir): # through discovery, with hive partitioning but not specified full_table, path = _create_partitioned_dataset(tempdir) dataset = ds.dataset(path) + assert isinstance(dataset, ds.FileSystemDataset) assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # through discovery, with hive partitioning (from a partitioning factory) dataset = ds.dataset(path, partitioning="hive") + assert isinstance(dataset, ds.FileSystemDataset) part = dataset.partitioning assert part is not None assert isinstance(part, ds.HivePartitioning) @@ -4126,11 +4150,12 @@ def test_dataset_preserved_partitioning(tempdir): assert part.dictionaries[0] == pa.array([0, 1, 2], pa.int32()) # through discovery, with hive partitioning (from a partitioning object) - part = ds.partitioning(pa.schema([("part", pa.int32())]), flavor="hive") + part = ds.partitioning(schema=pa.schema([("part", pa.int32())]), flavor="hive") assert isinstance(part, ds.HivePartitioning) # not a factory assert len(part.dictionaries) == 1 assert all(x is None for x in part.dictionaries) dataset = ds.dataset(path, partitioning=part) + assert isinstance(dataset, ds.FileSystemDataset) part = dataset.partitioning assert isinstance(part, ds.HivePartitioning) assert part.schema == pa.schema([("part", pa.int32())]) @@ 
-4140,6 +4165,7 @@ def test_dataset_preserved_partitioning(tempdir): # through manual creation -> not available dataset = ds.dataset(path, partitioning="hive") + assert isinstance(dataset, ds.FileSystemDataset) dataset2 = ds.FileSystemDataset( list(dataset.get_fragments()), schema=dataset.schema, format=dataset.format, filesystem=dataset.filesystem @@ -4185,7 +4211,7 @@ def _sort_table(tab, sort_col): import pyarrow.compute as pc sorted_indices = pc.sort_indices( tab, options=pc.SortOptions([(sort_col, 'ascending')])) - return pc.take(tab, sorted_indices) + return pc.take(tab, sorted_indices) # type: ignore[arg-type] def _check_dataset_roundtrip(dataset, base_dir, expected_files, sort_col, @@ -4258,7 +4284,7 @@ def test_write_dataset_partitioned(tempdir): target / "part=b", target / "part=b" / "part-0.arrow" ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") _check_dataset_roundtrip( dataset, str(target), expected_paths, 'f1', target, partitioning=partitioning_schema) @@ -4270,7 +4296,7 @@ def test_write_dataset_partitioned(tempdir): target / "b", target / "b" / "part-0.arrow" ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())])) + schema=pa.schema([("part", pa.string())])) _check_dataset_roundtrip( dataset, str(target), expected_paths, 'f1', target, partitioning=partitioning_schema) @@ -4283,6 +4309,7 @@ def test_write_dataset_with_field_names(tempdir): partitioning=["b"]) load_back = ds.dataset(tempdir, format='ipc', partitioning=["b"]) + assert isinstance(load_back, ds.FileSystemDataset) files = load_back.files partitioning_dirs = { str(pathlib.Path(f).relative_to(tempdir).parent) for f in files @@ -4300,6 +4327,7 @@ def test_write_dataset_with_field_names_hive(tempdir): partitioning=["b"], partitioning_flavor="hive") load_back = ds.dataset(tempdir, format='ipc', partitioning="hive") + assert isinstance(load_back, ds.FileSystemDataset) files = load_back.files partitioning_dirs = { str(pathlib.Path(f).relative_to(tempdir).parent) for f in files @@ -4617,7 +4645,7 @@ def test_write_dataset_max_open_files(tempdir): record_batch_3, record_batch_4]) partitioning = ds.partitioning( - pa.schema([(column_names[partition_column_id], pa.string())]), + schema=pa.schema([(column_names[partition_column_id], pa.string())]), flavor="hive") data_source_1 = directory / "default" @@ -4631,7 +4659,8 @@ def test_write_dataset_max_open_files(tempdir): def _get_compare_pair(data_source, record_batch, file_format, col_id): num_of_files_generated = _get_num_of_files_generated( base_directory=data_source, file_format=file_format) - number_of_partitions = len(pa.compute.unique(record_batch[col_id])) + unique_vals = pa.compute.unique(record_batch[col_id]) + number_of_partitions = len(unique_vals) # type: ignore[arg-type] return num_of_files_generated, number_of_partitions # CASE 1: when max_open_files=default & max_open_files >= num_of_partitions @@ -4678,7 +4707,7 @@ def test_write_dataset_partitioned_dict(tempdir): target / "a", target / "a" / "part-0.arrow", target / "b", target / "b" / "part-0.arrow" ] - partitioning = ds.partitioning(pa.schema([ + partitioning = ds.partitioning(schema=pa.schema([ dataset.schema.field('part')]), dictionaries={'part': pa.array(['a', 'b'])}) # NB: dictionaries required here since we use partitioning to parse @@ -4697,7 +4726,7 @@ def test_write_dataset_use_threads(tempdir): dataset = ds.dataset(directory, partitioning="hive") partitioning = 
ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / 'partitioned1' paths_written = [] @@ -4737,7 +4766,7 @@ def test_write_dataset_use_threads_preserve_order(tempdir): batches = table.to_batches(max_chunksize=2) ds.write_dataset(batches, tempdir, format="parquet", use_threads=True, preserve_order=True) - seq = ds.dataset(tempdir).to_table(use_threads=False)['a'].to_numpy() + seq = ds.dataset(tempdir).to_table(use_threads=False).column('a').to_numpy() prev = -1 for item in seq: curr = int(item) @@ -4777,7 +4806,7 @@ def file_visitor(written_file): visited_sizes.append(written_file.size) partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset(table, base_dir, format="feather", basename_template='dat_{i}.arrow', partitioning=partitioning, file_visitor=file_visitor) @@ -4889,7 +4918,7 @@ def test_write_table_partitioned_dict(tempdir): pa.array(['a'] * 10 + ['b'] * 10).dictionary_encode(), ], names=['col', 'part']) - partitioning = ds.partitioning(table.select(["part"]).schema) + partitioning = ds.partitioning(schema=table.select(["part"]).schema) base_dir = tempdir / "dataset" ds.write_dataset( @@ -5007,7 +5036,7 @@ def test_partition_dataset_parquet_file_visitor(tempdir): root_path = tempdir / 'partitioned' partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + schema=pa.schema([("part", pa.string())]), flavor="hive") paths_written = [] @@ -5040,11 +5069,11 @@ def test_write_dataset_arrow_schema_metadata(tempdir): # ensure we serialize ARROW schema in the parquet metadata, to have a # correct roundtrip (e.g. preserve non-UTC timezone) table = pa.table({"a": [pd.Timestamp("2012-01-01", tz="Europe/Brussels")]}) - assert table["a"].type.tz == "Europe/Brussels" + assert table.column("a").type.tz == "Europe/Brussels" ds.write_dataset(table, tempdir, format="parquet") result = pq.read_table(tempdir / "part-0.parquet") - assert result["a"].type.tz == "Europe/Brussels" + assert result.column("a").type.tz == "Europe/Brussels" def test_write_dataset_schema_metadata(tempdir): @@ -5085,7 +5114,7 @@ def test_write_dataset_s3(s3_example_simple): pa.array(['a'] * 10 + ['b'] * 10)], names=["f1", "f2", "part"] ) - part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + part = ds.partitioning(schema=pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object ds.write_dataset( @@ -5164,7 +5193,7 @@ def test_write_dataset_s3_put_only(s3_server): pa.array(['a']*10 + ['b'] * 10)], names=["f1", "f2", "part"] ) - part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + part = ds.partitioning(schema=pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object with create_dir flag set to false ds.write_dataset( @@ -5542,7 +5571,7 @@ def test_union_dataset_filter(tempdir, dstype): else: raise NotImplementedError - filtered_union_ds = ds.dataset((ds1, ds2)).filter( + filtered_union_ds = ds.dataset((ds1, ds2)).filter( # type: ignore[arg-type] (pc.field("colA") < 3) | (pc.field("colA") == 9) ) assert filtered_union_ds.to_table() == pa.table({ @@ -5564,7 +5593,7 @@ def test_union_dataset_filter(tempdir, dstype): filtered_ds2 = ds2.filter(pc.field("colA") < 10) with pytest.raises(ValueError, match="currently not supported"): - ds.dataset((filtered_ds1, filtered_ds2)) + ds.dataset((filtered_ds1, 
filtered_ds2)) # type: ignore[arg-type] def test_parquet_dataset_filter(tempdir): @@ -5665,8 +5694,9 @@ def test_dataset_partition_with_slash(tmpdir): assert dt_table == read_table.sort_by("exp_id") exp_meta = dt_table.column(1).to_pylist() - exp_meta = sorted(set(exp_meta)) # take unique - encoded_paths = ["exp_meta=" + quote(path, safe='') for path in exp_meta] + exp_meta = sorted(set(exp_meta), key=lambda x: ( + x is None, x)) # take unique, handle None + encoded_paths = ["exp_meta=" + quote(str(path), safe='') for path in exp_meta] file_paths = sorted(os.listdir(path)) assert encoded_paths == file_paths @@ -5749,6 +5779,7 @@ def test_write_dataset_write_page_index(tempdir): ) ds1 = ds.dataset(base_dir, format="parquet") + assert isinstance(ds1, ds.FileSystemDataset) for file in ds1.files: # Can retrieve sorting columns from metadata metadata = pq.read_metadata(file) @@ -5891,13 +5922,13 @@ def test_make_write_options_error(): "'pyarrow._dataset_parquet.ParquetFileFormat' objects " "doesn't apply to a 'int'") with pytest.raises(TypeError) as excinfo: - pa.dataset.ParquetFileFormat.make_write_options(43) + pa.dataset.ParquetFileFormat.make_write_options(43) # type: ignore assert msg_1 in str(excinfo.value) or msg_2 in str(excinfo.value) pformat = pa.dataset.ParquetFileFormat() msg = "make_write_options\\(\\) takes exactly 0 positional arguments" with pytest.raises(TypeError, match=msg): - pformat.make_write_options(43) + pformat.make_write_options(43) # type: ignore def test_scanner_from_substrait(dataset): @@ -5932,3 +5963,4 @@ def test_scanner_from_substrait(dataset): filter=ps.BoundExpressions.from_substrait(filtering) ).to_table() assert result.to_pydict() == {'str': ['4', '4']} +# Type stubs fixes applied diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index eb79121b1cd..4fe31956ff1 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -29,8 +29,8 @@ import pyarrow.parquet as pq import pyarrow.dataset as ds except ImportError: - pq = None - ds = None + pq = None # type: ignore[assignment] + ds = None # type: ignore[assignment] try: from pyarrow.tests.parquet.encryption import InMemoryKmsClient @@ -79,7 +79,7 @@ def create_encryption_config(): def create_decryption_config(): - return pe.DecryptionConfiguration(cache_lifetime=300) + return pe.DecryptionConfiguration(cache_lifetime=timedelta(seconds=300)) def create_kms_connection_config(): @@ -105,6 +105,8 @@ def test_dataset_encryption_decryption(): decryption_config = create_decryption_config() kms_connection_config = create_kms_connection_config() + assert ds is not None + assert pe is not None crypto_factory = pe.CryptoFactory(kms_factory) parquet_encryption_cfg = ds.ParquetEncryptionConfig( crypto_factory, kms_connection_config, encryption_config @@ -177,11 +179,12 @@ def test_large_row_encryption_decryption(): """Test encryption and decryption of a large number of rows.""" class NoOpKmsClient(pe.KmsClient): - def wrap_key(self, key_bytes: bytes, _: str) -> bytes: + def wrap_key(self, key_bytes: bytes, _: str) -> bytes: # type: ignore[override] b = base64.b64encode(key_bytes) return b - def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: + def unwrap_key(self, wrapped_key: bytes, _: str # type: ignore[override] + ) -> bytes: b = base64.b64decode(wrapped_key) return b @@ -202,10 +205,14 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: plaintext_footer=False, data_key_length_bits=128, ) 
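Aside: the encryption-test hunks here pair guarded imports (pq = None with a type: ignore[assignment]) with "assert <module> is not None" before first use. The sketch below, separate from the patch itself, shows that narrowing pattern in isolation; it only assumes that pyarrow.dataset may fail to import.

    from __future__ import annotations

    try:
        import pyarrow.dataset as ds
    except ImportError:
        ds = None  # type: ignore[assignment]  # same fallback style as the patch


    def test_needs_dataset_module() -> None:
        # mypy/pyright narrow ds from "module | None" to "module" after this
        # assert, so the attribute access below no longer reports an error.
        assert ds is not None
        assert hasattr(ds, "dataset")
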
+ assert ds is not None + assert pe is not None + assert pq is not None pqe_config = ds.ParquetEncryptionConfig( crypto_factory, kms_config, encryption_config ) pqd_config = ds.ParquetDecryptionConfig( + # type: ignore[arg-type] crypto_factory, kms_config, pe.DecryptionConfiguration() ) scan_options = ds.ParquetFragmentScanOptions(decryption_config=pqd_config) diff --git a/python/pyarrow/tests/test_device.py b/python/pyarrow/tests/test_device.py index dc1a51e6d00..00f8bbf720d 100644 --- a/python/pyarrow/tests/test_device.py +++ b/python/pyarrow/tests/test_device.py @@ -59,11 +59,15 @@ def test_copy_to(): batch_copied = batch.copy_to(dest) assert batch_copied.equals(batch) - assert batch_copied["col"].buffers()[1].device == mm.device - assert batch_copied["col"].buffers()[1].address != arr.buffers()[1].address + buffer = batch_copied.column("col").buffers()[1] + assert buffer is not None + assert buffer.device == mm.device + buffer_orig = arr.buffers()[1] + assert buffer_orig is not None + assert buffer.address != buffer_orig.address with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): - arr.copy_to(mm.device.device_type) + arr.copy_to(mm.device.device_type) # type: ignore[arg-type] with pytest.raises(TypeError, match="Argument 'destination' has incorrect type"): - batch.copy_to(mm.device.device_type) + batch.copy_to(mm.device.device_type) # type: ignore[arg-type] diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ebac37e862b..941e73c8167 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -22,12 +22,13 @@ import weakref from uuid import uuid4, UUID import sys +from typing import cast import pytest try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow.vendored.version import Version @@ -79,12 +80,14 @@ def __init__(self): def __arrow_ext_serialize__(self): # XXX pa.BaseExtensionType should expose C++ serialization method + assert isinstance(self.storage_type, IntegerType) return self.storage_type.__arrow_ext_serialize__() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): + assert isinstance(storage_type, IntegerType) deserialized_storage_type = storage_type.__arrow_ext_deserialize__( - serialized) + storage_type, serialized) assert deserialized_storage_type == storage_type return cls() @@ -160,7 +163,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class MyStructType(pa.ExtensionType): - storage_type = pa.struct([('left', pa.int64()), + storage_type = pa.struct([('left', pa.int64()), # type: ignore[assignment] ('right', pa.int64())]) def __init__(self): @@ -221,7 +224,7 @@ def __arrow_ext_serialize__(self): @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): assert serialized == b'' - return cls(storage_type) + return cls(storage_type, annotation=None) def ipc_write_batch(batch): @@ -432,8 +435,8 @@ def test_ext_array_wrap_array(): arr.validate(full=True) assert isinstance(arr, pa.ChunkedArray) assert arr.type == ty - assert arr.chunk(0).storage == storage.chunk(0) - assert arr.chunk(1).storage == storage.chunk(1) + assert arr.chunk(0).storage == storage.chunk(0) # type: ignore[union-attr] + assert arr.chunk(1).storage == storage.chunk(1) # type: ignore[union-attr] # Wrong storage type storage = pa.array([b"foo", b"bar", None]) @@ -442,7 +445,7 @@ def test_ext_array_wrap_array(): # Not an array or chunked array with pytest.raises(TypeError, 
match="Expected array or chunked array"): - ty.wrap_array(None) + ty.wrap_array(None) # type: ignore[arg-type] def test_ext_scalar_from_array(): @@ -876,7 +879,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): def __eq__(self, other): if isinstance(other, pa.BaseExtensionType): return (isinstance(self, type(other)) and - self.freq == other.freq) + self.freq == other.freq) # type: ignore[attr-defined] else: return NotImplemented @@ -902,7 +905,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): storage_type, serialized).freq return PeriodTypeWithToPandasDtype(freq) - def to_pandas_dtype(self): + def to_pandas_dtype(self): # type: ignore[override] import pandas as pd return pd.PeriodDtype(freq=self.freq) @@ -1033,7 +1036,7 @@ def test_generic_ext_array_pickling(registered_period_type, pickle_module): def test_generic_ext_type_register(registered_period_type): # test that trying to register other type does not segfault with pytest.raises(TypeError): - pa.register_extension_type(pa.string()) + pa.register_extension_type(pa.string()) # type: ignore[arg-type] # register second time raises KeyError period_type = PeriodType('D') @@ -1058,11 +1061,13 @@ def test_parquet_period(tmpdir, registered_period_type): # in the serialized arrow schema meta = pq.read_metadata(filename) assert meta.schema.column(0).physical_type == "INT64" + assert meta.metadata is not None assert b"ARROW:schema" in meta.metadata import base64 decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) - schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema)) + schema = pa.ipc.read_schema(pa.BufferReader( + decoded_schema)) # Since the type could be reconstructed, the extension type metadata is # absent. assert schema.field("ext").metadata == {} @@ -1434,6 +1439,7 @@ def test_tensor_class_methods(np_type_str): storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) + arr = cast(pa.FixedShapeTensorArray, arr) expected = np.array( [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str) @@ -1442,7 +1448,7 @@ def test_tensor_class_methods(np_type_str): np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str)) - result = arr[1:].to_numpy_ndarray() + result = arr[1:].to_numpy_ndarray() # type: ignore[union-attr] np.testing.assert_array_equal(result, expected) values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] @@ -1452,35 +1458,43 @@ def test_tensor_class_methods(np_type_str): tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = cast(pa.FixedShapeTensorArray, result) expected = np.array( [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=np.dtype(np_type_str) ) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - result = flat_arr.reshape(1, 2, 3, 2) + result_reshaped = flat_arr.reshape(1, 2, 3, 2) expected = np.array( [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=np.dtype(np_type_str) ) - np.testing.assert_array_equal(result, expected) + np.testing.assert_array_equal(result_reshaped, expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = cast(pa.FixedShapeTensorArray, result) expected = as_strided(flat_arr, shape=(1, 2, 3, 2), strides=(bw * 
12, bw * 6, bw, bw * 3)) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[2, 0, 1]) - result = pa.ExtensionArray.from_storage(tensor_type, storage) + result = pa.ExtensionArray.from_storage( + tensor_type, storage) # type: ignore[assignment] expected = as_strided(flat_arr, shape=(1, 3, 2, 2), strides=(bw * 12, bw, bw * 6, bw * 2)) - np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - - assert result.type.permutation == [2, 0, 1] - assert result.type.shape == [2, 2, 3] + np.testing.assert_array_equal( + result.to_numpy_ndarray(), expected) # type: ignore[union-attr] + + result_type = result.type + assert isinstance(result, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.permutation == [2, 0, 1] + assert result_type.shape == [2, 2, 3] assert result.to_tensor().shape == (1, 3, 2, 2) - assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) + assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, + 2 * bw) @pytest.mark.numpy @@ -1508,17 +1522,23 @@ def test_tensor_array_from_numpy(np_type_str): arr = flat_arr.reshape(1, 3, 4) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - assert tensor_array_from_numpy.type.shape == [3, 4] - assert tensor_array_from_numpy.type.permutation == [0, 1] - assert tensor_array_from_numpy.type.dim_names is None + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.shape == [3, 4] + assert result_type.permutation == [0, 1] + assert result_type.dim_names is None assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) arr = as_strided(flat_arr, shape=(1, 2, 3, 2), strides=(bw * 12, bw * 6, bw, bw * 3)) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - assert tensor_array_from_numpy.type.shape == [2, 2, 3] - assert tensor_array_from_numpy.type.permutation == [0, 2, 1] - assert tensor_array_from_numpy.type.dim_names is None + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.shape == [2, 2, 3] + assert result_type.permutation == [0, 2, 1] + assert result_type.dim_names is None assert tensor_array_from_numpy.to_tensor() == pa.Tensor.from_numpy(arr) arr = flat_arr.reshape(1, 2, 3, 2) @@ -1532,7 +1552,8 @@ def test_tensor_array_from_numpy(np_type_str): arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=np.dtype(np_type_str)) expected = arr[1:] - result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() + result = cast(pa.FixedShapeTensorArray, pa.FixedShapeTensorArray.from_numpy_ndarray( + arr)[1:]).to_numpy_ndarray() np.testing.assert_array_equal(result, expected) arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.dtype(np_type_str)) @@ -1559,22 +1580,27 @@ def test_tensor_array_from_numpy(np_type_str): dim_names = ["a", "b"] tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray( arr, dim_names=dim_names) - assert tensor_array_from_numpy.type.value_type == arrow_type - assert tensor_array_from_numpy.type.shape == [2, 3] - assert tensor_array_from_numpy.type.dim_names == dim_names + result_type = tensor_array_from_numpy.type + assert isinstance(tensor_array_from_numpy, 
pa.FixedShapeTensorArray) + assert isinstance(result_type, pa.FixedShapeTensorType) + assert result_type.value_type == arrow_type + assert result_type.shape == [2, 3] + assert result_type.dim_names == dim_names with pytest.raises(ValueError, match="The length of dim_names"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=['only_one']) with pytest.raises(TypeError, match="dim_names must be a tuple or list"): - pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=123) + pa.FixedShapeTensorArray.from_numpy_ndarray( + arr, dim_names=123) # type: ignore[arg-type] with pytest.raises(TypeError, match="dim_names must be a tuple or list"): pa.FixedShapeTensorArray.from_numpy_ndarray( - arr, dim_names=(x for x in range(2))) + arr, dim_names=(x for x in range(2))) # type: ignore[arg-type] with pytest.raises(TypeError, match="Each element of dim_names must be a string"): - pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=[0, 1]) + pa.FixedShapeTensorArray.from_numpy_ndarray( + arr, dim_names=[0, 1]) # type: ignore[arg-type] @pytest.mark.numpy @@ -1845,14 +1871,18 @@ def test_bool8_to_numpy_conversion(): assert np.array_equal(arr_to_np, np_arr_no_nulls) # same underlying buffer - assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + buffer = arr_no_nulls.buffers()[1] + assert buffer is not None + assert arr_to_np.ctypes.data == buffer.address # if the user requests a writable array, a copy should be performed arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) # different underlying buffer - assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + buffer = arr_no_nulls.buffers()[1] + assert buffer is not None + assert arr_to_np_writable.ctypes.data != buffer.address @pytest.mark.numpy @@ -1867,7 +1897,9 @@ def test_bool8_from_numpy_conversion(): assert arr_from_np == canonical_bool8_arr_no_nulls # same underlying buffer - assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data + buffer = arr_from_np.buffers()[1] + assert buffer is not None + assert buffer.address == np_arr_no_nulls.ctypes.data # conversion only valid for 1-D arrays with pytest.raises( @@ -1882,7 +1914,7 @@ def test_bool8_from_numpy_conversion(): ValueError, match="Cannot convert 0-D array to bool8 array", ): - pa.Bool8Array.from_numpy(np.bool_()) + pa.Bool8Array.from_numpy(np.bool_(False)) # type: ignore[arg-type] # must use compatible storage type with pytest.raises( diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 054bf920b26..a84b343b3dd 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -26,7 +26,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.strategies as past @@ -47,7 +47,7 @@ def datadir(base_datadir): def random_path(prefix='feather_'): - return tempfile.mktemp(prefix=prefix) + return tempfile.mktemp(prefix=prefix) # type: ignore[deprecated] @pytest.fixture(scope="module", params=[1, 2]) @@ -63,7 +63,7 @@ def compression(request): yield request.param -TEST_FILES = None +TEST_FILES: list[str] | None = None def setup_module(module): @@ -72,7 +72,7 @@ def setup_module(module): def teardown_module(module): - for path in TEST_FILES: + for path in TEST_FILES: # type: ignore[union-attr] try: os.remove(path) except os.error: @@ -95,6 +95,7 @@ def _check_pandas_roundtrip(df, expected=None, path=None, if version is None: 
version = 2 + assert TEST_FILES is not None TEST_FILES.append(path) write_feather(df, path, compression=compression, compression_level=compression_level, version=version) @@ -114,6 +115,7 @@ def _check_arrow_roundtrip(table, path=None, compression=None): if path is None: path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) write_feather(table, path, compression=compression) if not os.path.exists(path): @@ -126,10 +128,12 @@ def _check_arrow_roundtrip(table, path=None, compression=None): def _assert_error_on_write(df, exc, path=None, version=2): # check that we are raising the exception # on writing + assert version in (1, 2) if path is None: path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) def f(): @@ -149,6 +153,7 @@ def test_dataset(version): } table = pa.table(data) + assert TEST_FILES is not None TEST_FILES.extend(paths) for index, path in enumerate(paths): rows = ( @@ -156,7 +161,8 @@ def test_dataset(version): (index + 1) * (num_values[0] // num_files), ) - write_feather(table[rows[0]: rows[1]], path, version=version) + write_feather(table[rows[0]: rows[1]], path, + version=version) # type: ignore[arg-type] data = FeatherDataset(paths).read_table() assert data.equals(table) @@ -181,6 +187,7 @@ def test_read_table(version): num_values = (100, 100) path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) values = np.random.randint(0, 100, size=num_values) @@ -206,6 +213,7 @@ def test_use_threads(version): num_values = (10, 10) path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) values = np.random.randint(0, 10, size=num_values) @@ -231,6 +239,7 @@ def test_float_nulls(version): num_values = 100 path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) null_mask = np.random.randint(0, 10, size=num_values) < 3 @@ -292,6 +301,7 @@ def test_platform_numpy_integers(version): def test_integer_with_nulls(version): # pandas requires upcast to float dtype path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] @@ -330,6 +340,7 @@ def test_boolean_no_nulls(version): def test_boolean_nulls(version): # pandas requires upcast to object dtype path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) num_values = 100 @@ -348,6 +359,7 @@ def test_boolean_nulls(version): def test_buffer_bounds_error(version): # ARROW-1676 path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) for i in range(16, 256): @@ -360,6 +372,7 @@ def test_buffer_bounds_error(version): @pytest.mark.numpy def test_boolean_object_nulls(version): + assert np is not None repeats = 100 table = pa.Table.from_arrays( [np.array([False, None, True] * repeats, dtype=object)], @@ -426,7 +439,8 @@ def test_empty_strings(version): @pytest.mark.pandas def test_all_none(version): df = pd.DataFrame({'all_none': [None] * 10}) - if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype(): + if (version == 1 and pa.pandas_compat # type: ignore[attr-defined] + ._pandas_api.uses_string_dtype()): expected = df.astype("str") else: expected = df @@ -552,6 +566,7 @@ def test_read_columns(version): @pytest.mark.numpy def test_overwritten_file(version): path = random_path() + assert TEST_FILES is not None TEST_FILES.append(path) num_values = 100 @@ -585,12 +600,12 @@ def test_filelike_objects(version): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") 
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_sparse_dataframe(version): - if not pa.pandas_compat._pandas_api.has_sparse: + if not pa.pandas_compat._pandas_api.has_sparse: # type: ignore[attr-defined] pytest.skip("version of pandas does not support SparseDataFrame") # GH #221 data = {'A': [0, 1, 2], 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) + df = pd.DataFrame(data).to_sparse(fill_value=1) # type: ignore[attr-defined] expected = df.to_dense() _check_pandas_roundtrip(df, expected, version=version) @@ -692,8 +707,9 @@ def test_v2_lz4_default_compression(): if not pa.Codec.is_available('lz4_frame'): pytest.skip("LZ4 compression support is not built in C++") + assert np is not None # some highly compressible data - t = pa.table([np.repeat(0, 100000)], names=['f0']) + t = pa.table([np.repeat(0, 100000)], names=['f0']) # type: ignore[arg-type] buf = io.BytesIO() write_feather(t, buf) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index e5edc0eaa2c..50618a56615 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -28,19 +28,21 @@ import traceback import json from datetime import datetime +from typing import Any try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa from pyarrow.lib import IpcReadOptions, ReadStats, tobytes from pyarrow.util import find_free_port from pyarrow.tests import util +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: from pyarrow import flight from pyarrow.flight import ( FlightClient, FlightServerBase, @@ -49,13 +51,26 @@ ClientMiddleware, ClientMiddlewareFactory, FlightCallOptions, ) -except ImportError: - flight = None - FlightClient, FlightServerBase = object, object - ServerAuthHandler, ClientAuthHandler = object, object - ServerMiddleware, ServerMiddlewareFactory = object, object - ClientMiddleware, ClientMiddlewareFactory = object, object - FlightCallOptions = object +else: + try: + from pyarrow import flight + from pyarrow.flight import ( + FlightClient, FlightServerBase, + ServerAuthHandler, ClientAuthHandler, + ServerMiddleware, ServerMiddlewareFactory, + ClientMiddleware, ClientMiddlewareFactory, + FlightCallOptions, + ) + except ImportError: + flight = None # type: ignore[assignment] + FlightClient, FlightServerBase = object, object + ServerAuthHandler, ClientAuthHandler = ( # type: ignore[misc] + object, object) # type: ignore[assignment] + ServerMiddleware, ServerMiddlewareFactory = ( # type: ignore[misc] + object, object) # type: ignore[assignment] + ClientMiddleware, ClientMiddlewareFactory = ( # type: ignore[misc] + object, object) # type: ignore[assignment] + # FlightCallOptions = object # type: ignore[assignment, misc] # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not flight' @@ -196,7 +211,7 @@ def do_put(self, context, descriptor, reader, writer): assert buf is not None client_counter, = struct.unpack(' 0 key = 'voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/part-0.parquet' with fs.open_input_stream(key) as f: @@ -1931,6 +1935,8 @@ def test_s3_real_aws_region_selection(): # Taken from a registry of open S3-hosted datasets # at https://github.com/awslabs/open-data-registry fs, path = FileSystem.from_uri('s3://mf-nwp-models/README.txt') + from pyarrow.fs import S3FileSystem + assert isinstance(fs, S3FileSystem) assert fs.region == 'eu-west-1' with fs.open_input_stream(path) as f: assert b"Meteo-France Atmospheric models on AWS" in f.read(50) @@ -1938,6 +1944,8 @@ def test_s3_real_aws_region_selection(): # Passing an explicit region disables auto-selection fs, path = FileSystem.from_uri( 's3://mf-nwp-models/README.txt?region=us-east-2') + from pyarrow.fs import S3FileSystem + assert isinstance(fs, S3FileSystem) assert fs.region == 'us-east-2' # Reading from the wrong region may still work for public buckets... @@ -1948,6 +1956,8 @@ def test_s3_real_aws_region_selection(): with pytest.raises(IOError, match="Bucket '.*' not found"): FileSystem.from_uri('s3://x-arrow..nonexistent-bucket') fs, path = FileSystem.from_uri('s3://x-arrow-nonexistent-bucket?region=us-east-3') + from pyarrow.fs import S3FileSystem + assert isinstance(fs, S3FileSystem) assert fs.region == 'us-east-3' # allow_delayed_open has a side-effect of delaying errors until I/O is performed. @@ -2189,13 +2199,16 @@ def test_uwsgi_integration(): def test_fsspec_filesystem_from_uri(): try: - from fsspec.implementations.local import LocalFileSystem - from fsspec.implementations.memory import MemoryFileSystem + from fsspec.implementations.local import ( # type: ignore[import-untyped] + LocalFileSystem) + from fsspec.implementations.memory import ( # type: ignore[import-untyped] + MemoryFileSystem) except ImportError: pytest.skip("fsspec not installed") fs, path = FileSystem.from_uri("fsspec+memory://path/to/data.parquet") - expected_fs = PyFileSystem(FSSpecHandler(MemoryFileSystem())) + expected_fs = PyFileSystem(FSSpecHandler( + MemoryFileSystem())) # type: ignore[abstract] assert fs == expected_fs assert path == "/path/to/data.parquet" @@ -2203,7 +2216,8 @@ def test_fsspec_filesystem_from_uri(): # arrow local filesystem uri = "file:///tmp/my.file" fs, _ = FileSystem.from_uri(f"fsspec+{uri}") - expected_fs = PyFileSystem(FSSpecHandler(LocalFileSystem())) + expected_fs = PyFileSystem(FSSpecHandler( + LocalFileSystem())) # type: ignore[abstract] assert fs == expected_fs @@ -2213,7 +2227,7 @@ def test_fsspec_delete_root_dir_contents(): except ImportError: pytest.skip("fsspec not installed") - fs = FSSpecHandler(MemoryFileSystem()) + fs = FSSpecHandler(MemoryFileSystem()) # type: ignore[abstract] # Create some files and directories fs.create_dir("test_dir", recursive=True) @@ -2227,7 +2241,7 @@ def test_fsspec_delete_root_dir_contents(): # Verify files exist before deletion def get_type(path): - return fs.get_file_info([path])[0].type + return cast(list[FileInfo], fs.get_file_info([path]))[0].type assert get_type("test_file.txt") == FileType.File assert get_type("test_dir") == FileType.Directory @@ -2245,13 +2259,13 @@ def get_type(path): def test_huggingface_filesystem_from_uri(): pytest.importorskip("fsspec") try: - from huggingface_hub import HfFileSystem + from huggingface_hub import HfFileSystem # type: ignore[import-not-found] except ImportError: pytest.skip("huggingface_hub not 
installed") fs, path = FileSystem.from_uri( "hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet" ) - expected_fs = PyFileSystem(FSSpecHandler(HfFileSystem())) + expected_fs = PyFileSystem(FSSpecHandler(HfFileSystem())) # type: ignore[abstract] assert fs == expected_fs assert path == "datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet" diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py index 80d119a4853..01fc6f032d5 100644 --- a/python/pyarrow/tests/test_gandiva.py +++ b/python/pyarrow/tests/test_gandiva.py @@ -174,9 +174,12 @@ def test_in_expr_todo(): assert result.to_array().equals(pa.array([1, 2], type=pa.uint32())) # timestamp - datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877) - datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877) - datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877) + datetime_1 = datetime.datetime.fromtimestamp( + 1542238951.621877, tz=datetime.timezone.utc) + datetime_2 = datetime.datetime.fromtimestamp( + 1542238911.621877, tz=datetime.timezone.utc) + datetime_3 = datetime.datetime.fromtimestamp( + 1542238051.621877, tz=datetime.timezone.utc) arr = pa.array([datetime_1, datetime_2, datetime_3]) table = pa.Table.from_arrays([arr], ["a"]) diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 912953ae60d..50d81b686ac 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -101,6 +101,8 @@ def wait_until_ready(self): Record output until the gdb prompt displays. Return recorded output. """ # TODO: add timeout? + assert self.proc is not None + assert self.proc.stdout is not None while (not self.last_stdout_line.startswith(b"(gdb) ") and self.proc.poll() is None): block = self.proc.stdout.read(4096) @@ -125,6 +127,8 @@ def wait_until_ready(self): return out def issue_command(self, line): + assert self.proc is not None + assert self.proc.stdin is not None line = line.encode('utf-8') + b"\n" if self.verbose: sys.stdout.buffer.write(line) @@ -158,6 +162,7 @@ def select_frame(self, func_name): m = re.search(pat, out) if m is None: pytest.fail(f"Could not select frame for function {func_name}") + return # Never reached, but helps type checker frame_num = int(m[1]) out = self.run_command(f"frame {frame_num}") @@ -165,6 +170,8 @@ def select_frame(self, func_name): def join(self): if self.proc is not None: + assert self.proc.stdin is not None + assert self.proc.stdout is not None self.proc.stdin.close() self.proc.stdout.close() # avoid ResourceWarning self.proc.kill() diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index a6d3546e57c..3837b553b8b 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -24,16 +24,17 @@ import math import os import pathlib -import pytest +import pytest # type: ignore[import-not-found] import random import sys import tempfile +from typing import cast import weakref try: import numpy as np except ImportError: - np = None + pass from pyarrow.util import guid from pyarrow import Codec @@ -44,7 +45,7 @@ def check_large_seeks(file_factory, expected_error=None): if sys.platform in ('win32', 'darwin', 'emscripten'): pytest.skip("need sparse file support") try: - filename = tempfile.mktemp(prefix='test_io') + filename = tempfile.mkstemp(prefix='test_io')[1] with open(filename, 'wb') as f: f.truncate(2 ** 32 + 10) f.seek(2 ** 32 + 5) @@ -234,7 +235,7 @@ def read_buffer(self, nbytes): return memoryview(dst_buf)[:nbytes] 
duck_reader = DuckReader() - with pa.PythonFile(duck_reader, mode='r') as f: + with pa.PythonFile(duck_reader, mode='r') as f: # type: ignore[arg-type] buf = f.read_buffer(length) assert len(buf) == length assert memoryview(buf).tobytes() == dst_buf[:length] @@ -474,7 +475,7 @@ def test_buffer_to_numpy(): byte_array = bytearray(20) byte_array[0] = 42 buf = pa.py_buffer(byte_array) - array = np.frombuffer(buf, dtype="uint8") + array = np.frombuffer(buf, dtype="uint8") # type: ignore[arg-type] assert array[0] == byte_array[0] byte_array[0] += 1 assert array[0] == byte_array[0] @@ -557,7 +558,7 @@ def test_buffer_eq_bytes(): assert buf != b'some dat1' with pytest.raises(TypeError): - buf == 'some data' + _ = buf == 'some data' def test_buffer_getitem(): @@ -598,22 +599,22 @@ def test_buffer_slicing(): with pytest.raises(IndexError): buf.slice(len(buf) + 1) - assert buf[11:].to_pybytes() == b"" + assert cast(pa.Buffer, buf[11:]).to_pybytes() == b"" # Slice stop exceeds buffer length with pytest.raises(IndexError): buf.slice(1, len(buf)) - assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:] + assert cast(pa.Buffer, buf[1:11]).to_pybytes() == buf.to_pybytes()[1:] # Negative length with pytest.raises(IndexError): buf.slice(1, -1) # Test slice notation - assert buf[2:].equals(buf.slice(2)) - assert buf[2:5].equals(buf.slice(2, 3)) - assert buf[-5:].equals(buf.slice(len(buf) - 5)) - assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3)) + assert cast(pa.Buffer, buf[2:]).equals(buf.slice(2)) + assert cast(pa.Buffer, buf[2:5]).equals(buf.slice(2, 3)) + assert cast(pa.Buffer, buf[-5:]).equals(buf.slice(len(buf) - 5)) + assert cast(pa.Buffer, buf[-5:-2]).equals(buf.slice(len(buf) - 5, 3)) with pytest.raises(IndexError): buf[::-1] @@ -623,7 +624,8 @@ def test_buffer_slicing(): n = len(buf) for start in range(-n * 2, n * 2): for stop in range(-n * 2, n * 2): - assert buf[start:stop].to_pybytes() == buf.to_pybytes()[start:stop] + assert cast(pa.Buffer, buf[start:stop]).to_pybytes( + ) == buf.to_pybytes()[start:stop] def test_buffer_hashing(): @@ -640,7 +642,7 @@ def test_buffer_protocol_respects_immutability(): # immutable a = b'12345' arrow_ref = pa.py_buffer(a) - numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) + numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8) # type: ignore[arg-type] assert not numpy_ref.flags.writeable @@ -652,7 +654,8 @@ def test_foreign_buffer(): buf = pa.foreign_buffer(addr, size, obj) wr = weakref.ref(obj) del obj - assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2] + assert (np.frombuffer(buf, dtype=np.int32).tolist() # type: ignore[arg-type] + == [1, 2]) assert wr() is not None del buf assert wr() is None @@ -688,6 +691,7 @@ def test_non_cpu_buffer(pickle_module): cuda_buf = ctx.buffer_from_data(data) arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None, cuda_buf]) buf_on_gpu = arr.buffers()[1] + assert buf_on_gpu is not None assert buf_on_gpu.size == cuda_buf.size assert buf_on_gpu.address == cuda_buf.address @@ -708,7 +712,7 @@ def test_non_cpu_buffer(pickle_module): assert cuda_sliced.to_pybytes() == b'st' # Sliced buffers with same address - assert buf_on_gpu_sliced.equals(cuda_buf[2:4]) + assert cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf[2:4]) # Buffers on different devices msg_device = "Device on which the data resides differs between buffers" @@ -720,13 +724,14 @@ def test_non_cpu_buffer(pickle_module): arr_short = np.array([b'sting']) cuda_buf_short = ctx.buffer_from_data(arr_short) with pytest.raises(NotImplementedError, 
match=msg): - buf_on_gpu_sliced.equals(cuda_buf_short) + cast(pa.Buffer, buf_on_gpu_sliced).equals(cuda_buf_short) arr_short = pa.FixedSizeBinaryArray.from_buffers( pa.binary(5), 1, [None, cuda_buf_short] ) buf_on_gpu_short = arr_short.buffers()[1] + assert buf_on_gpu_short is not None with pytest.raises(NotImplementedError, match=msg): - buf_on_gpu_sliced.equals(buf_on_gpu_short) + cast(pa.Buffer, buf_on_gpu_sliced).equals(buf_on_gpu_short) with pytest.raises(NotImplementedError, match=msg): buf_on_gpu.hex() @@ -811,8 +816,9 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -843,6 +849,7 @@ def test_compress_decompress(compression): assert isinstance(decompressed_bytes, bytes) + assert isinstance(decompressed_buf, pa.Buffer) assert decompressed_buf.equals(test_buf) assert decompressed_bytes == test_data @@ -852,8 +859,9 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ - pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("bz2", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ), "brotli", "gzip", @@ -910,6 +918,7 @@ def test_compression_level(compression): assert isinstance(decompressed_bytes, bytes) + assert isinstance(decompressed_buf, pa.Buffer) assert decompressed_buf.equals(test_buf) assert decompressed_bytes == test_data @@ -951,12 +960,12 @@ def test_buffer_memoryview_is_immutable(): assert result.readonly with pytest.raises(TypeError) as exc: - result[0] = b'h' + result[0] = b'h' # type: ignore[index] assert 'cannot modify read-only' in str(exc.value) b = bytes(buf) with pytest.raises(TypeError) as exc: - b[0] = b'h' + b[0] = b'h' # type: ignore[index] assert 'cannot modify read-only' in str(exc.value) @@ -1748,9 +1757,9 @@ def test_unknown_compression_raises(): "gzip", "lz4", "zstd", - pytest.param( - "snappy", - marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + pytest.param("snappy", marks=pytest.mark.xfail( + raises=pa.lib.ArrowNotImplementedError # type: ignore[attr-defined] + ) ) ]) def test_compressed_roundtrip(compression): @@ -2021,7 +2030,7 @@ def test_input_stream_native_file(): def test_input_stream_errors(tmpdir): buf = memoryview(b"") with pytest.raises(ValueError): - pa.input_stream(buf, compression="foo") + pa.input_stream(buf, compression="foo") # type: ignore[reportArgumentType] for arg in [bytearray(), StringIO()]: with pytest.raises(TypeError): @@ -2198,7 +2207,7 @@ def check_data(data, **kwargs): def test_output_stream_errors(tmpdir): buf = memoryview(bytearray()) with pytest.raises(ValueError): - pa.output_stream(buf, compression="foo") + pa.output_stream(buf, compression="foo") # type: ignore[reportArgumentType] for arg in [bytearray(), StringIO()]: with pytest.raises(TypeError): diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index b4db9cd0875..3ebdbcc046f 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -24,23 +24,27 @@ import socket import threading import weakref +from typing import TYPE_CHECKING, cast -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None + import pandas as pd + from pandas.testing 
import assert_frame_equal +else: + try: + import numpy as np + except ImportError: + pass + try: + from pandas.testing import assert_frame_equal + import pandas as pd + except ImportError: + pass import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script -try: - from pandas.testing import assert_frame_equal - import pandas as pd -except ImportError: - pass - - class IpcFixture: write_stats = None @@ -48,6 +52,9 @@ def __init__(self, sink_factory=lambda: io.BytesIO()): self._sink_factory = sink_factory self.sink = self.get_sink() + def _get_writer(self, sink, schema): + ... # Implemented in subclasses + def get_sink(self): return self._sink_factory() @@ -59,6 +66,7 @@ def write_batches(self, num_batches=5, as_table=False): schema = pa.schema([('one', pa.float64()), ('two', pa.utf8())]) writer = self._get_writer(self.sink, schema) + assert writer is not None batches = [] for i in range(num_batches): @@ -929,7 +937,7 @@ def test_ipc_file_stream_has_eos(): buffer = sink.getvalue() # skip the file magic - reader = pa.ipc.open_stream(buffer[8:]) + reader = pa.ipc.open_stream(cast(pa.Buffer, buffer[8:])) # will fail if encounters footer data instead of eos rdf = reader.read_pandas() diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index c3f9fe333bd..c0b6b8ecd0d 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -23,11 +23,16 @@ import json import string import unittest +from typing import TYPE_CHECKING -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None +else: + try: + import numpy as np + except ImportError: + pass + import pytest import pyarrow as pa @@ -317,6 +322,9 @@ def test_stress_block_sizes(self): class BaseTestJSONRead(BaseTestJSON): + def read_json(self, *args, **kwargs) -> pa.Table: # type: ignore[empty-body] + ... 
# Implemented in subclasses + def read_bytes(self, b, **kwargs): return self.read_json(pa.py_buffer(b), **kwargs) @@ -352,6 +360,8 @@ def test_reconcile_across_blocks(self): class BaseTestStreamingJSONRead(BaseTestJSON): + use_threads: bool = False # Set by subclasses + def open_json(self, json, *args, **kwargs): """ Reads the JSON file into memory using pyarrow's open_json diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index d2ba780efc7..b5d4e74f126 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -38,11 +38,13 @@ def root_allocator(): arrow_dir = os.path.join(os.path.dirname(__file__), '..', '..', '..') pom_path = os.path.join(arrow_dir, 'java', 'pom.xml') tree = ET.parse(pom_path) - version = tree.getroot().find( + version_element = tree.getroot().find( 'POM:version', namespaces={ 'POM': 'http://maven.apache.org/POM/4.0.0' - }).text + }) + assert version_element is not None + version = version_element.text jar_path = os.path.join( arrow_dir, 'java', 'tools', 'target', f'arrow-tools-{version}-jar-with-dependencies.jar') @@ -76,8 +78,8 @@ def test_jvm_buffer(root_allocator): def test_jvm_buffer_released(root_allocator): - import jpype.imports # noqa - from java.lang import IllegalArgumentException + import jpype.imports # type: ignore[import-untyped, import-not-found] # noqa + from java.lang import IllegalArgumentException # type: ignore[import-not-found] jvm_buffer = root_allocator.buffer(8) jvm_buffer.release() diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 706fb3fe45c..f1c8765d5eb 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -77,7 +77,7 @@ def fix_example_values(actual_cols, expected_cols): if not pd.isnull(v): exp = d.as_tuple().exponent factor = 10 ** -exp - converted_decimals[i] = ( + converted_decimals[i] = ( # type: ignore[call-overload,assignment] decimal.Decimal(round(v * factor)).scaleb(exp)) expected = pd.Series(converted_decimals) @@ -308,7 +308,7 @@ def test_buffer_readwrite(): # deprecated keyword order buffer_output_stream = pa.BufferOutputStream() with pytest.warns(FutureWarning): - orc.write_table(buffer_output_stream, table) + orc.write_table(buffer_output_stream, table) # type: ignore[arg-type] buffer_reader = pa.BufferReader(buffer_output_stream.getvalue()) orc_file = orc.ORCFile(buffer_reader) output_table = orc_file.read() @@ -350,8 +350,8 @@ def test_buffer_readwrite_with_writeoptions(): buffer_output_stream = pa.BufferOutputStream() with pytest.warns(FutureWarning): orc.write_table( - buffer_output_stream, - table, + buffer_output_stream, # type: ignore[reportArgumentType] + table, # type: ignore[reportArgumentType] compression='uncompressed', file_version='0.11', row_index_stride=20000, @@ -438,20 +438,20 @@ def test_buffer_readwrite_with_bad_writeoptions(): orc.write_table( table, buffer_output_stream, - compression=0, + compression=0, # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression='none', + compression='none', # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression='zlid', + compression='zlid', # type: ignore[reportArgumentType] ) # compression_block_size must be a positive integer @@ -481,20 +481,20 @@ def test_buffer_readwrite_with_bad_writeoptions(): orc.write_table( table, buffer_output_stream, - compression_strategy=0, + compression_strategy=0, # 
type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression_strategy='no', + compression_strategy='no', # type: ignore[reportArgumentType] ) with pytest.raises(ValueError): orc.write_table( table, buffer_output_stream, - compression_strategy='large', + compression_strategy='large', # type: ignore[reportArgumentType] ) # row_index_stride must be a positive integer diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index ceea2527da0..91f8d4aaab3 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -28,37 +28,34 @@ import hypothesis as h import hypothesis.strategies as st import pytest -try: - import numpy as np - import numpy.testing as npt - try: - _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning - except AttributeError: - from numpy.exceptions import ( - VisibleDeprecationWarning as _np_VisibleDeprecationWarning - ) -except ImportError: - np = None +import pyarrow as pa from pyarrow.pandas_compat import get_logical_type, _pandas_api from pyarrow.tests.util import invoke_script, random_ascii, rands import pyarrow.tests.strategies as past import pyarrow.tests.util as test_util from pyarrow.vendored.version import Version -import pyarrow as pa try: from pyarrow import parquet as pq except ImportError: pass -try: - import pandas as pd - import pandas.testing as tm - from .pandas_examples import dataframe_with_arrays, dataframe_with_lists -except ImportError: - pass +pd = pytest.importorskip("pandas") +np = pytest.importorskip("numpy") + +import numpy.testing as npt # noqa: E402 +import pandas.testing as tm # noqa: E402 +from .pandas_examples import dataframe_with_arrays, dataframe_with_lists # noqa: E402 +try: + _np_VisibleDeprecationWarning = ( + np.VisibleDeprecationWarning # type: ignore[attr-defined] + ) +except AttributeError: + from numpy.exceptions import ( + VisibleDeprecationWarning as _np_VisibleDeprecationWarning + ) # Marks all of the tests in this module pytestmark = pytest.mark.pandas @@ -98,7 +95,7 @@ def _alltypes_example(size=100): def _check_pandas_roundtrip(df, expected=None, use_threads=False, expected_schema=None, check_dtype=True, schema=None, - preserve_index=False, + preserve_index: bool | None = False, as_batch=False): klass = pa.RecordBatch if as_batch else pa.Table table = klass.from_pandas(df, schema=schema, @@ -706,7 +703,7 @@ def test_mismatch_metadata_schema(self): # OPTION 1: casting after conversion table = pa.Table.from_pandas(df) # cast the "datetime" column to be tz-aware - new_col = table["datetime"].cast(pa.timestamp('ns', tz="UTC")) + new_col = table.column(0).cast(pa.timestamp('ns', tz="UTC")) new_table1 = table.set_column( 0, pa.field("datetime", new_col.type), new_col ) @@ -974,7 +971,7 @@ def test_float_with_null_as_integer(self): schema = pa.schema([pa.field('has_nulls', ty)]) result = pa.Table.from_pandas(df, schema=schema, preserve_index=False) - assert result[0].chunk(0).equals(expected) + assert result.column(0).chunk(0).equals(expected) def test_int_object_nulls(self): arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) @@ -1136,7 +1133,7 @@ def test_python_datetime(self): }) table = pa.Table.from_pandas(df) - assert isinstance(table[0].chunk(0), pa.TimestampArray) + assert isinstance(table.column(0).chunk(0), pa.TimestampArray) result = table.to_pandas() # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units @@ -1193,7 +1190,7 @@ class MyDatetime(datetime): df = 
pd.DataFrame({"datetime": pd.Series(date_array, dtype=object)}) table = pa.Table.from_pandas(df) - assert isinstance(table[0].chunk(0), pa.TimestampArray) + assert isinstance(table.column(0).chunk(0), pa.TimestampArray) result = table.to_pandas() @@ -1217,7 +1214,7 @@ class MyDate(date): df = pd.DataFrame({"date": pd.Series(date_array, dtype=object)}) table = pa.Table.from_pandas(df) - assert isinstance(table[0].chunk(0), pa.Date32Array) + assert isinstance(table.column(0).chunk(0), pa.Date32Array) result = table.to_pandas() expected_df = pd.DataFrame( @@ -1729,7 +1726,7 @@ def test_bytes_to_binary(self): df = pd.DataFrame({'strings': values}) table = pa.Table.from_pandas(df) - assert table[0].type == pa.binary() + assert table.column(0).type == pa.binary() values2 = [b'qux', b'foo', None, b'barz', b'qux', None] expected = pd.DataFrame({'strings': values2}) @@ -1750,7 +1747,7 @@ def test_bytes_exceed_2gb(self): arr = None table = pa.Table.from_pandas(df) - assert table[0].num_chunks == 2 + assert table.column(0).num_chunks == 2 @pytest.mark.large_memory @pytest.mark.parametrize('char', ['x', b'x']) @@ -1892,13 +1889,13 @@ def test_table_str_to_categorical_without_na(self, string_type): zero_copy_only=True) # chunked array - result = table["strings"].to_pandas(strings_to_categorical=True) + result = table.column("strings").to_pandas(strings_to_categorical=True) expected = pd.Series(pd.Categorical(values), name="strings") tm.assert_series_equal(result, expected) with pytest.raises(pa.ArrowInvalid): - table["strings"].to_pandas(strings_to_categorical=True, - zero_copy_only=True) + table.column("strings").to_pandas(strings_to_categorical=True, + zero_copy_only=True) @pytest.mark.parametrize( "string_type", [pa.string(), pa.large_string(), pa.string_view()] @@ -1919,13 +1916,13 @@ def test_table_str_to_categorical_with_na(self, string_type): zero_copy_only=True) # chunked array - result = table["strings"].to_pandas(strings_to_categorical=True) + result = table.column("strings").to_pandas(strings_to_categorical=True) expected = pd.Series(pd.Categorical(values), name="strings") tm.assert_series_equal(result, expected) with pytest.raises(pa.ArrowInvalid): - table["strings"].to_pandas(strings_to_categorical=True, - zero_copy_only=True) + table.column("strings").to_pandas(strings_to_categorical=True, + zero_copy_only=True) # Regression test for ARROW-2101 def test_array_of_bytes_to_strings(self): @@ -2507,7 +2504,7 @@ def test_auto_chunking_on_list_overflow(self): table = pa.Table.from_pandas(df) table.validate(full=True) - column_a = table[0] + column_a = table.column(0) assert column_a.num_chunks == 2 assert len(column_a.chunk(0)) == 2**21 - 1 assert len(column_a.chunk(1)) == 1 @@ -3767,8 +3764,8 @@ def test_recordbatchlist_to_pandas(): def test_recordbatch_table_pass_name_to_pandas(): rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0']) t = pa.table([pa.array([1, 2, 3, 4])], names=['a0']) - assert rb[0].to_pandas().name == 'a0' - assert t[0].to_pandas().name == 'a0' + assert rb.column(0).to_pandas().name == 'a0' + assert t.column(0).to_pandas().name == 'a0' # ---------------------------------------------------------------------- @@ -4306,13 +4303,13 @@ def test_array_protocol(): # default conversion result = pa.table(df) expected = pa.array([1, 2, None], pa.int64()) - assert result[0].chunk(0).equals(expected) + assert result.column(0).chunk(0).equals(expected) # with specifying schema schema = pa.schema([('a', pa.float64())]) result = pa.table(df, schema=schema) expected2 = pa.array([1, 
2, None], pa.float64()) - assert result[0].chunk(0).equals(expected2) + assert result.column(0).chunk(0).equals(expected2) # pass Series to pa.array result = pa.array(df['a']) @@ -4442,7 +4439,7 @@ def __init__(self): def __arrow_ext_serialize__(self): return b'' - def to_pandas_dtype(self): + def to_pandas_dtype(self): # type: ignore[override] return pd.Int64Dtype() @@ -4542,7 +4539,7 @@ def test_array_to_pandas(): expected = pd.Series(arr) tm.assert_series_equal(result, expected) - result = pa.table({"col": arr})["col"].to_pandas() + result = pa.table({"col": arr}).column("col").to_pandas() expected = pd.Series(arr, name="col") tm.assert_series_equal(result, expected) @@ -4601,7 +4598,6 @@ def test_array_to_pandas_types_mapper(): assert result.dtype == np.dtype("int64") -@pytest.mark.pandas def test_chunked_array_to_pandas_types_mapper(): # https://issues.apache.org/jira/browse/ARROW-9664 if Version(pd.__version__) < Version("1.2.0"): @@ -5092,7 +5088,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced(): ty = pa.list_(pa.map_(pa.string(), pa.list_(pa.string()))) - def assert_roundtrip(series: pd.Series, data) -> None: + def assert_roundtrip(series, data): array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty)) array_roundtrip.validate(full=True) assert data.equals(array_roundtrip) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 65f0c608136..70afd19e54f 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -20,11 +20,12 @@ import pytest import weakref from collections.abc import Sequence, Mapping +from typing import cast try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.compute as pc @@ -68,7 +69,7 @@ pa.Time32Scalar), (datetime.datetime.now().time(), None, pa.Time64Scalar), (datetime.timedelta(days=1), None, pa.DurationScalar), - (pa.MonthDayNano([1, -1, -10100]), None, + (pa.MonthDayNano([1, -1, -10100]), None, # type: ignore[call-arg, arg-type] pa.MonthDayNanoIntervalScalar), ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar), @@ -360,7 +361,8 @@ def test_time_from_datetime_time(): def test_temporal_values(value, time_type: pa.DataType): time_scalar = pa.scalar(value, type=time_type) time_scalar.validate(full=True) - assert time_scalar.value == value + assert (time_scalar.value # type: ignore[union-attr, reportAttributeAccessIssue] + == value) def test_cast(): @@ -422,7 +424,7 @@ def test_timestamp(): expected = pd.Timestamp('2000-01-01 12:34:56') assert arrow_arr[0].as_py() == expected - assert arrow_arr[0].value * 1000**i == expected.value + assert cast(pa.TimestampScalar, arrow_arr[0]).value * 1000**i == expected.value tz = 'America/New_York' arrow_type = pa.timestamp(unit, tz=tz) @@ -434,7 +436,7 @@ def test_timestamp(): .tz_convert(tz)) assert arrow_arr[0].as_py() == expected - assert arrow_arr[0].value * 1000**i == expected.value + assert cast(pa.TimestampScalar, arrow_arr[0]).value * 1000**i == expected.value @pytest.mark.nopandas @@ -577,7 +579,7 @@ def test_binary(value, ty, scalar_typ): with pytest.raises(ValueError): memoryview(s) else: - assert buf.to_pybytes() == value + assert buf.to_pybytes() == value # type: ignore[union-attr] assert isinstance(buf, pa.Buffer) assert bytes(s) == value @@ -852,7 +854,7 @@ def test_dictionary(pickle_module): assert arr.to_pylist() == expected for j, (i, v) in enumerate(zip(indices, expected)): - s = arr[j] + s 
= cast(pa.DictionaryScalar, arr[j]) assert s.as_py() == v assert s.value.as_py() == v @@ -868,14 +870,14 @@ def test_run_end_encoded(): values = [1, 2, 1, None, 3] arr = pa.RunEndEncodedArray.from_arrays(run_ends, values) - scalar = arr[0] + scalar = cast(pa.RunEndEncodedScalar, arr[0]) assert isinstance(scalar, pa.RunEndEncodedScalar) assert isinstance(scalar.value, pa.Int64Scalar) assert scalar.value == pa.array(values)[0] assert scalar.as_py() == 1 # null -> .value is still a scalar, as_py returns None - scalar = arr[10] + scalar = cast(pa.RunEndEncodedScalar, arr[10]) assert isinstance(scalar.value, pa.Int64Scalar) assert scalar.as_py() is None @@ -901,13 +903,13 @@ def test_union(pickle_module): with pytest.raises(pa.ArrowNotImplementedError): pickle_module.loads(pickle_module.dumps(s)) - assert arr[0].type_code == 0 + assert cast(pa.UnionScalar, arr[0]).type_code == 0 assert arr[0].as_py() == "a" - assert arr[1].type_code == 0 + assert cast(pa.UnionScalar, arr[1]).type_code == 0 assert arr[1].as_py() == "b" - assert arr[2].type_code == 1 + assert cast(pa.UnionScalar, arr[2]).type_code == 1 assert arr[2].as_py() == 3 - assert arr[3].type_code == 1 + assert cast(pa.UnionScalar, arr[3]).type_code == 1 assert arr[3].as_py() == 4 # dense @@ -927,9 +929,9 @@ def test_union(pickle_module): with pytest.raises(pa.ArrowNotImplementedError): pickle_module.loads(pickle_module.dumps(s)) - assert arr[0].type_code == 0 + assert cast(pa.UnionScalar, arr[0]).type_code == 0 assert arr[0].as_py() == b'a' - assert arr[5].type_code == 1 + assert cast(pa.UnionScalar, arr[5]).type_code == 1 assert arr[5].as_py() == 3 diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 029e14ca162..4f2b8ac749d 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util @@ -259,7 +259,7 @@ def test_schema(): child 0, item: int8""" with pytest.raises(TypeError): - pa.schema([None]) + pa.schema([None]) # type: ignore[list-item] def test_schema_weakref(): @@ -594,7 +594,7 @@ def test_schema_get_fields(): with pytest.raises(KeyError): schema.field('other') with pytest.raises(TypeError): - schema.field(0.0) + schema.field(0.0) # type: ignore[arg-type] with pytest.raises(IndexError): schema.field(4) @@ -706,6 +706,7 @@ def test_empty_table(): assert table.schema == schema +@pytest.mark.numpy @pytest.mark.pandas def test_schema_from_pandas(): import pandas as pd @@ -782,7 +783,7 @@ def test_schema_merge(): # raise proper error when passing a non-Schema value with pytest.raises(TypeError): - pa.unify_schemas([a, 1]) + pa.unify_schemas([a, 1]) # type: ignore[list-item] def test_undecodable_metadata(): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index eca8090d77a..2ce48b651b1 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -26,15 +26,16 @@ import pyarrow as pa try: - from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix + from scipy.sparse import ( # type: ignore[reportMissingModuleSource] + csr_array, coo_array, csr_matrix, coo_matrix) except ImportError: - coo_matrix = None - csr_matrix = None - csr_array = None - coo_array = None + coo_matrix = None # type: ignore[assignment, misc] + csr_matrix = None # type: ignore[assignment, misc] + csr_array = None # type: ignore[assignment, misc] + 
coo_array = None # type: ignore[assignment, misc] try: - import sparse + import sparse # type: ignore[import-untyped, import-not-found] except ImportError: sparse = None @@ -401,7 +402,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): assert np.array_equal(array, result_array) -@pytest.mark.skipif(not coo_matrix, reason="requires scipy") +@pytest.mark.skipif(coo_matrix is None, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, @@ -443,7 +444,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, assert out_scipy_matrix.has_canonical_format -@pytest.mark.skipif(not csr_matrix, reason="requires scipy") +@pytest.mark.skipif(csr_matrix is None, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, @@ -483,7 +484,8 @@ def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): shape = (4, 6) dim_names = ("x", "y") - sparse_array = sparse.COO(data=data, coords=coords, shape=shape) + sparse_array = sparse.COO( # type: ignore[reportOptionalMemberAccess] + data=data, coords=coords, shape=shape) sparse_tensor = pa.SparseCOOTensor.from_pydata_sparse(sparse_array, dim_names=dim_names) out_sparse_array = sparse_tensor.to_pydata_sparse() diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index babb839b534..9505b9a11b0 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -25,7 +25,7 @@ @h.given(past.all_types) def test_types(ty): - assert isinstance(ty, pa.lib.DataType) + assert isinstance(ty, pa.DataType) @h.given(past.all_fields) @@ -41,7 +41,7 @@ def test_schemas(schema): @pytest.mark.numpy @h.given(past.all_arrays) def test_arrays(array): - assert isinstance(array, pa.lib.Array) + assert isinstance(array, pa.Array) @pytest.mark.numpy diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index fcd1c8d48c5..9ad65f0738d 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -25,13 +25,10 @@ from pyarrow.lib import tobytes from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError -try: - import pyarrow.substrait as substrait -except ImportError: - substrait = None - # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not substrait' +substrait = pytest.importorskip('pyarrow.substrait') +_substrait = pytest.importorskip('pyarrow._substrait') pytestmark = pytest.mark.substrait @@ -85,7 +82,7 @@ def test_run_serialized_query(tmpdir, use_threads): query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = pa._substrait._parse_json_plan(query) + buf = _substrait._parse_json_plan(query) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -116,7 +113,7 @@ def test_invalid_plan(): ] } """ - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) exec_message = "Plan has no relations" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf) @@ -162,7 +159,7 @@ def test_binary_conversion_with_json_options(tmpdir, use_threads): path = _write_dummy_data_to_disk(tmpdir, file_name, table) query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -181,7 +178,7 @@ def has_function(fns, ext_file, fn_name): def test_get_supported_functions(): - supported_functions = pa._substrait.get_supported_functions() + supported_functions = _substrait.get_supported_functions() # It probably doesn't make sense to exhaustively verify this list but # we can check a sample aggregate and a sample non-aggregate entry assert has_function(supported_functions, @@ -232,8 +229,8 @@ def table_provider(names, schema): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(tobytes(substrait_query)) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() assert res_tb == test_table_1 @@ -275,7 +272,7 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) + buf = _substrait._parse_json_plan(tobytes(substrait_query)) exec_message = "Invalid NamedTable Source" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -317,7 +314,7 @@ def table_provider(names, _): } """ query = tobytes(substrait_query) - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) exec_message = "names for NamedTable not provided" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -436,8 +433,8 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(substrait_query) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(substrait_query) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -559,9 +556,9 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(substrait_query) + buf = _substrait._parse_json_plan(substrait_query) with pytest.raises(pa.ArrowKeyError) as excinfo: - pa.substrait.run_query(buf, table_provider=table_provider) + substrait.run_query(buf, table_provider=table_provider) assert "No function registered" in str(excinfo.value) @@ -598,8 +595,8 @@ def table_provider(names, schema): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) - reader = pa.substrait.run_query( + 
buf = _substrait._parse_json_plan(tobytes(substrait_query)) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -744,8 +741,8 @@ def table_provider(names, _): ], } """ - buf = pa._substrait._parse_json_plan(substrait_query) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(substrait_query) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -913,8 +910,8 @@ def table_provider(names, _): ], } """ - buf = pa._substrait._parse_json_plan(substrait_query) - reader = pa.substrait.run_query( + buf = _substrait._parse_json_plan(substrait_query) + reader = substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -929,8 +926,8 @@ def table_provider(names, _): @pytest.mark.parametrize("expr", [ - pc.equal(pc.field("x"), 7), - pc.equal(pc.field("x"), pc.field("y")), + pc.equal(pc.field("x"), 7), # type: ignore[attr-defined] + pc.equal(pc.field("x"), pc.field("y")), # type: ignore[attr-defined] pc.field("x") > 50 ]) def test_serializing_expressions(expr): @@ -939,8 +936,8 @@ def test_serializing_expressions(expr): pa.field("y", pa.int32()) ]) - buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions([expr], ["test_expr"], schema) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 1 assert "test_expr" in returned.expressions @@ -958,8 +955,8 @@ def test_arrow_specific_types(): schema = pa.schema([pa.field(name, typ) for name, (typ, _) in fields.items()]) def check_round_trip(expr): - buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions([expr], ["test_expr"], schema) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema for name, (typ, val) in fields.items(): @@ -986,8 +983,8 @@ def test_arrow_one_way_types(): def check_one_way(field): expr = pc.is_null(pc.field(field.name)) - buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions([expr], ["test_expr"], schema) + returned = substrait.deserialize_expressions(buf) assert alt_schema == returned.schema for field in schema: @@ -1003,14 +1000,14 @@ def test_invalid_expression_ser_des(): bad_expr = pc.equal(pc.field("z"), 7) # Invalid number of names with pytest.raises(ValueError) as excinfo: - pa.substrait.serialize_expressions([expr], [], schema) + substrait.serialize_expressions([expr], [], schema) assert 'need to have the same length' in str(excinfo.value) with pytest.raises(ValueError) as excinfo: - pa.substrait.serialize_expressions([expr], ["foo", "bar"], schema) + substrait.serialize_expressions([expr], ["foo", "bar"], schema) assert 'need to have the same length' in str(excinfo.value) # Expression doesn't match schema with pytest.raises(ValueError) as excinfo: - pa.substrait.serialize_expressions([bad_expr], ["expr"], schema) + substrait.serialize_expressions([bad_expr], ["expr"], schema) assert 'No match for FieldRef' in str(excinfo.value) @@ -1020,8 +1017,8 @@ def test_serializing_multiple_expressions(): pa.field("y", pa.int32()) ]) exprs = [pc.equal(pc.field("x"), 7), pc.equal(pc.field("x"), pc.field("y"))] - buf = 
pa.substrait.serialize_expressions(exprs, ["first", "second"], schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = substrait.serialize_expressions(exprs, ["first", "second"], schema) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 2 @@ -1037,8 +1034,8 @@ def test_serializing_with_compute(): ]) expr = pc.equal(pc.field("x"), 7) expr_norm = pc.equal(pc.field(0), 7) - buf = expr.to_substrait(schema) - returned = pa.substrait.deserialize_expressions(buf) + buf = expr.to_substrait(schema) # type: ignore[union-attr] + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 1 @@ -1046,13 +1043,13 @@ def test_serializing_with_compute(): assert str(returned.expressions["expression"]) == str(expr_norm) # Compute can't deserialize messages with multiple expressions - buf = pa.substrait.serialize_expressions([expr, expr], ["first", "second"], schema) + buf = substrait.serialize_expressions([expr, expr], ["first", "second"], schema) with pytest.raises(ValueError) as excinfo: pc.Expression.from_substrait(buf) assert 'contained multiple expressions' in str(excinfo.value) # Deserialization should be possible regardless of the expression name - buf = pa.substrait.serialize_expressions([expr], ["weirdname"], schema) + buf = substrait.serialize_expressions([expr], ["weirdname"], schema) expr2 = pc.Expression.from_substrait(buf) assert str(expr2) == str(expr_norm) @@ -1069,11 +1066,11 @@ def test_serializing_udfs(): exprs = [pc.shift_left(a, b)] with pytest.raises(ArrowNotImplementedError): - pa.substrait.serialize_expressions(exprs, ["expr"], schema) + substrait.serialize_expressions(exprs, ["expr"], schema) - buf = pa.substrait.serialize_expressions( + buf = substrait.serialize_expressions( exprs, ["expr"], schema, allow_arrow_extensions=True) - returned = pa.substrait.deserialize_expressions(buf) + returned = substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 1 assert str(returned.expressions["expr"]) == str(exprs[0]) @@ -1085,19 +1082,19 @@ def test_serializing_schema(): pa.field("x", pa.int32()), pa.field("y", pa.string()) ]) - returned = pa.substrait.deserialize_schema(substrait_schema) + returned = substrait.deserialize_schema(substrait_schema) assert expected_schema == returned - arrow_substrait_schema = pa.substrait.serialize_schema(returned) + arrow_substrait_schema = substrait.serialize_schema(returned) assert arrow_substrait_schema.schema == substrait_schema - returned = pa.substrait.deserialize_schema(arrow_substrait_schema) + returned = substrait.deserialize_schema(arrow_substrait_schema) assert expected_schema == returned - returned = pa.substrait.deserialize_schema(arrow_substrait_schema.schema) + returned = substrait.deserialize_schema(arrow_substrait_schema.schema) assert expected_schema == returned - returned = pa.substrait.deserialize_expressions(arrow_substrait_schema.expression) + returned = substrait.deserialize_expressions(arrow_substrait_schema.expression) assert returned.schema == expected_schema @@ -1114,7 +1111,7 @@ def SerializeToString(self): b'\x1a\x19\n\x06\x12\x04\n\x02\x12\x00\x1a\x0fproject_version' b'"0\n\x0fproject_version\n\x0fproject_release' b'\x12\x0c\n\x04:\x02\x10\x01\n\x04b\x02\x10\x01') - exprs = pa.substrait.BoundExpressions.from_substrait(FakeMessage(message)) + exprs = substrait.BoundExpressions.from_substrait(FakeMessage(message)) assert 
len(exprs.expressions) == 2 assert 'project_release' in exprs.expressions assert 'project_version' in exprs.expressions diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c..0b25187bfd9 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -18,12 +18,13 @@ from collections import OrderedDict from collections.abc import Iterable import sys +from typing import cast import weakref try: import numpy as np except ImportError: - np = None + pass import pytest import pyarrow as pa import pyarrow.compute as pc @@ -418,7 +419,8 @@ def test_to_pandas_empty_table(): table = pa.table(df) result = table.schema.empty_table().to_pandas() assert result.shape == (0, 2) - tm.assert_frame_equal(result, df.iloc[:0]) + expected = cast(pd.DataFrame, df.iloc[:0]) + tm.assert_frame_equal(result, expected) @pytest.mark.pandas @@ -486,12 +488,25 @@ def test_chunked_array_unify_dictionaries(): pa.array(["foo", "bar", None, "foo"]).dictionary_encode(), pa.array(["quux", None, "foo"]).dictionary_encode(), ]) - assert arr.chunk(0).dictionary.equals(pa.array(["foo", "bar"])) - assert arr.chunk(1).dictionary.equals(pa.array(["quux", "foo"])) + chunk_0 = arr.chunk(0) + assert isinstance(chunk_0, pa.DictionaryArray) + assert chunk_0.dictionary.equals(pa.array(["foo", "bar"])) + + chunk_1 = arr.chunk(1) + assert isinstance(chunk_1, pa.DictionaryArray) + assert chunk_1.dictionary.equals(pa.array(["quux", "foo"])) + arr = arr.unify_dictionaries() expected_dict = pa.array(["foo", "bar", "quux"]) - assert arr.chunk(0).dictionary.equals(expected_dict) - assert arr.chunk(1).dictionary.equals(expected_dict) + + chunk_0 = arr.chunk(0) + assert isinstance(chunk_0, pa.DictionaryArray) + assert chunk_0.dictionary.equals(expected_dict) + + chunk_1 = arr.chunk(1) + assert isinstance(chunk_1, pa.DictionaryArray) + assert chunk_1.dictionary.equals(expected_dict) + assert arr.to_pylist() == ["foo", "bar", None, "foo", "quux", None, "foo"] @@ -716,7 +731,7 @@ def test_recordbatch_take(): def test_recordbatch_column_sets_private_name(): # ARROW-6429 rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0']) - assert rb[0]._name == 'a0' + assert rb.column(0)._name == 'a0' def test_recordbatch_from_arrays_validate_schema(): @@ -798,7 +813,7 @@ def test_recordbatch_get_field(): batch.field('d') with pytest.raises(TypeError): - batch.field(None) + batch.field(None) # type: ignore[arg-type] with pytest.raises(IndexError): batch.field(4) @@ -819,7 +834,7 @@ def test_recordbatch_select_column(): batch.column('d') with pytest.raises(TypeError): - batch.column(None) + batch.column(None) # type: ignore[arg-type] with pytest.raises(IndexError): batch.column(4) @@ -933,7 +948,10 @@ def test_table_from_struct_array_chunked_array(): [[{"ints": 1}, {"floats": 1.0}]], type=pa.struct([("ints", pa.int32()), ("floats", pa.float32())]), ) - result = pa.Table.from_struct_array(chunked_struct_array) + assert isinstance(chunked_struct_array.type, pa.StructType) + # Cast to the proper type for type checker + struct_chunked_array = cast(pa.ChunkedArray, chunked_struct_array) + result = pa.Table.from_struct_array(struct_chunked_array) assert result.equals(pa.Table.from_arrays( [ pa.array([1, None], type=pa.int32()), @@ -1339,7 +1357,7 @@ def test_recordbatchlist_schema_equals(): def test_table_column_sets_private_name(): # ARROW-6429 t = pa.table([pa.array([1, 2, 3, 4])], names=['a0']) - assert t[0]._name == 'a0' + assert t.column(0)._name == 'a0' def test_table_equals(): @@ 
-1500,7 +1518,8 @@ def test_table_from_arrays_preserves_column_metadata(): field1 = pa.field('field2', pa.int64(), nullable=False) table = pa.Table.from_arrays([arr0, arr1], schema=pa.schema([field0, field1])) - assert b"a" in table.field(0).metadata + field0_metadata = table.field(0).metadata + assert field0_metadata is not None and b"a" in field0_metadata assert table.field(1).nullable is False @@ -1565,7 +1584,7 @@ def test_table_get_field(): table.field('d') with pytest.raises(TypeError): - table.field(None) + table.field(None) # type: ignore[arg-type] with pytest.raises(IndexError): table.field(4) @@ -1586,7 +1605,7 @@ def test_table_select_column(): table.column('d') with pytest.raises(TypeError): - table.column(None) + table.column(None) # type: ignore[arg-type] with pytest.raises(IndexError): table.column(4) @@ -1879,22 +1898,41 @@ def test_table_unify_dictionaries(): table = pa.Table.from_batches([batch1, batch2]) table = table.replace_schema_metadata({b"key1": b"value1"}) - assert table.column(0).chunk(0).dictionary.equals( - pa.array(["foo", "bar"])) - assert table.column(0).chunk(1).dictionary.equals( - pa.array(["quux", "foo"])) - assert table.column(1).chunk(0).dictionary.equals( - pa.array([123, 456, 789])) - assert table.column(1).chunk(1).dictionary.equals( - pa.array([456, 789])) + chunk_0_0 = table.column(0).chunk(0) + assert isinstance(chunk_0_0, pa.DictionaryArray) + assert chunk_0_0.dictionary.equals(pa.array(["foo", "bar"])) + + chunk_0_1 = table.column(0).chunk(1) + assert isinstance(chunk_0_1, pa.DictionaryArray) + assert chunk_0_1.dictionary.equals(pa.array(["quux", "foo"])) + + chunk_1_0 = table.column(1).chunk(0) + assert isinstance(chunk_1_0, pa.DictionaryArray) + assert chunk_1_0.dictionary.equals(pa.array([123, 456, 789])) + + chunk_1_1 = table.column(1).chunk(1) + assert isinstance(chunk_1_1, pa.DictionaryArray) + assert chunk_1_1.dictionary.equals(pa.array([456, 789])) table = table.unify_dictionaries(pa.default_memory_pool()) expected_dict_0 = pa.array(["foo", "bar", "quux"]) expected_dict_1 = pa.array([123, 456, 789]) - assert table.column(0).chunk(0).dictionary.equals(expected_dict_0) - assert table.column(0).chunk(1).dictionary.equals(expected_dict_0) - assert table.column(1).chunk(0).dictionary.equals(expected_dict_1) - assert table.column(1).chunk(1).dictionary.equals(expected_dict_1) + + chunk_0_0 = table.column(0).chunk(0) + assert isinstance(chunk_0_0, pa.DictionaryArray) + assert chunk_0_0.dictionary.equals(expected_dict_0) + + chunk_0_1 = table.column(0).chunk(1) + assert isinstance(chunk_0_1, pa.DictionaryArray) + assert chunk_0_1.dictionary.equals(expected_dict_0) + + chunk_1_0 = table.column(1).chunk(0) + assert isinstance(chunk_1_0, pa.DictionaryArray) + assert chunk_1_0.dictionary.equals(expected_dict_1) + + chunk_1_1 = table.column(1).chunk(1) + assert isinstance(chunk_1_1, pa.DictionaryArray) + assert chunk_1_1.dictionary.equals(expected_dict_1) assert table.to_pydict() == { 'a': ["foo", "bar", None, "foo", "quux", "foo", None, "quux"], @@ -1964,13 +2002,13 @@ def test_concat_tables_invalid_option(): t = pa.Table.from_arrays([list(range(10))], names=('a',)) with pytest.raises(ValueError, match="Invalid promote_options: invalid"): - pa.concat_tables([t, t], promote_options="invalid") + pa.concat_tables([t, t], promote_options="invalid") # type: ignore[arg-type] def test_concat_tables_none_table(): # ARROW-11997 with pytest.raises(AttributeError): - pa.concat_tables([None]) + pa.concat_tables([None]) # type: ignore[arg-type] 
@pytest.mark.pandas @@ -2113,7 +2151,7 @@ def test_concat_batches_different_schema(): def test_concat_batches_none_batches(): # ARROW-11997 with pytest.raises(AttributeError): - pa.concat_batches([None]) + pa.concat_batches([None]) # type: ignore[arg-type] @pytest.mark.parametrize( @@ -2264,7 +2302,7 @@ def test_from_arrays_schema(data, klass): # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] # Cannot pass both schema and metadata / names with pytest.raises(ValueError): @@ -2369,7 +2407,7 @@ def test_table_from_pydict_arrow_arrays(data, klass): # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] @pytest.mark.parametrize('data, klass', [ @@ -2386,7 +2424,7 @@ def test_table_from_pydict_schema(data, klass): schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()), ('ints', pa.int64())]) with pytest.raises(KeyError, match='ints'): - pa.Table.from_pydict(data, schema=schema) + pa.Table.from_pydict(data, schema=schema) # type: ignore[arg-type] # data has columns not present in schema -> ignored schema = pa.schema([('strs', pa.utf8())]) @@ -2590,10 +2628,10 @@ def test_table_factory_function_args_pandas(): def test_factory_functions_invalid_input(): with pytest.raises(TypeError, match="Expected pandas DataFrame, python"): - pa.table("invalid input") + pa.table("invalid input") # type: ignore[arg-type] with pytest.raises(TypeError, match="Expected pandas DataFrame"): - pa.record_batch("invalid input") + pa.record_batch("invalid input") # type: ignore[arg-type] def test_table_repr_to_string(): @@ -2727,8 +2765,8 @@ def test_table_function_unicode_schema(): schema = pa.schema([(col_a, pa.int32()), (col_b, pa.string())]) result = pa.table(d, schema=schema) - assert result[0].chunk(0).equals(pa.array([1, 2, 3], type='int32')) - assert result[1].chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) + assert result.column(0).chunk(0).equals(pa.array([1, 2, 3], type='int32')) + assert result.column(1).chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) def test_table_take_vanilla_functionality(): @@ -3603,7 +3641,7 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr # equals() test with pytest.raises(NotImplementedError): - cuda_chunked_array == cuda_chunked_array + cuda_chunked_array == cuda_chunked_array # type: ignore[reportUnusedExpression] # to_pandas() test with pytest.raises(NotImplementedError): @@ -3860,7 +3898,7 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # __dataframe__() test with pytest.raises(NotImplementedError): - from_dataframe(cuda_recordbatch.__dataframe__()) + from_dataframe(cuda_recordbatch.__dataframe__()) # type: ignore[misc] def verify_cuda_table(table, expected_schema): @@ -4059,7 +4097,7 @@ def test_table_non_cpu(cuda_context, cpu_table, cuda_table, # __dataframe__() test with pytest.raises(NotImplementedError): - from_dataframe(cuda_table.__dataframe__()) + from_dataframe(cuda_table.__dataframe__()) # type: ignore[misc] # __reduce__() test with pytest.raises(NotImplementedError): diff --git a/python/pyarrow/tests/test_tensor.py 
b/python/pyarrow/tests/test_tensor.py index debb1066280..c3726fdbbf4 100644 --- a/python/pyarrow/tests/test_tensor.py +++ b/python/pyarrow/tests/test_tensor.py @@ -213,7 +213,7 @@ def test_tensor_memoryview(): dtype = data.dtype lst = data.tolist() tensor = pa.Tensor.from_numpy(data) - m = memoryview(tensor) + m = memoryview(tensor) # type: ignore[reportArgumentType] assert m.format == expected_format assert m.shape == data.shape assert m.strides == data.strides diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e628e559b84..1779791ea49 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -24,16 +24,22 @@ import pytest import hypothesis as h import hypothesis.strategies as st -try: - import hypothesis.extra.pytz as tzst -except ImportError: - tzst = None +from typing import Any, TYPE_CHECKING import weakref -try: +if TYPE_CHECKING: import numpy as np -except ImportError: - np = None + import hypothesis.extra.pytz as tzst +else: + try: + import numpy as np + except ImportError: + np = None + try: + import hypothesis.extra.pytz as tzst + except ImportError: + tzst = None + import pyarrow as pa import pyarrow.types as types import pyarrow.tests.strategies as past @@ -414,7 +420,7 @@ def test_tzinfo_to_string_errors(): if tzst: timezones = tzst.timezones() else: - timezones = st.none() + timezones = st.none() # type: ignore[assignment] @h.given(timezones) @@ -468,7 +474,7 @@ class BuggyTimezone2(datetime.tzinfo): def tzname(self, dt): return None - def utcoffset(self, dt): + def utcoffset(self, dt): # type: ignore[override] return "one hour" class BuggyTimezone3(datetime.tzinfo): @@ -476,7 +482,7 @@ class BuggyTimezone3(datetime.tzinfo): Wrong timezone name type """ - def tzname(self, dt): + def tzname(self, dt): # type: ignore[override] return 240 def utcoffset(self, dt): @@ -735,13 +741,13 @@ def test_struct_type(): # Neither integer nor string with pytest.raises(TypeError): - ty[None] + ty[None] # type: ignore[reportArgumentType] with pytest.raises(TypeError): - ty.field(None) + ty.field(None) # type: ignore[reportArgumentType] for a, b in zip(ty, fields): - a == b + assert a == b # Construct from list of tuples ty = pa.struct([('a', pa.int64()), @@ -749,7 +755,7 @@ def test_struct_type(): ('b', pa.int32())]) assert list(ty) == fields for a, b in zip(ty, fields): - a == b + assert a == b # Construct from mapping fields = [pa.field('a', pa.int64()), @@ -758,7 +764,7 @@ def test_struct_type(): ('b', pa.int32())])) assert list(ty) == fields for a, b in zip(ty, fields): - a == b + assert a == b # Invalid args with pytest.raises(TypeError): @@ -865,7 +871,7 @@ def test_dictionary_type(): # invalid index type raises with pytest.raises(TypeError): - pa.dictionary(pa.string(), pa.int64()) + pa.dictionary(pa.string(), pa.int64()) # type: ignore[reportArgumentType] def test_dictionary_ordered_equals(): @@ -954,7 +960,7 @@ def test_run_end_encoded_type(): pa.run_end_encoded(None, pa.utf8()) with pytest.raises(ValueError): - pa.run_end_encoded(pa.int8(), pa.utf8()) + pa.run_end_encoded(pa.int8(), pa.utf8()) # type: ignore[reportArgumentType] @pytest.mark.parametrize('t,check_func', [ @@ -1087,12 +1093,12 @@ def test_timedelta_overflow(): pa.scalar(d, type=pa.duration('ns')) # microsecond resolution, not overflow - pa.scalar(d, type=pa.duration('us')).as_py() == d + assert pa.scalar(d, type=pa.duration('us')).as_py() == d # second/millisecond resolution, not overflow for d in [datetime.timedelta.min, datetime.timedelta.max]: 
- pa.scalar(d, type=pa.duration('ms')).as_py() == d - pa.scalar(d, type=pa.duration('s')).as_py() == d + _ = pa.scalar(d, type=pa.duration('ms')).as_py() == d + _ = pa.scalar(d, type=pa.duration('s')).as_py() == d def test_type_equality_operators(): @@ -1130,11 +1136,11 @@ def test_key_value_metadata(): assert m1 != {'a': 'A', 'b': 'C'} with pytest.raises(TypeError): - pa.KeyValueMetadata({'a': 1}) + pa.KeyValueMetadata({'a': 1}) # type: ignore[reportArgumentType] with pytest.raises(TypeError): - pa.KeyValueMetadata({1: 'a'}) + pa.KeyValueMetadata({1: 'a'}) # type: ignore[reportArgumentType] with pytest.raises(TypeError): - pa.KeyValueMetadata(a=1) + pa.KeyValueMetadata(a=1) # type: ignore[reportArgumentType] expected = [(b'a', b'A'), (b'b', b'B')] result = [(k, v) for k, v in m3.items()] @@ -1261,6 +1267,7 @@ def test_field_metadata(): assert f1.metadata is None assert f2.metadata == {} + assert f3.metadata is not None assert f3.metadata[b'bizz'] == b'bazz' @@ -1397,7 +1404,7 @@ def __arrow_c_schema__(self): return self.schema.__arrow_c_schema__() -class SchemaMapping(Mapping): +class SchemaMapping(Mapping[Any, Any]): def __init__(self, schema): self.schema = schema diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 93004a30618..e028f1c0484 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa from pyarrow import compute as pc @@ -35,7 +35,7 @@ try: import pyarrow.dataset as ds except ImportError: - ds = None + pass def mock_udf_context(batch_length=10): @@ -381,6 +381,7 @@ def check_scalar_function(func_fixture, func = pc.get_function(name) assert func.name == name + assert batch_length is not None result = pc.call_function(name, inputs, length=batch_length) expected_output = function(mock_udf_context(batch_length), *inputs) @@ -580,8 +581,8 @@ def identity(ctx, val): } with pytest.raises(TypeError, match="DataType expected, got "): - pc.register_scalar_function(identity, func_name, - doc, in_types, out_type) + pc.register_scalar_function( + identity, func_name, doc, in_types, out_type) # type: ignore[arg-type] def test_wrong_input_type_declaration(): @@ -597,8 +598,9 @@ def identity(ctx, val): } with pytest.raises(TypeError, match="DataType expected, got "): - pc.register_scalar_function(identity, func_name, doc, - in_types, out_type) + pc.register_scalar_function( + identity, func_name, doc, in_types, # type: ignore[arg-type] + out_type) def test_scalar_udf_context(unary_func_fixture): diff --git a/python/pyarrow/tests/test_without_numpy.py b/python/pyarrow/tests/test_without_numpy.py index 55c12602ce8..c5f5671aabc 100644 --- a/python/pyarrow/tests/test_without_numpy.py +++ b/python/pyarrow/tests/test_without_numpy.py @@ -50,6 +50,7 @@ def test_tensor_to_np(): arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] storage = pa.array(arr, pa.list_(pa.int32(), 4)) tensor_array = pa.ExtensionArray.from_storage(tensor_type, storage) + assert isinstance(tensor_array, pa.FixedShapeTensorArray) tensor = tensor_array.to_tensor() msg = "Cannot return a numpy.ndarray if NumPy is not present" diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 7e3dd4324e9..8183b91a92e 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -336,6 +336,7 @@ def _ensure_minio_component_version(component, minimum_year): stderr=subprocess.PIPE, encoding='utf-8') as proc: if proc.wait(10) != 0: 
return False + assert proc.stdout is not None stdout = proc.stdout.read() pattern = component + r' version RELEASE\.(\d+)-.*' version_match = re.search(pattern, stdout) @@ -367,6 +368,8 @@ def _run_mc_command(mcdir, *args): cmd_str = ' '.join(full_args) print(f'Cmd: {cmd_str}') print(f' Return: {retval}') + assert proc.stdout is not None + assert proc.stderr is not None print(f' Stdout: {proc.stdout.read()}') print(f' Stderr: {proc.stderr.read()}') if retval != 0: diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py index 6c4d6e01400..47aeeed40ae 100644 --- a/python/pyarrow/vendored/docscrape.py +++ b/python/pyarrow/vendored/docscrape.py @@ -18,7 +18,7 @@ import sys -def strip_blank_lines(l): +def strip_blank_lines(l): # noqa: E741 "Remove leading and trailing blank lines from a list of lines" while l and not l[0].strip(): del l[0] @@ -62,7 +62,7 @@ def read(self): return '' def seek_next_non_empty_line(self): - for l in self[self._l:]: + for l in self[self._l:]: # noqa: E741 if l.strip(): break else: @@ -185,8 +185,9 @@ def _is_at_section(self): l2 = self._doc.peek(1).strip() # ---------- or ========== if len(l2) >= 3 and (set(l2) in ({'-'}, {'='})) and len(l2) != len(l1): snip = '\n'.join(self._doc._str[:2])+'...' - self._error_location("potentially wrong underline length... \n%s \n%s in \n%s" - % (l1, l2, snip), error=False) + self._error_location( + "potentially wrong underline length... \n%s \n%s in \n%s" + % (l1, l2, snip), error=False) return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) def _strip(self, doc): diff --git a/python/pyproject.toml b/python/pyproject.toml index fe812227ebe..e50c8e4d490 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -85,7 +85,7 @@ include = ["pyarrow"] namespaces = false [tool.setuptools.package-data] -pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"] +pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd", "py.typed"] [tool.setuptools_scm] root = '..' @@ -93,3 +93,27 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '23.0.0a0' + +[tool.mypy] +files = ["pyarrow"] +exclude = 'pyarrow/interchange/.*|pyarrow/vendored/.*|pyarrow/tests/test_cuda*' +mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" + +[tool.pyright] +pythonPlatform = "All" +pythonVersion = "3.10" +include = ["pyarrow"] +exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/test_cuda*"] +stubPath = "pyarrow-stubs" +typeCheckingMode = "basic" + +[tool.ty.src] +include = ["pyarrow"] +exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/test_cuda*"] + +[tool.ty.environment] +root = ["pyarrow"] + +[tool.ty.rules] +unresolved-import = "ignore" +unresolved-attribute = "ignore" diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 53d3dd52bd8..6015cc211c1 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -114,7 +114,7 @@ def end_headers(self): def run_server_thread(dist_dir, q): - global _SERVER_ADDRESS + global _SERVER_ADDRESS # noqa: F824 os.chdir(dist_dir) server = http.server.HTTPServer(("", 0), TemplateOverrider) q.put(server.server_address)
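Note: the following is an illustrative sketch, not part of the patch itself. It shows the typing.cast() narrowing pattern that the test changes above rely on in test_scalars.py and test_table.py: Array.__getitem__ is typed as returning the base Scalar class, so the tests cast to the concrete scalar they expect at runtime before touching subtype-only attributes such as DictionaryScalar.value. The class and method names come from the patched tests; the tiny example itself is only a sketch.

    from typing import cast

    import pyarrow as pa

    # Dictionary-encode a string array; indexing yields a DictionaryScalar at
    # runtime, but the declared return type is the generic Scalar base class.
    arr = pa.array(["a", "b", "a"]).dictionary_encode()
    s = cast(pa.DictionaryScalar, arr[0])
    assert s.as_py() == "a"        # available on any scalar
    assert s.value.as_py() == "a"  # .value exists only on the narrowed type

Where a runtime check is cheap, the patched tests use isinstance() narrowing instead (as with the DictionaryArray chunks in test_table.py), which documents the expectation and satisfies mypy and pyright without a cast.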
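A second illustrative sketch, again not part of the patch: the pytest.importorskip pattern that replaces the try/except import guard in test_substrait.py. importorskip skips the whole module at collection time when the import fails, and unlike binding the name to None it leaves the name bound to a real module object, so the checkers never see an Optional module.

    import pytest

    # Skip every test in this module when the Substrait extension is missing;
    # on success both names are plain module objects, not Optional values.
    substrait = pytest.importorskip("pyarrow.substrait")
    _substrait = pytest.importorskip("pyarrow._substrait")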