From 1b0ef0224e30bba75c970d6a8c2f64d48bbbced6 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Fri, 5 Apr 2024 20:13:04 +1100 Subject: [PATCH] Update py03 from 0.20 to 0.21 (#5566) * Update py03 from 0.20 to 0.21 * Bump pyo3 in arrow-pyarrow-integration-testing * Update pyarrow API to align with pyo3 0.21 changes * Fix arrow-pyarrow-integration-testing clippy * Minor * Fix typo * Use PyBackedStr when extracting * Bump to pyo3 0.21.1 * Trigger --- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow-pyarrow-integration-testing/src/lib.rs | 25 ++---- arrow/Cargo.toml | 2 +- arrow/src/pyarrow.rs | 95 +++++++++++--------- arrow/tests/pyarrow.rs | 4 +- 5 files changed, 67 insertions(+), 61 deletions(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 8c60c086c29a..6f07d42d88c1 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,4 +34,4 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", features = ["pyarrow"] } -pyo3 = { version = "0.20", features = ["extension-module"] } +pyo3 = { version = "0.21.1", features = ["extension-module"] } diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index a53447b53c31..918fa74e3083 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -40,9 +40,9 @@ fn to_py_err(err: ArrowError) -> PyErr { /// Returns `array + array` of an int64 array. #[pyfunction] -fn double(array: &PyAny, py: Python) -> PyResult { +fn double(array: &Bound, py: Python) -> PyResult { // import - let array = make_array(ArrayData::from_pyarrow(array)?); + let array = make_array(ArrayData::from_pyarrow_bound(&array)?); // perform some operation let array = array @@ -60,7 +60,7 @@ fn double(array: &PyAny, py: Python) -> PyResult { /// calls a lambda function that receives and returns an array /// whose result must be the array multiplied by two #[pyfunction] -fn double_py(lambda: &PyAny, py: Python) -> PyResult { +fn double_py(lambda: &Bound, py: Python) -> PyResult { // create let array = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])); let expected = Arc::new(Int64Array::from(vec![Some(2), None, Some(6)])) as ArrayRef; @@ -68,7 +68,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { // to py let pyarray = array.to_data().to_pyarrow(py)?; let pyarray = lambda.call1((pyarray,))?; - let array = make_array(ArrayData::from_pyarrow(pyarray)?); + let array = make_array(ArrayData::from_pyarrow_bound(&pyarray)?); Ok(array == expected) } @@ -82,16 +82,12 @@ fn make_empty_array(datatype: PyArrowType, py: Python) -> PyResult, - start: i64, -) -> PyResult> { +fn substring(array: PyArrowType, start: i64) -> PyResult> { // import let array = make_array(array.0); // substring - let array = - kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; + let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; Ok(array.to_data().into()) } @@ -102,8 +98,7 @@ fn concatenate(array: PyArrowType, py: Python) -> PyResult let array = make_array(array.0); // concat - let array = - kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; + let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; array.to_data().to_pyarrow(py) } @@ -129,9 +124,7 @@ fn round_trip_array(obj: PyArrowType) -> PyResult, -) -> PyResult> { +fn round_trip_record_batch(obj: PyArrowType) -> PyResult> { Ok(obj) } @@ -168,7 +161,7 @@ fn boxed_reader_roundtrip( } #[pymodule] -fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> { +fn arrow_pyarrow_integration_testing(_py: Python, m: &Bound) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(double))?; m.add_wrapped(wrap_pyfunction!(double_py))?; m.add_wrapped(wrap_pyfunction!(make_empty_array))?; diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 4f7fda9b8075..a938d75b1a6f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,7 +54,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.20", default-features = false, optional = true } +pyo3 = { version = "0.21.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 39702ce01aea..1733067c738a 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -64,6 +64,7 @@ use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; +use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyCapsule, PyList, PyTuple}; use crate::array::{make_array, ArrayData}; @@ -82,7 +83,12 @@ fn to_py_err(err: ArrowError) -> PyErr { } pub trait FromPyArrow: Sized { - fn from_pyarrow(value: &PyAny) -> PyResult; + #[deprecated(since = "52.0.0", note = "Use from_pyarrow_bound")] + fn from_pyarrow(value: &PyAny) -> PyResult { + Self::from_pyarrow_bound(&value.as_borrowed()) + } + + fn from_pyarrow_bound(value: &Bound) -> PyResult; } /// Create a new PyArrow object from a arrow-rs type. @@ -101,15 +107,17 @@ impl IntoPyArrow for T { } } -fn validate_class(expected: &str, value: &PyAny) -> PyResult<()> { - let pyarrow = PyModule::import(value.py(), "pyarrow")?; +fn validate_class(expected: &str, value: &Bound) -> PyResult<()> { + let pyarrow = PyModule::import_bound(value.py(), "pyarrow")?; let class = pyarrow.getattr(expected)?; - if !value.is_instance(class)? { - let expected_module = class.getattr("__module__")?.extract::<&str>()?; - let expected_name = class.getattr("__name__")?.extract::<&str>()?; + if !value.is_instance(&class)? { + let expected_module = class.getattr("__module__")?.extract::()?; + let expected_name = class.getattr("__name__")?.extract::()?; let found_class = value.get_type(); - let found_module = found_class.getattr("__module__")?.extract::<&str>()?; - let found_name = found_class.getattr("__name__")?.extract::<&str>()?; + let found_module = found_class + .getattr("__module__")? + .extract::()?; + let found_name = found_class.getattr("__name__")?.extract::()?; return Err(PyTypeError::new_err(format!( "Expected instance of {}.{}, got {}.{}", expected_module, expected_name, found_module, found_name @@ -118,7 +126,7 @@ fn validate_class(expected: &str, value: &PyAny) -> PyResult<()> { Ok(()) } -fn validate_pycapsule(capsule: &PyCapsule, name: &str) -> PyResult<()> { +fn validate_pycapsule(capsule: &Bound, name: &str) -> PyResult<()> { let capsule_name = capsule.name()?; if capsule_name.is_none() { return Err(PyValueError::new_err( @@ -138,13 +146,13 @@ fn validate_pycapsule(capsule: &PyCapsule, name: &str) -> PyResult<()> { } impl FromPyArrow for DataType { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html if value.hasattr("__arrow_c_schema__")? { - let capsule: &PyCapsule = - PyTryInto::try_into(value.getattr("__arrow_c_schema__")?.call0()?)?; + let capsule = value.getattr("__arrow_c_schema__")?.call0()?; + let capsule = capsule.downcast::()?; validate_pycapsule(capsule, "arrow_schema")?; let schema_ptr = unsafe { capsule.reference::() }; @@ -166,7 +174,7 @@ impl ToPyArrow for DataType { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; - let module = py.import("pyarrow")?; + let module = py.import_bound("pyarrow")?; let class = module.getattr("DataType")?; let dtype = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(dtype.into()) @@ -174,13 +182,13 @@ impl ToPyArrow for DataType { } impl FromPyArrow for Field { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html if value.hasattr("__arrow_c_schema__")? { - let capsule: &PyCapsule = - PyTryInto::try_into(value.getattr("__arrow_c_schema__")?.call0()?)?; + let capsule = value.getattr("__arrow_c_schema__")?.call0()?; + let capsule = capsule.downcast::()?; validate_pycapsule(capsule, "arrow_schema")?; let schema_ptr = unsafe { capsule.reference::() }; @@ -202,7 +210,7 @@ impl ToPyArrow for Field { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; - let module = py.import("pyarrow")?; + let module = py.import_bound("pyarrow")?; let class = module.getattr("Field")?; let dtype = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(dtype.into()) @@ -210,13 +218,13 @@ impl ToPyArrow for Field { } impl FromPyArrow for Schema { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html if value.hasattr("__arrow_c_schema__")? { - let capsule: &PyCapsule = - PyTryInto::try_into(value.getattr("__arrow_c_schema__")?.call0()?)?; + let capsule = value.getattr("__arrow_c_schema__")?.call0()?; + let capsule = capsule.downcast::()?; validate_pycapsule(capsule, "arrow_schema")?; let schema_ptr = unsafe { capsule.reference::() }; @@ -238,7 +246,7 @@ impl ToPyArrow for Schema { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; - let module = py.import("pyarrow")?; + let module = py.import_bound("pyarrow")?; let class = module.getattr("Schema")?; let schema = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(schema.into()) @@ -246,7 +254,7 @@ impl ToPyArrow for Schema { } impl FromPyArrow for ArrayData { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html @@ -259,8 +267,10 @@ impl FromPyArrow for ArrayData { )); } - let schema_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(0)?)?; - let array_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(1)?)?; + let schema_capsule = tuple.get_item(0)?; + let schema_capsule = schema_capsule.downcast::()?; + let array_capsule = tuple.get_item(1)?; + let array_capsule = array_capsule.downcast::()?; validate_pycapsule(schema_capsule, "arrow_schema")?; validate_pycapsule(array_capsule, "arrow_array")?; @@ -296,7 +306,7 @@ impl ToPyArrow for ArrayData { let array = FFI_ArrowArray::new(self); let schema = FFI_ArrowSchema::try_from(self.data_type()).map_err(to_py_err)?; - let module = py.import("pyarrow")?; + let module = py.import_bound("pyarrow")?; let class = module.getattr("Array")?; let array = class.call_method1( "_import_from_c", @@ -310,9 +320,9 @@ impl ToPyArrow for ArrayData { } impl FromPyArrow for Vec { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { let list = value.downcast::()?; - list.iter().map(|x| T::from_pyarrow(x)).collect() + list.iter().map(|x| T::from_pyarrow_bound(&x)).collect() } } @@ -327,7 +337,7 @@ impl ToPyArrow for Vec { } impl FromPyArrow for RecordBatch { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html @@ -340,8 +350,10 @@ impl FromPyArrow for RecordBatch { )); } - let schema_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(0)?)?; - let array_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(1)?)?; + let schema_capsule = tuple.get_item(0)?; + let schema_capsule = schema_capsule.downcast::()?; + let array_capsule = tuple.get_item(1)?; + let array_capsule = array_capsule.downcast::()?; validate_pycapsule(schema_capsule, "arrow_schema")?; validate_pycapsule(array_capsule, "arrow_array")?; @@ -370,12 +382,13 @@ impl FromPyArrow for RecordBatch { validate_class("RecordBatch", value)?; // TODO(kszucs): implement the FFI conversions in arrow-rs for RecordBatches let schema = value.getattr("schema")?; - let schema = Arc::new(Schema::from_pyarrow(schema)?); + let schema = Arc::new(Schema::from_pyarrow_bound(&schema)?); - let arrays = value.getattr("columns")?.downcast::()?; + let arrays = value.getattr("columns")?; let arrays = arrays + .downcast::()? .iter() - .map(|a| Ok(make_array(ArrayData::from_pyarrow(a)?))) + .map(|a| Ok(make_array(ArrayData::from_pyarrow_bound(&a)?))) .collect::>()?; let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; @@ -395,13 +408,13 @@ impl ToPyArrow for RecordBatch { /// Supports conversion from `pyarrow.RecordBatchReader` to [ArrowArrayStreamReader]. impl FromPyArrow for ArrowArrayStreamReader { - fn from_pyarrow(value: &PyAny) -> PyResult { + fn from_pyarrow_bound(value: &Bound) -> PyResult { // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html if value.hasattr("__arrow_c_stream__")? { - let capsule: &PyCapsule = - PyTryInto::try_into(value.getattr("__arrow_c_stream__")?.call0()?)?; + let capsule = value.getattr("__arrow_c_stream__")?.call0()?; + let capsule = capsule.downcast::()?; validate_pycapsule(capsule, "arrow_array_stream")?; let stream = unsafe { FFI_ArrowArrayStream::from_raw(capsule.pointer() as _) }; @@ -421,7 +434,7 @@ impl FromPyArrow for ArrowArrayStreamReader { // make the conversion through PyArrow's private API // this changes the pointer's memory and is thus unsafe. // In particular, `_export_to_c` can go out of bounds - let args = PyTuple::new(value.py(), [stream_ptr as Py_uintptr_t]); + let args = PyTuple::new_bound(value.py(), [stream_ptr as Py_uintptr_t]); value.call_method1("_export_to_c", args)?; let stream_reader = ArrowArrayStreamReader::try_new(stream) @@ -439,9 +452,9 @@ impl IntoPyArrow for Box { let mut stream = FFI_ArrowArrayStream::new(self); let stream_ptr = (&mut stream) as *mut FFI_ArrowArrayStream; - let module = py.import("pyarrow")?; + let module = py.import_bound("pyarrow")?; let class = module.getattr("RecordBatchReader")?; - let args = PyTuple::new(py, [stream_ptr as Py_uintptr_t]); + let args = PyTuple::new_bound(py, [stream_ptr as Py_uintptr_t]); let reader = class.call_method1("_import_from_c", args)?; Ok(PyObject::from(reader)) @@ -463,8 +476,8 @@ impl IntoPyArrow for ArrowArrayStreamReader { pub struct PyArrowType(pub T); impl<'source, T: FromPyArrow> FromPyObject<'source> for PyArrowType { - fn extract(value: &'source PyAny) -> PyResult { - Ok(Self(T::from_pyarrow(value)?)) + fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { + Ok(Self(T::from_pyarrow_bound(value)?)) } } diff --git a/arrow/tests/pyarrow.rs b/arrow/tests/pyarrow.rs index 4b6991da0063..a1c365c31798 100644 --- a/arrow/tests/pyarrow.rs +++ b/arrow/tests/pyarrow.rs @@ -32,9 +32,9 @@ fn test_to_pyarrow() { let res = Python::with_gil(|py| { let py_input = input.to_pyarrow(py)?; - let records = RecordBatch::from_pyarrow(py_input.as_ref(py))?; + let records = RecordBatch::from_pyarrow_bound(py_input.bind(py))?; let py_records = records.to_pyarrow(py)?; - RecordBatch::from_pyarrow(py_records.as_ref(py)) + RecordBatch::from_pyarrow_bound(py_records.bind(py)) }) .unwrap();