Skip to content

Commit

Permalink
feat(python): Expose Arrow C interface directly on Polars
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 18, 2024
1 parent a970d16 commit a1e9d00
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 2 deletions.
13 changes: 13 additions & 0 deletions docs/src/python/user-guide/misc/arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# --8<-- [start:to_arrow]
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["ham", "spam", "jam"]})

arrow_table = df.to_arrow()
print(arrow_table)
# --8<-- [end:to_arrow]

# --8<-- [start:to_arrow_zero]
arrow_table_zero_copy = df.to_arrow(compat_level=pl.CompatLevel.newest())
print(arrow_table_zero_copy)
# --8<-- [end:to_arrow_zero]
40 changes: 40 additions & 0 deletions docs/user-guide/misc/arrow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Arrow producer/consumer

## Using pyarrow

Polars can move data in and out of arrow zero copy. This can be done either via pyarrow
or natively. Let's first start by showing the pyarrow solution:

{{code_block('user-guide/misc/arrow','to_arrow',[])}}

```
pyarrow.Table
foo: int64
bar: large_string
----
foo: [[1,2,3]]
bar: [["ham","spam","jam"]]
```

Or if you want to ensure the output is zero-copy:

{{code_block('user-guide/misc/arrow','to_arrow_zero',[])}}

```
pyarrow.Table
foo: int64
bar: string_view
----
foo: [[1,2,3]]
bar: [["ham","spam","jam"]]
```

Importing from pyarrow can be achieved with `pl.from_arrow`.

## Using Polars directly

Polars can also consume and export to and import from the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
directly. This is recommended for library maintainers that want to interop with Polars without requiring a pyarrow installation.

- To export `ArrowArray` C structs, Polars exposes: `Series._export_arrow_to_c`.
- To import an `ArrowArray` C struct, Polars exposes `Series._import_arrow_from_c`.
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ nav:
- user-guide/misc/visualization.md
- user-guide/misc/styling.md
- user-guide/misc/comparison.md
- user-guide/misc/arrow.md

- API reference: api/index.md

Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@
when,
zeros,
)
from polars.interchange import CompatLevel
from polars.io import (
read_avro,
read_clipboard,
Expand Down Expand Up @@ -375,6 +376,7 @@
"SQLContext",
"sql",
"sql_expr",
"CompatLevel",
]


Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/interchange/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
Details on the protocol:
https://data-apis.org/dataframe-protocol/latest/index.html
"""

from polars.interchange.protocol import CompatLevel

__all__ = ["CompatLevel"]
45 changes: 44 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,16 +355,59 @@ def _from_pyseries(cls, pyseries: PySeries) -> Self:
return series

@classmethod
@deprecate_function("use _import_arrow_from_c", version="1.3")
def _import_from_c(cls, name: str, pointers: list[tuple[int, int]]) -> Self:
return cls._from_pyseries(PySeries._import_arrow_from_c(name, pointers))

@classmethod
def _import_arrow_from_c(cls, name: str, pointers: list[tuple[int, int]]) -> Self:
"""
Construct a Series from Arrows C interface.
Parameters
----------
name
The name that should be given to the `Series`.
pointers
A list with tuples containing two entries:
- The raw pointer to a C ArrowArray struct
- The raw pointer to a C ArrowSchema struct
Warning
-------
This will read the `array` pointer without moving it. The host process should
garbage collect the heap pointer, but not its contents.
"""
return cls._from_pyseries(PySeries._import_from_c(name, pointers))
return cls._from_pyseries(PySeries._import_arrow_from_c(name, pointers))

def _export_arrow_to_c(self, out_ptr: int, out_schema_ptr: int) -> None:
"""
Export to a C ArrowArray and C ArrowSchema struct, given their pointers.
Parameters
----------
out_ptr: int
The raw pointer to a C ArrowArray struct.
out_schema_ptr: int (optional)
The raw pointer to a C ArrowSchema struct.
Notes
-----
The series should only contain a single chunk. If you want to export all chunks,
first call `Series.get_chunks` to give you a list of chunks.
Warning
-------
Safety
This function will write to the pointers given in `out_ptr` and `out_schema_ptr`
and thus is highly unsafe.
Leaking
If you don't pass the ArrowArray struct to a consumer,
array memory will leak. This is a low-level function intended for
expert users.
"""
self._s._export_arrow_to_c(out_ptr, out_schema_ptr)

def _get_buffer_info(self) -> BufferInfo:
"""
Expand Down
30 changes: 29 additions & 1 deletion py-polars/src/series/c_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use super::*;
#[pymethods]
impl PySeries {
#[staticmethod]
unsafe fn _import_from_c(
unsafe fn _import_arrow_from_c(
name: &str,
chunks: Vec<(Py_uintptr_t, Py_uintptr_t)>,
) -> PyResult<Self> {
Expand All @@ -29,4 +29,32 @@ impl PySeries {
let s = Series::try_from((name, chunks)).map_err(PyPolarsErr::from)?;
Ok(s.into())
}

unsafe fn _export_arrow_to_c(
&self,
out_ptr: Py_uintptr_t,
out_schema_ptr: Py_uintptr_t,
) -> PyResult<()> {
export_chunk(&self.series, out_ptr, out_schema_ptr).map_err(PyPolarsErr::from)?;
Ok(())
}
}

unsafe fn export_chunk(
s: &Series,
out_ptr: Py_uintptr_t,
out_schema_ptr: Py_uintptr_t,
) -> PolarsResult<()> {
polars_ensure!(s.chunks().len() == 1, InvalidOperation: "expect a single chunk");

let c_array = arrow::ffi::export_array_to_c(s.chunks()[0].clone());
let out_ptr = out_ptr as *mut arrow::ffi::ArrowArray;
*out_ptr = c_array;

let field = ArrowField::new(s.name(), s.dtype().to_arrow(CompatLevel::newest()), true);
let c_schema = arrow::ffi::export_field_to_c(&field);

let out_schema_ptr = out_schema_ptr as *mut arrow::ffi::ArrowSchema;
*out_schema_ptr = c_schema;
Ok(())
}

0 comments on commit a1e9d00

Please sign in to comment.