Move df_to_arrow to the io module (#3694)
Move `df_to_arrow` from `_arrow_types` to the `io.conversions` module.
jp-dark authored Feb 13, 2025
1 parent bd6b6cd commit cc33291
Showing 6 changed files with 139 additions and 142 deletions.
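
For downstream code that imported the helper from its old private location, the change is a one-line import update, as the call sites updated in this diff show:

    # Before this commit (private module):
    from tiledbsoma._arrow_types import df_to_arrow

    # After this commit:
    from tiledbsoma.io.conversions import df_to_arrow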
133 changes: 0 additions & 133 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -29,7 +29,6 @@

import numpy as np
import numpy.typing as npt
import pandas as pd
import pyarrow as pa

_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
@@ -194,138 +193,6 @@ def is_string_dtype(dtype: Any) -> bool:
return dtype.name in ["object", "string", "str32", "str64"]


def df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Handle special cases where pa.Table.from_pandas is not sufficient.
"""
nullable_fields = set()
    # Not `for name, col in df.items()` since we need df[key] on the left-hand side
for key in df:
# Make attributes nullable. Context:
        # * df_to_arrow is _solely_ for use by tiledbsoma.io
        #   o Anyone calling the SOMA API directly has a user-provided Arrow
        #     schema, which must be respected
# o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
# and update_obs/update_var -- does not provide an Arrow schema
# explicitly. We compute an Arrow schema for them here.
# * Even when the _initial_ data is all non-null down a particular
# string column, there are two ways a _subsequent_ write can provide
# nulls: append-mode ingest, or, update_obs/update_var wherein the new
# data has nulls even when the data used at schema-create time was
# non-null.
# * We have no way of knowing at initial ingest time whether or not
# users will later be appending, or updating, with null data.
# * Note that Arrow has a per-field nullable flag in its schema metadata
# -- and so do TileDB array schemas.
#
# Note in particular this is for the use of tiledbsoma.io:
#
# * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
# Arrow schema and we respect it as-is. They specify nullability, or
# not, as they wish.
# * In tiledbsoma.io, the user-provided inputs are AnnData objects.
# We compute the Arrow schema _for_ them. And we must accommodate
# reasonable/predictable needs.

nullable_fields.add(key)

        # Handle special cases for all-null columns where the dtype is "object"
        # or "category" and must be explicitly cast to the correct pandas
        # extension dtype.
#
# Note: with
# anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
# the dtype comes in to us via `tiledbsoma.io.from_anndata` not
# as `pd.StringDtype()` but rather as `object`.
#
# Note: we're working around a subtle issue involving Pandas, and Arrow's
# from_pandas, and categoricals.
#
# * If you do this:
# pd.Series(["a", "b", "c", "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string with no nulls -- as desired.
# * If you do this:
# pd.Series(["a", "b", None, "d"], dtype=pd.CategoricalDtype())
# or
# pd.Series(["a", "b", np.nan, "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string, with some nulls -- as desired
# * If you do this:
# pd.Series([None] * 4, dtype=pd.CategoricalDtype())
# or
# pd.Series([np.nan] * 4, dtype=pd.CategoricalDtype())
# then you get Pandas categorical of double -- with NaN values -- not as desired.
if df[key].isnull().all():
if df[key].dtype.name == "object":
df[key] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
elif df[key].dtype.name == "category":
# This is a trick to avoid getting float64 value type in the Pandas categorical.
# That's the good news. The bad news is that pa.Table.from_pandas() of this
# will result in Arrow value-type of pa.null(). Part two, to deal with
# this, is below.
df[key] = pd.Series(
["X"] * df.shape[0], dtype=pd.CategoricalDtype()
).cat.remove_categories(["X"])

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
    # df[key].values.ordered is of type np.bool_ rather than Python bool.
# So, we cast and reconstruct.
    for key in df:
        column = df[key]
        if isinstance(column.dtype, pd.CategoricalDtype):
            # Use getattr with defaults so that neither name is unbound when
            # the attribute is absent.
            categories = getattr(column.values, "categories", None)
            ordered = bool(getattr(column.values, "ordered", False))

            df[key] = pd.Categorical(
                values=column, categories=categories, ordered=ordered
            )

arrow_table = pa.Table.from_pandas(df)

md = arrow_table.schema.metadata
md.update(dict.fromkeys(nullable_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

# For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
# later on. This means that on fresh ingest we must use a larger bit-width than
# the bare minimum necessary.
new_map = {}
for field in arrow_table.schema:
if field.name == "__index_level_0__":
continue
elif pa.types.is_dictionary(field.type):
old_index_type = field.type.index_type
new_index_type = (
pa.int32()
if old_index_type in [pa.int8(), pa.int16()]
else old_index_type
)
            # This is part two of what we need to do to get null-filled Pandas
            # categorical-of-string conveyed to Arrow. An entirely null-filled
            # Pandas categorical-of-string series, after pa.Table.from_pandas(),
            # will have type pa.null().
old_value_type = field.type.value_type
new_value_type = (
pa.large_string() if old_value_type == pa.null() else old_value_type
)
new_map[field.name] = pa.dictionary(
new_index_type,
new_value_type,
field.type.ordered,
)
else:
new_map[field.name] = field.type
new_schema = pa.schema(new_map, metadata=arrow_table.schema.metadata)

arrow_table = pa.Table.from_pandas(df, schema=new_schema)

return arrow_table


def pyarrow_to_carrow_type(pa_type: pa.DataType) -> str:
try:
return _PYARROW_TO_CARROW[pa_type]
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/io/_registration/signatures.py
@@ -15,8 +15,8 @@

import tiledbsoma
import tiledbsoma.logging
from tiledbsoma._arrow_types import df_to_arrow
from tiledbsoma.io._util import read_h5ad # Allow us to read over S3 in backed mode
from tiledbsoma.io.conversions import df_to_arrow
from tiledbsoma.options import SOMATileDBContext

_EQUIVALENCES = {
132 changes: 132 additions & 0 deletions apis/python/src/tiledbsoma/io/conversions.py
@@ -88,3 +88,135 @@ def csr_from_coo_table(
tbl, (num_rows, num_cols), "csr", True, context
).to_scipy()
return s


def df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Handle special cases where pa.Table.from_pandas is not sufficient.
"""
nullable_fields = set()
    # Not `for name, col in df.items()` since we need df[key] on the left-hand side
for key in df:
# Make attributes nullable. Context:
        # * df_to_arrow is _solely_ for use by tiledbsoma.io
        #   o Anyone calling the SOMA API directly has a user-provided Arrow
        #     schema, which must be respected
# o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
# and update_obs/update_var -- does not provide an Arrow schema
# explicitly. We compute an Arrow schema for them here.
# * Even when the _initial_ data is all non-null down a particular
# string column, there are two ways a _subsequent_ write can provide
# nulls: append-mode ingest, or, update_obs/update_var wherein the new
# data has nulls even when the data used at schema-create time was
# non-null.
# * We have no way of knowing at initial ingest time whether or not
# users will later be appending, or updating, with null data.
# * Note that Arrow has a per-field nullable flag in its schema metadata
# -- and so do TileDB array schemas.
#
# Note in particular this is for the use of tiledbsoma.io:
#
# * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
# Arrow schema and we respect it as-is. They specify nullability, or
# not, as they wish.
# * In tiledbsoma.io, the user-provided inputs are AnnData objects.
# We compute the Arrow schema _for_ them. And we must accommodate
# reasonable/predictable needs.

nullable_fields.add(key)

        # Handle special cases for all-null columns where the dtype is "object"
        # or "category" and must be explicitly cast to the correct pandas
        # extension dtype.
#
# Note: with
# anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
# the dtype comes in to us via `tiledbsoma.io.from_anndata` not
# as `pd.StringDtype()` but rather as `object`.
#
# Note: we're working around a subtle issue involving Pandas, and Arrow's
# from_pandas, and categoricals.
#
# * If you do this:
# pd.Series(["a", "b", "c", "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string with no nulls -- as desired.
# * If you do this:
# pd.Series(["a", "b", None, "d"], dtype=pd.CategoricalDtype())
# or
# pd.Series(["a", "b", np.nan, "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string, with some nulls -- as desired
# * If you do this:
# pd.Series([None] * 4, dtype=pd.CategoricalDtype())
# or
# pd.Series([np.nan] * 4, dtype=pd.CategoricalDtype())
# then you get Pandas categorical of double -- with NaN values -- not as desired.
if df[key].isnull().all():
if df[key].dtype.name == "object":
df[key] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
elif df[key].dtype.name == "category":
# This is a trick to avoid getting float64 value type in the Pandas categorical.
# That's the good news. The bad news is that pa.Table.from_pandas() of this
# will result in Arrow value-type of pa.null(). Part two, to deal with
# this, is below.
df[key] = pd.Series(
["X"] * df.shape[0], dtype=pd.CategoricalDtype()
).cat.remove_categories(["X"])

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
    # df[key].values.ordered is of type np.bool_ rather than Python bool.
# So, we cast and reconstruct.
    for key in df:
        column = df[key]
        if isinstance(column.dtype, pd.CategoricalDtype):
            # Use getattr with defaults so that neither name is unbound when
            # the attribute is absent.
            categories = getattr(column.values, "categories", None)
            ordered = bool(getattr(column.values, "ordered", False))

            df[key] = pd.Categorical(
                values=column, categories=categories, ordered=ordered
            )

arrow_table = pa.Table.from_pandas(df)

md = arrow_table.schema.metadata
md.update(dict.fromkeys(nullable_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

# For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
# later on. This means that on fresh ingest we must use a larger bit-width than
# the bare minimum necessary.
new_map = {}
for field in arrow_table.schema:
if field.name == "__index_level_0__":
continue
elif pa.types.is_dictionary(field.type):
old_index_type = field.type.index_type
new_index_type = (
pa.int32()
if old_index_type in [pa.int8(), pa.int16()]
else old_index_type
)
            # This is part two of what we need to do to get null-filled Pandas
            # categorical-of-string conveyed to Arrow. An entirely null-filled
            # Pandas categorical-of-string series, after pa.Table.from_pandas(),
            # will have type pa.null().
old_value_type = field.type.value_type
new_value_type = (
pa.large_string() if old_value_type == pa.null() else old_value_type
)
new_map[field.name] = pa.dictionary(
new_index_type,
new_value_type,
field.type.ordered,
)
else:
new_map[field.name] = field.type
new_schema = pa.schema(new_map, metadata=arrow_table.schema.metadata)

arrow_table = pa.Table.from_pandas(df, schema=new_schema)

return arrow_table
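
To see the special cases this function exists for, here is a minimal usage sketch (column names and data are illustrative, not taken from this diff): an all-null object column is conveyed as a nullable Arrow string column, and an all-null categorical avoids both the float64-categories pitfall and the pa.null() value-type pitfall described in the comments above.

    import numpy as np
    import pandas as pd

    from tiledbsoma.io.conversions import df_to_arrow

    df = pd.DataFrame(
        {
            "soma_joinid": np.arange(3, dtype=np.int64),
            "all_null_str": pd.Series([None] * 3, dtype="object"),
            "all_null_cat": pd.Series([None] * 3, dtype=pd.CategoricalDtype()),
        }
    )

    tbl = df_to_arrow(df)
    # Expected: every field is marked nullable in the schema metadata, and the
    # categorical column is conveyed as dictionary<int32, large_string> rather
    # than as a dictionary with a pa.null() value type.
    print(tbl.schema)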
5 changes: 2 additions & 3 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -62,7 +62,6 @@
logging,
)
from .. import pytiledbsoma as clib
from .._arrow_types import df_to_arrow
from .._collection import AnyTileDBCollection, CollectionBase
from .._common_nd_array import NDArray
from .._constants import SOMA_JOINID
@@ -1322,7 +1321,7 @@ def _write_dataframe_impl(
s = _util.get_start_stamp()
logging.log_io(None, f"START WRITING {df_uri}")

arrow_table = df_to_arrow(df)
arrow_table = conversions.df_to_arrow(df)

# Don't many-layer the almost-always-repeated var dataframe.
# And for obs, if there are duplicate values for whatever reason, don't write them
@@ -1726,7 +1725,7 @@ def _update_dataframe(
msg = ", ".join(msgs)
raise ValueError(f"unsupported type updates: {msg}")

arrow_table = df_to_arrow(new_data)
arrow_table = conversions.df_to_arrow(new_data)
arrow_schema = arrow_table.schema.remove_metadata()

add_attrs = dict()
5 changes: 2 additions & 3 deletions apis/python/src/tiledbsoma/io/spatial/ingest.py
@@ -44,7 +44,6 @@
_util,
logging,
)
from ..._arrow_types import df_to_arrow
from ..._constants import SPATIAL_DISCLAIMER
from ..._exception import (
AlreadyExistsError,
@@ -57,7 +56,7 @@
TileDBCreateOptions,
TileDBWriteOptions,
)
from .. import from_anndata
from .. import conversions, from_anndata
from ..ingest import (
IngestCtx,
IngestionParams,
@@ -718,7 +717,7 @@ def _write_visium_spots(
(0, max_joinid_len - 1),
)

arrow_table = df_to_arrow(df)
arrow_table = conversions.df_to_arrow(df)

with warnings.catch_warnings():
warnings.simplefilter("ignore")
@@ -3,7 +3,7 @@
import pyarrow as pa
import pytest

import tiledbsoma
from tiledbsoma.io.conversions import df_to_arrow


@pytest.mark.parametrize(
@@ -27,5 +27,5 @@
],
)
def test_df_to_arrow(input_df: pd.DataFrame, expected: pa.Table):
actual = tiledbsoma._arrow_types.df_to_arrow(input_df)
actual = df_to_arrow(input_df)
assert actual == expected
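
The parametrize cases are collapsed in this view. For reference, one case for this test might look like the following sketch (values hypothetical, not taken from the diff):

    pytest.param(
        pd.DataFrame({"a": pd.Series([None, None], dtype="object")}),
        pa.table({"a": pa.array([None, None], type=pa.string())}),
        id="all-null-object-column",
    ),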
