Move df_to_arrow to the io module (#3694)
Move `df_to_arrow` from `_arrow_types` to the `io.conversions` module.
jp-dark authored Feb 13, 2025
1 parent bd6b6cd commit cc33291
Showing 6 changed files with 139 additions and 142 deletions.
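
For downstream code that imported the helper from its old private location, the change is a one-line import update, as the call sites updated in this diff show:

    # Before this commit (private module):
    from tiledbsoma._arrow_types import df_to_arrow

    # After this commit:
    from tiledbsoma.io.conversions import df_to_arrow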
133 changes: 0 additions & 133 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -29,7 +29,6 @@

import numpy as np
import numpy.typing as npt
import pandas as pd
import pyarrow as pa

_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
@@ -194,138 +193,6 @@ def is_string_dtype(dtype: Any) -> bool:
return dtype.name in ["object", "string", "str32", "str64"]


def df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Handle special cases where pa.Table.from_pandas is not sufficient.
"""
nullable_fields = set()
    # Not `for name, col in df.items()` since we need df[key] on the left-hand side
for key in df:
# Make attributes nullable. Context:
        # * df_to_arrow is _solely_ for use by tiledbsoma.io
        #   o Anyone calling the SOMA API directly has a user-provided Arrow
        #     schema, which must be respected
# o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
# and update_obs/update_var -- does not provide an Arrow schema
# explicitly. We compute an Arrow schema for them here.
# * Even when the _initial_ data is all non-null down a particular
# string column, there are two ways a _subsequent_ write can provide
# nulls: append-mode ingest, or, update_obs/update_var wherein the new
# data has nulls even when the data used at schema-create time was
# non-null.
# * We have no way of knowing at initial ingest time whether or not
# users will later be appending, or updating, with null data.
# * Note that Arrow has a per-field nullable flag in its schema metadata
# -- and so do TileDB array schemas.
#
# Note in particular this is for the use of tiledbsoma.io:
#
# * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
# Arrow schema and we respect it as-is. They specify nullability, or
# not, as they wish.
# * In tiledbsoma.io, the user-provided inputs are AnnData objects.
# We compute the Arrow schema _for_ them. And we must accommodate
# reasonable/predictable needs.

nullable_fields.add(key)

        # Handle special cases for all-null columns where the dtype is "object"
        # or "category" and must be explicitly cast to the correct pandas
        # extension dtype.
#
# Note: with
# anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
# the dtype comes in to us via `tiledbsoma.io.from_anndata` not
# as `pd.StringDtype()` but rather as `object`.
#
# Note: we're working around a subtle issue involving Pandas, and Arrow's
# from_pandas, and categoricals.
#
# * If you do this:
# pd.Series(["a", "b", "c", "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string with no nulls -- as desired.
# * If you do this:
# pd.Series(["a", "b", None, "d"], dtype=pd.CategoricalDtype())
# or
# pd.Series(["a", "b", np.nan, "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string, with some nulls -- as desired
# * If you do this:
# pd.Series([None] * 4, dtype=pd.CategoricalDtype())
# or
# pd.Series([np.nan] * 4, dtype=pd.CategoricalDtype())
# then you get Pandas categorical of double -- with NaN values -- not as desired.
if df[key].isnull().all():
if df[key].dtype.name == "object":
df[key] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
elif df[key].dtype.name == "category":
# This is a trick to avoid getting float64 value type in the Pandas categorical.
# That's the good news. The bad news is that pa.Table.from_pandas() of this
# will result in Arrow value-type of pa.null(). Part two, to deal with
# this, is below.
df[key] = pd.Series(
["X"] * df.shape[0], dtype=pd.CategoricalDtype()
).cat.remove_categories(["X"])

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
    # df[key].values.ordered is of type np.bool_ rather than Python bool.
# So, we cast and reconstruct.
    for key in df:
        column = df[key]
        if isinstance(column.dtype, pd.CategoricalDtype):
            # Use getattr with defaults so that neither name is unbound when
            # the attribute is absent.
            categories = getattr(column.values, "categories", None)
            ordered = bool(getattr(column.values, "ordered", False))

            df[key] = pd.Categorical(
                values=column, categories=categories, ordered=ordered
            )

arrow_table = pa.Table.from_pandas(df)

md = arrow_table.schema.metadata
md.update(dict.fromkeys(nullable_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

# For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
# later on. This means that on fresh ingest we must use a larger bit-width than
# the bare minimum necessary.
new_map = {}
for field in arrow_table.schema:
if field.name == "__index_level_0__":
continue
elif pa.types.is_dictionary(field.type):
old_index_type = field.type.index_type
new_index_type = (
pa.int32()
if old_index_type in [pa.int8(), pa.int16()]
else old_index_type
)
            # This is part two of what we need to do to get null-filled Pandas
            # categorical-of-string conveyed to Arrow. An entirely null-filled
            # Pandas categorical-of-string series, after pa.Table.from_pandas(),
            # will have type pa.null().
old_value_type = field.type.value_type
new_value_type = (
pa.large_string() if old_value_type == pa.null() else old_value_type
)
new_map[field.name] = pa.dictionary(
new_index_type,
new_value_type,
field.type.ordered,
)
else:
new_map[field.name] = field.type
new_schema = pa.schema(new_map, metadata=arrow_table.schema.metadata)

arrow_table = pa.Table.from_pandas(df, schema=new_schema)

return arrow_table


def pyarrow_to_carrow_type(pa_type: pa.DataType) -> str:
try:
return _PYARROW_TO_CARROW[pa_type]
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/io/_registration/signatures.py
@@ -15,8 +15,8 @@

import tiledbsoma
import tiledbsoma.logging
from tiledbsoma._arrow_types import df_to_arrow
from tiledbsoma.io._util import read_h5ad # Allow us to read over S3 in backed mode
from tiledbsoma.io.conversions import df_to_arrow
from tiledbsoma.options import SOMATileDBContext

_EQUIVALENCES = {
132 changes: 132 additions & 0 deletions apis/python/src/tiledbsoma/io/conversions.py
@@ -88,3 +88,135 @@ def csr_from_coo_table(
tbl, (num_rows, num_cols), "csr", True, context
).to_scipy()
return s


def df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Handle special cases where pa.Table.from_pandas is not sufficient.
"""
nullable_fields = set()
    # Not `for name, col in df.items()` since we need df[key] on the left-hand side
for key in df:
# Make attributes nullable. Context:
        # * df_to_arrow is _solely_ for use by tiledbsoma.io
        #   o Anyone calling the SOMA API directly has a user-provided Arrow
        #     schema, which must be respected
# o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
# and update_obs/update_var -- does not provide an Arrow schema
# explicitly. We compute an Arrow schema for them here.
# * Even when the _initial_ data is all non-null down a particular
# string column, there are two ways a _subsequent_ write can provide
# nulls: append-mode ingest, or, update_obs/update_var wherein the new
# data has nulls even when the data used at schema-create time was
# non-null.
# * We have no way of knowing at initial ingest time whether or not
# users will later be appending, or updating, with null data.
# * Note that Arrow has a per-field nullable flag in its schema metadata
# -- and so do TileDB array schemas.
#
# Note in particular this is for the use of tiledbsoma.io:
#
# * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
# Arrow schema and we respect it as-is. They specify nullability, or
# not, as they wish.
# * In tiledbsoma.io, the user-provided inputs are AnnData objects.
# We compute the Arrow schema _for_ them. And we must accommodate
# reasonable/predictable needs.

nullable_fields.add(key)

        # Handle special cases for all-null columns where the dtype is "object"
        # or "category" and must be explicitly cast to the correct pandas
        # extension dtype.
#
# Note: with
# anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
# the dtype comes in to us via `tiledbsoma.io.from_anndata` not
# as `pd.StringDtype()` but rather as `object`.
#
# Note: we're working around a subtle issue involving Pandas, and Arrow's
# from_pandas, and categoricals.
#
# * If you do this:
# pd.Series(["a", "b", "c", "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string with no nulls -- as desired.
# * If you do this:
# pd.Series(["a", "b", None, "d"], dtype=pd.CategoricalDtype())
# or
# pd.Series(["a", "b", np.nan, "d"], dtype=pd.CategoricalDtype())
# then you get Pandas categorical of string, with some nulls -- as desired
# * If you do this:
# pd.Series([None] * 4, dtype=pd.CategoricalDtype())
# or
# pd.Series([np.nan] * 4, dtype=pd.CategoricalDtype())
# then you get Pandas categorical of double -- with NaN values -- not as desired.
if df[key].isnull().all():
if df[key].dtype.name == "object":
df[key] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
elif df[key].dtype.name == "category":
# This is a trick to avoid getting float64 value type in the Pandas categorical.
# That's the good news. The bad news is that pa.Table.from_pandas() of this
# will result in Arrow value-type of pa.null(). Part two, to deal with
# this, is below.
df[key] = pd.Series(
["X"] * df.shape[0], dtype=pd.CategoricalDtype()
).cat.remove_categories(["X"])

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
    # df[key].values.ordered is of type np.bool_ rather than Python bool.
# So, we cast and reconstruct.
    for key in df:
        column = df[key]
        if isinstance(column.dtype, pd.CategoricalDtype):
            # Use getattr with defaults so that neither name is unbound when
            # the attribute is absent.
            categories = getattr(column.values, "categories", None)
            ordered = bool(getattr(column.values, "ordered", False))

            df[key] = pd.Categorical(
                values=column, categories=categories, ordered=ordered
            )

arrow_table = pa.Table.from_pandas(df)

md = arrow_table.schema.metadata
md.update(dict.fromkeys(nullable_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

# For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
# later on. This means that on fresh ingest we must use a larger bit-width than
# the bare minimum necessary.
new_map = {}
for field in arrow_table.schema:
if field.name == "__index_level_0__":
continue
elif pa.types.is_dictionary(field.type):
old_index_type = field.type.index_type
new_index_type = (
pa.int32()
if old_index_type in [pa.int8(), pa.int16()]
else old_index_type
)
            # This is part two of what we need to do to get null-filled Pandas
            # categorical-of-string conveyed to Arrow. An entirely null-filled
            # Pandas categorical-of-string series, after pa.Table.from_pandas(),
            # will have type pa.null().
old_value_type = field.type.value_type
new_value_type = (
pa.large_string() if old_value_type == pa.null() else old_value_type
)
new_map[field.name] = pa.dictionary(
new_index_type,
new_value_type,
field.type.ordered,
)
else:
new_map[field.name] = field.type
new_schema = pa.schema(new_map, metadata=arrow_table.schema.metadata)

arrow_table = pa.Table.from_pandas(df, schema=new_schema)

return arrow_table
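
To see the special cases this function exists for, here is a minimal usage sketch (column names and data are illustrative, not taken from this diff): an all-null object column is conveyed as a nullable Arrow string column, and an all-null categorical avoids both the float64-categories pitfall and the pa.null() value-type pitfall described in the comments above.

    import numpy as np
    import pandas as pd

    from tiledbsoma.io.conversions import df_to_arrow

    df = pd.DataFrame(
        {
            "soma_joinid": np.arange(3, dtype=np.int64),
            "all_null_str": pd.Series([None] * 3, dtype="object"),
            "all_null_cat": pd.Series([None] * 3, dtype=pd.CategoricalDtype()),
        }
    )

    tbl = df_to_arrow(df)
    # Expected: every field is marked nullable in the schema metadata, and the
    # categorical column is conveyed as dictionary<int32, large_string> rather
    # than as a dictionary with a pa.null() value type.
    print(tbl.schema)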
5 changes: 2 additions & 3 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -62,7 +62,6 @@
logging,
)
from .. import pytiledbsoma as clib
from .._arrow_types import df_to_arrow
from .._collection import AnyTileDBCollection, CollectionBase
from .._common_nd_array import NDArray
from .._constants import SOMA_JOINID
@@ -1322,7 +1321,7 @@ def _write_dataframe_impl(
s = _util.get_start_stamp()
logging.log_io(None, f"START WRITING {df_uri}")

arrow_table = df_to_arrow(df)
arrow_table = conversions.df_to_arrow(df)

# Don't many-layer the almost-always-repeated var dataframe.
# And for obs, if there are duplicate values for whatever reason, don't write them
@@ -1726,7 +1725,7 @@ def _update_dataframe(
msg = ", ".join(msgs)
raise ValueError(f"unsupported type updates: {msg}")

arrow_table = df_to_arrow(new_data)
arrow_table = conversions.df_to_arrow(new_data)
arrow_schema = arrow_table.schema.remove_metadata()

add_attrs = dict()
5 changes: 2 additions & 3 deletions apis/python/src/tiledbsoma/io/spatial/ingest.py
@@ -44,7 +44,6 @@
_util,
logging,
)
from ..._arrow_types import df_to_arrow
from ..._constants import SPATIAL_DISCLAIMER
from ..._exception import (
AlreadyExistsError,
@@ -57,7 +56,7 @@
TileDBCreateOptions,
TileDBWriteOptions,
)
from .. import from_anndata
from .. import conversions, from_anndata
from ..ingest import (
IngestCtx,
IngestionParams,
@@ -718,7 +717,7 @@ def _write_visium_spots(
(0, max_joinid_len - 1),
)

arrow_table = df_to_arrow(df)
arrow_table = conversions.df_to_arrow(df)

with warnings.catch_warnings():
warnings.simplefilter("ignore")
@@ -3,7 +3,7 @@
import pyarrow as pa
import pytest

import tiledbsoma
from tiledbsoma.io.conversions import df_to_arrow


@pytest.mark.parametrize(
@@ -27,5 +27,5 @@
],
)
def test_df_to_arrow(input_df: pd.DataFrame, expected: pa.Table):
actual = tiledbsoma._arrow_types.df_to_arrow(input_df)
actual = df_to_arrow(input_df)
assert actual == expected
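
The parametrize cases are collapsed in this view. For reference, one case for this test might look like the following sketch (values hypothetical, not taken from the diff):

    pytest.param(
        pd.DataFrame({"a": pd.Series([None, None], dtype="object")}),
        pa.table({"a": pa.array([None, None], type=pa.string())}),
        id="all-null-object-column",
    ),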
