diff --git a/pyproject.toml b/pyproject.toml index 40d7329..d2afc81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,10 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.9" dependencies = [ + "numpy", + # We use internal pd._libs.missing and experimental ArrowExtensionArray + "pandas>=2.2,<2.3", + "pyarrow>=15", ] [project.urls] diff --git a/src/nested_pandas/__init__.py b/src/nested_pandas/__init__.py index b564b85..4577a8f 100644 --- a/src/nested_pandas/__init__.py +++ b/src/nested_pandas/__init__.py @@ -1,3 +1,7 @@ from .example_module import greetings, meaning -__all__ = ["greetings", "meaning"] +# Import for registering +from .series.accessor import NestSeriesAccessor # noqa: F401 +from .series.dtype import NestedDtype + +__all__ = ["greetings", "meaning", "NestedDtype"] diff --git a/src/nested_pandas/series/__init__.py b/src/nested_pandas/series/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py new file mode 100644 index 0000000..49876fb --- /dev/null +++ b/src/nested_pandas/series/accessor.py @@ -0,0 +1,240 @@ +# Python 3.9 doesn't support "|" for types +from __future__ import annotations + +from collections.abc import Generator, MutableMapping +from typing import cast + +import numpy as np +import pandas as pd +import pyarrow as pa +from numpy.typing import ArrayLike +from pandas.api.extensions import register_series_accessor + +from nested_pandas.series.dtype import NestedDtype +from nested_pandas.series.packer import pack_sorted_df_into_struct + +__all__ = ["NestSeriesAccessor"] + + +@register_series_accessor("nest") +class NestSeriesAccessor(MutableMapping): + """Accessor for operations on Series of NestedDtype + + This accessor implements `MutableMapping` interface over the fields of the + struct, so you can access, change and delete the fields as if it was a + dictionary, with `[]`, `[] =` and `del` operators. 
+ """ + + def __init__(self, series): + self._check_series(series) + + self._series = series + + @staticmethod + def _check_series(series): + dtype = series.dtype + if not isinstance(dtype, NestedDtype): + raise AttributeError(f"Can only use .nest accessor with a Series of NestedDtype, got {dtype}") + + def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame: + """Convert nested series into dataframe of list-array columns + + Parameters + ---------- + fields : list[str] or None, optional + Names of the fields to include. Default is None, which means all fields. + + Returns + ------- + pd.DataFrame + Dataframe of list-arrays. + """ + df = self._series.struct.explode() + if fields is None: + return df + return df[fields] + + def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame: + """Convert nested series into dataframe of flat arrays + + Parameters + ---------- + fields : list[str] or None, optional + Names of the fields to include. Default is None, which means all fields. + + Returns + ------- + pd.DataFrame + Dataframe of flat arrays. 
+ """ + # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly + fields = fields if fields is not None else list(self._series.array.field_names) + if len(fields) == 0: + raise ValueError("Cannot flatten a struct with no fields") + + flat_series = {} + index = None + for field in fields: + list_array = cast(pa.ListArray, pa.array(self._series.struct.field(field))) + if index is None: + index = np.repeat(self._series.index.values, np.diff(list_array.offsets)) + flat_series[field] = pd.Series( + list_array.flatten(), + index=index, + name=field, + copy=False, + ) + return pd.DataFrame(flat_series) + + @property + def flat_length(self) -> int: + """Length of the flat arrays""" + return self._series.array.flat_length + + @property + def fields(self) -> list[str]: + """Names of the nested columns""" + # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly + return self._series.array.field_names + + def set_flat_field(self, field: str, value: ArrayLike) -> None: + """Set the field from flat-array of values, in-place + + Parameters + ---------- + field : str + Name of the field to set. If not present, it will be added. + value : ArrayLike + Array of values to set. It must be a scalar or have the same length + as the flat arrays, e.g. `self.flat_length`. + """ + self._series.array.set_flat_field(field, value) + + def set_list_field(self, field: str, value: ArrayLike) -> None: + """Set the field from list-array, in-place + + Parameters + ---------- + field : str + Name of the field to set. If not present, it will be added. + value : ArrayLike + Array of values to set. It must be a list-array of the same length + as the series, e.g. length of the series. + """ + self._series.array.set_list_field(field, value) + + # I intentionally don't call it `drop` or `drop_field` because `pd.DataFrame.drop` is not inplace + # by default, and I wouldn't like to surprise the user. 
+ def pop_field(self, field: str) -> pd.Series: + """Delete the field from the struct and return it. + + Parameters + ---------- + field : str + Name of the field to delete. + + Returns + ------- + pd.Series + The deleted field. + """ + series = self[field] + self._series.array.pop_field(field) + return series + + def query_flat(self, query: str) -> pd.Series: + """Query the flat arrays with a boolean expression + + Currently, it will remove empty rows from the output series. + # TODO: preserve the index keeping empty rows + + Parameters + ---------- + query : str + Boolean expression to filter the rows. + + Returns + ------- + pd.Series + The filtered series. + """ + flat = self.to_flat().query(query) + if len(flat) == 0: + return pd.Series([], dtype=self._series.dtype) + return pack_sorted_df_into_struct(flat) + + def get_list_series(self, field: str) -> pd.Series: + """Get the list-array field as a Series + + Parameters + ---------- + field : str + Name of the field to get. + + Returns + ------- + pd.Series + The list-array field. 
+ """ + return self._series.struct.field(field) + + def __getitem__(self, key: str | list[str]) -> pd.Series: + if isinstance(key, list): + new_array = self._series.array.view_fields(key) + return pd.Series(new_array, index=self._series.index, name=self._series.name) + + series = self._series.struct.field(key).list.flatten() + series.index = np.repeat(self._series.index.values, np.diff(self._series.array.list_offsets)) + series.name = key + return series + + def __setitem__(self, key: str, value: ArrayLike) -> None: + # TODO: we can be much-much smarter about the performance here + # TODO: think better about underlying pa.ChunkArray in both self._series.array and value + + # Everything is empty, do nothing + if len(self._series) == 0 and np.ndim(value) != 0: + array = pa.array(value) + if len(array) == 0: + return + + if len(self._series) == self.flat_length: + raise ValueError( + f"Cannot use `.nest[{key}] = value` when the series has the same count of 'list' rows as" + "'flat' rows, because it is ambiguous whether the input is a 'flat' or a 'list' array. Use" + "`.nest.set_flat_field()` or `.nest.set_list_field()` instead." + ) + + # Set single value for all rows + if np.ndim(value) == 0: + self.set_flat_field(key, value) + return + + pa_array = pa.array(value) + + # Input is a flat array of values + if len(pa_array) == self.flat_length: + self.set_flat_field(key, pa_array) + return + + # Input is a list-array of values + if len(pa_array) == len(self._series): + self.set_list_field(key, pa_array) + return + + raise ValueError( + f"Cannot set field {key} with value of length {len(pa_array)}, the value is expected to be " + f"either a scalar, a 'flat' array of length {self.flat_length}, or a 'list' array of length " + f"{len(self._series)}." 
+ ) + + def __delitem__(self, key: str) -> None: + self.pop_field(key) + + def __iter__(self) -> Generator[str, None, None]: + # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly + yield from iter(self._series.array.field_names) + + def __len__(self) -> int: + # For some reason, .struct.dtypes is cached, so we will use NestedExtensionArray directly + return len(self._series.array.field_names) diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py new file mode 100644 index 0000000..7dd8c95 --- /dev/null +++ b/src/nested_pandas/series/dtype.py @@ -0,0 +1,174 @@ +# Use Self, which is not available until Python 3.11 +from __future__ import annotations + +from collections.abc import Mapping +from typing import cast + +import pandas as pd +import pyarrow as pa +from pandas import ArrowDtype +from pandas.api.extensions import register_extension_dtype +from pandas.core.arrays import ArrowExtensionArray + +from nested_pandas.series.utils import is_pa_type_a_list + +__all__ = ["NestedDtype"] + + +@register_extension_dtype +class NestedDtype(ArrowDtype): + """Data type to handle packed time series data""" + + pyarrow_dtype: pa.StructType + + def __init__(self, pyarrow_dtype: pa.DataType) -> None: + pyarrow_dtype = self._validate_dtype(pyarrow_dtype) + super().__init__(pyarrow_dtype=pyarrow_dtype) + + @classmethod + def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821 + """Make NestedDtype from a mapping of field names and list item types. + + Parameters + ---------- + fields : Mapping[str, pa.DataType] + A mapping of field names and their item types. Since all fields are lists, the item types are + inner types of the lists, not the list types themselves. + + Returns + ------- + NestedDtype + The constructed NestedDtype. 
+ + Examples + -------- + >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()}) + >>> dtype + nested + >>> assert ( + ... dtype.pyarrow_dtype + ... == pa.struct({"a": pa.list_(pa.float64()), "b": pa.list_(pa.int64())}) + ... ) + """ + pyarrow_dtype = pa.struct({field: pa.list_(pa_type) for field, pa_type in fields.items()}) + pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) + return cls(pyarrow_dtype=pyarrow_dtype) + + @staticmethod + def _validate_dtype(pyarrow_dtype: pa.DataType) -> pa.StructType: + if not isinstance(pyarrow_dtype, pa.DataType): + raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}") + if not pa.types.is_struct(pyarrow_dtype): + raise ValueError("NestedDtype can only be constructed with pyarrow struct type.") + pyarrow_dtype = cast(pa.StructType, pyarrow_dtype) + + for field in pyarrow_dtype: + if not is_pa_type_a_list(field.type): + raise ValueError( + "NestedDtype can only be constructed with pyarrow struct type, all fields must be list " + f"type. Given struct has unsupported field {field}" + ) + return pyarrow_dtype + + @classmethod + def construct_from_string(cls, string: str) -> Self: # type: ignore[name-defined] # noqa: F821 + """Construct NestedDtype from a string representation. + + This works only for simple types, i.e. non-parametric pyarrow types. + + Parameters + ---------- + string : str + The string representation of the nested type. 
For example, + 'nested"): + raise ValueError("Not a valid nested type string, expected 'nested<...>'") + fields_str = string.removeprefix("nested<").removesuffix(">") + + field_strings = fields_str.split(", ") + if len(field_strings) == 0: + raise ValueError( + "Not a valid nested type string, expected at least a single field inside " + "'nested'" + ) + + fields = {} + for field_string in field_strings: + try: + field_name, field_type = field_string.split(": ", maxsplit=1) + except ValueError as e: + raise ValueError( + "Not a valid nested type string, expected 'nested', got invalid field " + f"string '{field_string}'" + ) from e + if not field_type.startswith("[") or not field_type.endswith("]"): + raise ValueError( + "Not a valid nested type string, expected 'nested', got invalid field " + f"type string '{field_type}'" + ) + + value_type = field_type.removeprefix("[").removesuffix("]") + # We follow ArrowDtype implementation heere and do not try to parse complex types + try: + pa_value_type = pa.type_for_alias(value_type) + except ValueError as e: + raise ValueError( + f"Parsing pyarrow specific parameters in the string is not supported yet: {value_type}. " + "Please use NestedDtype() or NestedDtype.from_fields() instead." + ) from e + + fields[field_name] = pa_value_type + + return cls.from_fields(fields) + + @classmethod + def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype): + """Construct NestedDtype from a pandas.ArrowDtype. + + Parameters + ---------- + pandas_arrow_dtype : ArrowDtype + The pandas.ArrowDtype to construct NestedDtype from. + + Returns + ------- + NestedDtype + The constructed NestedDtype. + + Raises + ------ + ValueError + If the given dtype is not a valid nested type. 
+ """ + pyarrow_dtype = cls._validate_dtype(pandas_arrow_dtype.pyarrow_dtype) + return cls(pyarrow_dtype=pyarrow_dtype) + + @classmethod + def construct_array_type(cls) -> type[ArrowExtensionArray]: + """Corresponding array type, always NestedExtensionArray""" + from nested_pandas.series.ext_array import NestedExtensionArray + + return NestedExtensionArray + + @property + def name(self) -> str: + """The string representation of the nested type""" + fields = ", ".join([f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype]) + return f"nested<{fields}>" + + type = pd.DataFrame + """The type of the array's elements, always pd.DataFrame""" diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py new file mode 100644 index 0000000..cfba3e7 --- /dev/null +++ b/src/nested_pandas/series/ext_array.py @@ -0,0 +1,302 @@ +# typing.Self and "|" union syntax don't exist in Python 3.9 +from __future__ import annotations + +from collections.abc import Collection, Iterable, Iterator, Sequence +from typing import Any, cast + +import numpy as np +import pandas as pd +import pyarrow as pa +from numpy.typing import ArrayLike + +# Needed by ArrowExtensionArray.to_numpy(na_value=no_default) +from pandas._libs.lib import no_default + +# It is considered experimental, so we need to be careful with it. +from pandas.core.arrays import ArrowExtensionArray + +from nested_pandas.series.dtype import NestedDtype +from nested_pandas.series.utils import enumerate_chunks, is_pa_type_a_list + +__all__ = ["NestedExtensionArray"] + + +class NestedExtensionArray(ArrowExtensionArray): + """Pandas extension array for nested dataframes + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array to be wrapped, must be a struct array with all fields being list + arrays of the same lengths. + + validate : bool, default True + Whether to validate the input array.
+ + Raises + ------ + ValueError + If the input array is not a struct array or if any of the fields is not + a list array or if the list arrays have different lengths. + """ + + _dtype: NestedDtype + + def __init__(self, values: pa.Array | pa.ChunkedArray, *, validate: bool = True) -> None: + super().__init__(values=values) + + # Fix the dtype to be NestedDtype + self._dtype = NestedDtype.from_pandas_arrow_dtype(self._dtype) + + if validate: + self._validate(self._pa_array) + + @staticmethod + def _convert_df_to_pa_scalar(df: pd.DataFrame, *, type: pa.DataType | None) -> pa.Scalar: + d = {column: series.values for column, series in df.to_dict("series").items()} + return pa.scalar(d, type=type) + + @staticmethod + def _convert_df_value_to_pa(value: object, *, type: pa.DataType | None) -> object: + # Convert "scalar" pd.DataFrame to a dict + if isinstance(value, pd.DataFrame): + return NestedExtensionArray._convert_df_to_pa_scalar(value, type=type) + # Convert pd.DataFrame collection to a list of dicts + if hasattr(value, "__getitem__") and isinstance(value, Iterable): + if hasattr(value, "iloc"): + first = value.iloc[0] + else: + try: + first = value[0] # type: ignore[index] + except IndexError: + return value + if isinstance(first, pd.DataFrame): + return [NestedExtensionArray._convert_df_to_pa_scalar(v, type=type) for v in value] + return value + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: # type: ignore[name-defined] # noqa: F821 + scalars = cls._convert_df_value_to_pa(scalars, type=None) + # The previous line may return an iterator, but parent's _from_sequence needs Sequence + if not isinstance(scalars, Sequence) and isinstance(scalars, Collection): + scalars = list(scalars) + return super()._from_sequence(scalars, dtype=dtype, copy=copy) + + @staticmethod + def _validate(array: pa.ChunkedArray) -> None: + for chunk in array.iterchunks(): + if not pa.types.is_struct(chunk.type): + raise ValueError(f"Expected a 
StructArray, got {chunk.type}") + struct_array = cast(pa.StructArray, chunk) + + first_list_array: pa.ListArray | None = None + for field in struct_array.type: + inner_array = struct_array.field(field.name) + if not is_pa_type_a_list(inner_array.type): + raise ValueError(f"Expected a ListArray, got {inner_array.type}") + list_array = cast(pa.ListArray, inner_array) + + if first_list_array is None: + first_list_array = list_array + continue + # compare offsets from the first list array with the current one + if not first_list_array.offsets.equals(list_array.offsets): + raise ValueError("Offsets of all ListArrays must be the same") + + def _replace_pa_array(self, pa_array: pa.ChunkedArray, *, validate: bool) -> None: + if validate: + self._validate(pa_array) + self._pa_array = pa_array + self._dtype = NestedDtype(pa_array.chunk(0).type) + + def __getitem__(self, item): + value = super().__getitem__(item) + # Convert "scalar" value to pd.DataFrame + if not isinstance(value, dict): + return value + return pd.DataFrame(value, copy=True) + + def __iter__(self) -> Iterator[Any]: + for value in super().__iter__(): + # Convert "scalar" value to pd.DataFrame + if not isinstance(value, dict): + yield value + else: + yield pd.DataFrame(value, copy=True) + + def to_numpy(self, dtype: None = None, copy: bool = False, na_value: Any = no_default) -> np.ndarray: + """Convert the extension array to a numpy array. + + Parameters + ---------- + dtype : None + This parameter is left for compatibility with the base class + method, but it is not used. dtype of the returned array is + always object. + copy : bool, default False + Whether to copy the data. It is not guaranteed that the data + will not be copied if copy is False. + na_value : Any, default no_default + TODO: support NA values + + Returns + ------- + np.ndarray + The numpy array of pd.DataFrame objects. Each element is a single + time-series.
+ """ + array = super().to_numpy(dtype=dtype, copy=copy, na_value=na_value) + + # Hack with np.empty is the only way to force numpy to create 1-d array of objects + result = np.empty(shape=array.shape, dtype=object) + # We do copy=False here because user's 'copy' is already handled by ArrowExtensionArray.to_numpy + result[:] = [pd.DataFrame(value, copy=False) for value in array] + return result + + def __setitem__(self, key, value) -> None: + value = self._convert_df_value_to_pa(value, type=self._dtype.pyarrow_dtype) + super().__setitem__(key, value) + + @property + def list_offsets(self) -> pa.ChunkedArray: + """The list offsets of the field arrays. + + It is a chunk array of list offsets of the first field array. + (Since all fields are validated to have the same offsets.) + + Returns + ------- + pa.ChunkedArray + The list offsets of the field arrays. + """ + return pa.chunked_array([chunk.field(0).offsets for chunk in self._pa_array.iterchunks()]) + + @property + def field_names(self) -> list[str]: + """Names of the nested columns""" + return [field.name for field in self._pa_array.chunk(0).type] + + @property + def flat_length(self) -> int: + """Length of the flat arrays""" + return sum(chunk.field(0).value_lengths().sum().as_py() for chunk in self._pa_array.iterchunks()) + + def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-defined] # noqa: F821 + """Get a view of the series with only the specified fields + + Parameters + ---------- + fields : str or list of str + The name of the field or a list of names of the fields to include. + + Returns + ------- + NestedExtensionArray + The view of the series with only the specified fields. 
+ """ + if isinstance(fields, str): + fields = [fields] + if len(set(fields)) != len(fields): + raise ValueError("Duplicate field names are not allowed") + if not set(fields).issubset(self.field_names): + raise ValueError(f"Some fields are not found, given: {fields}, available: {self.field_names}") + + chunks = [] + for chunk in self._pa_array.iterchunks(): + chunk = cast(pa.StructArray, chunk) + struct_dict = {} + for field in fields: + struct_dict[field] = chunk.field(field) + struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys()) + chunks.append(struct_array) + pa_array = pa.chunked_array(chunks) + + return self.__class__(pa_array, validate=False) + + def set_flat_field(self, field: str, value: ArrayLike) -> None: + """Set the field from flat-array of values + + Parameters + ---------- + field : str + The name of the field. + value : ArrayLike + The 'flat' array of values to be set. + """ + # TODO: optimize for the case when the input is a pa.ChunkedArray + + if np.ndim(value) == 0: + value = np.repeat(value, self.flat_length) + + pa_array = pa.array(value) + + if len(pa_array) != self.flat_length: + raise ValueError("The input must be a scalar or have the same length as the flat arrays") + + offsets = self.list_offsets.combine_chunks() + list_array = pa.ListArray.from_arrays(values=pa_array, offsets=offsets) + + return self.set_list_field(field, list_array) + + def set_list_field(self, field: str, value: ArrayLike) -> None: + """Set the field from list-array + + Parameters + ---------- + field : str + The name of the field. + value : ArrayLike + The list-array of values to be set. 
+ """ + # TODO: optimize for the case when the input is a pa.ChunkedArray + + pa_array = pa.array(value) + + if not is_pa_type_a_list(pa_array.type): + raise ValueError(f"Expected a list array, got {pa_array.type}") + + if len(pa_array) != len(self): + raise ValueError("The length of the list-array must be equal to the length of the series") + + chunks = [] + for sl, chunk in enumerate_chunks(self._pa_array): + chunk = cast(pa.StructArray, chunk) + + # Build a new struct array. We collect all existing fields and add the new one. + struct_dict = {} + for pa_field in chunk.type: + struct_dict[pa_field.name] = chunk.field(pa_field.name) + struct_dict[field] = pa.array(pa_array[sl]) + + struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys()) + chunks.append(struct_array) + pa_array = pa.chunked_array(chunks) + + self._replace_pa_array(pa_array, validate=True) + + def pop_field(self, field: str): + """Delete a field from the struct array + + Parameters + ---------- + field : str + The name of the field to be deleted. 
+ """ + if field not in self.field_names: + raise ValueError(f"Field '{field}' not found") + + if len(self.field_names) == 1: + raise ValueError("Cannot delete the last field") + + chunks = [] + for chunk in self._pa_array.iterchunks(): + chunk = cast(pa.StructArray, chunk) + struct_dict = {} + for pa_field in chunk.type: + if pa_field.name != field: + struct_dict[pa_field.name] = chunk.field(pa_field.name) + struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys()) + chunks.append(struct_array) + pa_array = pa.chunked_array(chunks) + + self._replace_pa_array(pa_array, validate=False) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py new file mode 100644 index 0000000..cca03c0 --- /dev/null +++ b/src/nested_pandas/series/packer.py @@ -0,0 +1,280 @@ +"""Module for converting between "flat" and "list" and "nested" representations + +TODO: mask support +TODO: multi-index support +""" + +# "|" for python 3.9 +from __future__ import annotations + +from collections.abc import Sequence + +import numpy as np +import pandas as pd +import pyarrow as pa + +from nested_pandas.series.dtype import NestedDtype +from nested_pandas.series.ext_array import NestedExtensionArray + +__all__ = ["pack_flat", "pack_lists", "pack_dfs"] + + +N_ROWS_INFER_DTYPE = 1000 + + +def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame: + """Pack a "flat" dataframe into a "nested" dataframe. + + For the input dataframe with repeated indexes, make a pandas.DataFrame, + where each original column is replaced by a column of lists, and, + optionally, a "structure" column is added, containing a structure of + lists with the original columns. + + Parameters + ---------- + df : pd.DataFrame + Input dataframe, with repeated indexes. + + name : str, optional + Name of the structure column. The default is None, which means no + structure column is added. + + Returns + ------- + pd.DataFrame + Output dataframe. 
+ """ + # TODO: we can optimize name=None case a bit + struct_series = pack_flat(df, name=name) + packed_df = struct_series.struct.explode() + if name is not None: + packed_df[name] = struct_series + return packed_df + + +def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: + """Make a structure of lists representation of a "flat" dataframe. + + For the input dataframe with repeated indexes, make a pandas.Series, + where each original column is replaced by a structure of lists. + The dtype of the column is `nested_pandas.NestedDtype` with + the corresponding pyarrow type. The index of the output series is + the unique index of the input dataframe. The Series has `.nest` accessor, + see `nested_pandas.series.accessor.NestSeriesAccessor` for details. + + Parameters + ---------- + df : pd.DataFrame + Input dataframe, with repeated indexes. + name : str, optional + Name of the pd.Series. + + Returns + ------- + pd.Series + Output series, with unique indexes. + + See Also + -------- + nested_pandas.series.accessor.NestSeriesAccessor : .nest accessor for the output series. + nested_pandas.series.dtype.NestedDtype : The dtype of the output series. + nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays. + """ + + # TODO: think about the case when the data is pre-sorted and we don't need a data copy. + flat = df.sort_index(kind="stable") + return pack_sorted_df_into_struct(flat, name=name) + + +def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series: + """Pack a sequence of "flat" dataframes into a "nested" series. + + Parameters + ---------- + dfs : Sequence[pd.DataFrame] + Input sequence of dataframes. + index : pd.Index, optional + Index of the output series. + name : str, optional + Name of the output series. + + Returns + ------- + pd.Series + Output series.
+ """ + if isinstance(dfs, pd.Series) and index is None: + index = dfs.index + + first_df = dfs.iloc[0] if hasattr(dfs, "iloc") else dfs[0] + + field_types = { + column: pa.array(first_df[column].iloc[:N_ROWS_INFER_DTYPE]).type for column in first_df.columns + } + dtype = NestedDtype.from_fields(field_types) + dummy_value: dict[str, list] = {column: [] for column in first_df.columns} + series = pd.Series([dummy_value] * len(dfs), dtype=dtype, index=index, name=name) + series[:] = dfs + return series + + +def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.Series: + """Make a structure of lists representation of a "flat" dataframe. + + Input dataframe must be sorted and all the columns must have pyarrow dtypes. + + Parameters + ---------- + df : pd.DataFrame + Input dataframe, with repeated indexes. It must be sorted and + all the columns must have pyarrow dtypes. + + name : str, optional + Name of the pd.Series. + + Returns + ------- + pd.Series + Output series, with unique indexes. + """ + packed_df = view_sorted_df_as_list_arrays(df) + # No need to validate the dataframe, the length of the nested arrays is forced to be the same by + # the view_sorted_df_as_list_arrays function. + return pack_lists(packed_df, name=name, validate=False) + + +def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series: + """Make a series of arrow structures from a dataframe with nested arrays. + + For the input dataframe with repeated indexes, make a pandas.Series, + where each original column is replaced by a structure of lists. + The dtype of the column is `nested_pandas.NestedDtype` with the corresponding + pyarrow type. The index of the output series is the unique index of the + input dataframe. The Series has `.nest` accessor, see + `nested_pandas.series.accessor.NestSeriesAccessor` for details. + + For every row, all the nested array (aka pyarrow list) lengths must be + the same. 
+ + Parameters + ---------- + df : pd.DataFrame + Input dataframe, with pyarrow list-arrays. + name : str, optional + Name of the pd.Series. + validate : bool, default True + Whether to validate the input dataframe. + + Returns + ------- + pd.Series + Output series, with unique indexes. + + See Also + -------- + nested_pandas.series.accessor.NestSeriesAccessor : The accessor for the output series. + nested_pandas.series.dtype.NestedDtype : The dtype of the output series. + nested_pandas.series.packer.pack_flat : Pack a "flat" dataframe with repeated indexes. + """ + struct_array = pa.StructArray.from_arrays( + [df[column] for column in df.columns], + names=df.columns, + ) + ext_array = NestedExtensionArray(struct_array, validate=validate) + return pd.Series( + ext_array, + index=df.index, + copy=False, + name=name, + ) + + +def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame: + """Make a nested array representation of a "flat" dataframe. + + Parameters + ---------- + df : pd.DataFrame + Input dataframe, with repeated indexes. It must be sorted by its index. + + Returns + ------- + pd.DataFrame + Output dataframe, with unique indexes. It is a view over the input + dataframe, so it would mutate the input dataframe if modified. + """ + offset_array = calculate_sorted_index_offsets(df.index) + unique_index = df.index.values[offset_array[:-1]] + + series_ = { + column: view_sorted_series_as_list_array(df[column], offset_array, unique_index) + for column in df.columns + } + + df = pd.DataFrame(series_) + + return df + + +def view_sorted_series_as_list_array( + series: pd.Series, offset: np.ndarray | None = None, unique_index: np.ndarray | None = None +) -> pd.Series: + """Make a nested array representation of a "flat" series. + + Parameters + ---------- + series : pd.Series + Input series, with repeated indexes. It must be sorted by its index. + + offset: np.ndarray or None, optional + Pre-calculated offsets of the input series index.
+ unique_index: np.ndarray or None, optional + Pre-calculated unique index of the input series. If given it must be + equal to `series.index.unique()` and `series.index.values[offset[:-1]]`. + + Returns + ------- + pd.Series + Output series, with unique indexes. It is a view over the input series, + so it would mutate the input series if modified. + """ + if offset is None: + offset = calculate_sorted_index_offsets(series.index) + if unique_index is None: + unique_index = series.index.values[offset[:-1]] + + list_array = pa.ListArray.from_arrays( + offset, + pa.array(series), + ) + return pd.Series( + list_array, + dtype=pd.ArrowDtype(list_array.type), + index=unique_index, + copy=False, + ) + + +def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray: + """Calculate the offsets of the pre-sorted index values. + + Parameters + ---------- + index : pd.Index + Input index, must be sorted. + + Returns + ------- + np.ndarray + Output array of offsets, one element more than the number of unique + index values. + """ + # TODO: implement multi-index support + index_diff = np.diff(index.values, prepend=index.values[0] - 1, append=index.values[-1] + 1) + + if np.any(index_diff < 0): + raise ValueError("Table index must be strictly sorted.") + + offset = np.nonzero(index_diff)[0] + + return offset diff --git a/src/nested_pandas/series/utils.py b/src/nested_pandas/series/utils.py new file mode 100644 index 0000000..34707c8 --- /dev/null +++ b/src/nested_pandas/series/utils.py @@ -0,0 +1,41 @@ +from collections.abc import Generator + +import pyarrow as pa + + +def is_pa_type_a_list(pa_type: type[pa.Array]) -> bool: + """Check if the given pyarrow type is a list type. + + I.e. one of the following types: ListArray, LargeListArray, + FixedSizeListArray. + + Returns + ------- + bool + True if the given type is a list type, False otherwise.
def enumerate_chunks(array: pa.ChunkedArray) -> Generator[tuple[slice, pa.Array], None, None]:
    """Walk over the chunks of a chunked array together with their positions.

    Parameters
    ----------
    array : pa.ChunkedArray
        Input chunked array.

    Yields
    ------
    slice
        `slice(index_start, index_stop)` locating the current chunk within
        the whole array.
    pa.Array
        The current chunk.
    """
    # Running position of the next chunk's first element.
    offset = 0
    for chunk in array.iterchunks():
        chunk_length = len(chunk)
        yield slice(offset, offset + chunk_length), chunk
        offset += chunk_length
def test_to_lists_with_fields():
    """Check that `.nest.to_lists(fields=[...])` returns only the requested list columns."""
    a_lists = [np.array([1.0, 2.0, 3.0]), -np.array([1.0, 2.0, 1.0])]
    b_lists = [np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]
    storage = pa.StructArray.from_arrays(
        arrays=[pa.array(a_lists), pa.array(b_lists)],
        names=["a", "b"],
    )
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[0, 1])

    selected = nested.nest.to_lists(fields=["a"])

    expected_a = pd.Series(
        data=a_lists,
        dtype=pd.ArrowDtype(pa.list_(pa.float64())),
        index=[0, 1],
    )
    expected = pd.DataFrame(data={"a": expected_a})
    assert_frame_equal(selected, expected)
def test_fields():
    """Check that `.nest.fields` lists the struct field names in order."""
    columns = {
        "a": pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
        "b": pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
    }
    storage = pa.StructArray.from_arrays(arrays=list(columns.values()), names=list(columns))
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[0, 1])

    field_names = nested.nest.fields

    assert_array_equal(field_names, ["a", "b"])
def test_set_list_field():
    """Check that `.nest.set_list_field()` adds a field given as one list per row.

    The new field "c" is then read back as a flat series through `.nest["c"]`.
    """
    a_lists = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_lists = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[0, 1])

    per_row_values = [["a", "b", "c"], ["d", "e", "f"]]
    nested.nest.set_list_field("c", per_row_values)

    expected_flat = pd.Series(
        data=["a", "b", "c", "d", "e", "f"],
        index=[0, 0, 0, 1, 1, 1],
        name="c",
        dtype=pd.ArrowDtype(pa.string()),
    )
    actual_flat = nested.nest["c"]
    assert_series_equal(actual_flat, expected_flat)
def test_get_list_series():
    """Check that `.nest.get_list_series()` returns one field as a list-array series."""
    a_lists = [np.array([1, 2, 3]), np.array([4, 5, 6])]
    b_lists = [np.array([6, 4, 2]), np.array([1, 2, 3])]
    storage = pa.StructArray.from_arrays(
        arrays=[pa.array(a_lists), pa.array(b_lists)],
        names=["a", "b"],
    )
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[5, 7])

    actual = nested.nest.get_list_series("a")

    expected = pd.Series(
        data=a_lists,
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
        index=[5, 7],
        name="a",
    )
    assert_series_equal(actual, expected)
def test___setitem___with_flat():
    """Check that `.nest["field"] = flat_array` replaces a field from a flat array.

    The right-hand side has one value per flat element (six in total) and
    overwrites the existing field "a".
    """
    a_lists = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_lists = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[0, 1])

    flat_replacement = np.array(["a", "b", "c", "d", "e", "f"])
    nested.nest["a"] = flat_replacement

    expected = pd.Series(
        data=["a", "b", "c", "d", "e", "f"],
        index=[0, 0, 0, 1, 1, 1],
        name="a",
        dtype=pd.ArrowDtype(pa.string()),
    )
    assert_series_equal(nested.nest["a"], expected)
def test___setited___raises_for_ambiguous_lengths_1():
    """Check that `.nest["field"] = ...` raises for an unusable right-hand-side length.

    Every row holds a single element, so the series has two rows and a flat
    length of two; assigning a three-item list must raise ValueError.
    """
    a_lists = pa.array([np.array([1.0]), np.array([2.0])])
    b_lists = pa.array([-np.array([6.0]), -np.array([5.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[0, 1])

    three_items = ["a", "b", "c"]
    with pytest.raises(ValueError):
        nested.nest["c"] = three_items
def test___delitem__():
    """Check that `del series.nest["field"]` removes the field from the series."""
    columns = {
        "a": pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
        "b": pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
    }
    storage = pa.StructArray.from_arrays(arrays=list(columns.values()), names=list(columns))
    nested = pd.Series(storage, dtype=NestedDtype(storage.type), index=[0, 1])

    del nested.nest["a"]

    remaining_fields = nested.nest.fields
    assert_array_equal(remaining_fields, ["b"])
def test_from_fields():
    """Check that NestedDtype.from_fields() wraps each element type into a list field."""
    expected_struct = pa.struct(
        [
            pa.field("a", pa.list_(pa.int64())),
            pa.field("b", pa.list_(pa.float64())),
        ]
    )

    element_types = {"a": pa.int64(), "b": pa.float64()}
    nested_dtype = NestedDtype.from_fields(element_types)

    assert nested_dtype.pyarrow_dtype == expected_struct
def test_ext_array_dtype():
    """Check that NestedExtensionArray.dtype is the NestedDtype of its storage type."""
    a_lists = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_lists = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])

    nested_array = NestedExtensionArray(storage)
    expected_dtype = NestedDtype(storage.type)

    assert nested_array.dtype == expected_dtype
def test_series_built_with_dtype():
    """Check that a series built with dtype=NestedDtype is backed by NestedExtensionArray."""
    a_lists = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_lists = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])

    nested = pd.Series(storage, dtype=NestedDtype(storage.type))

    backing_array = nested.array
    assert isinstance(backing_array, NestedExtensionArray)
def test__convert_df_to_pa_from_series():
    """Check that a pd.Series of DataFrames is converted to pyarrow struct scalars.

    `_convert_df_value_to_pa` is expected to produce one struct scalar per
    DataFrame, each field holding the corresponding column as a list.
    """
    rows = [
        {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]},
        {"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]},
    ]
    series_of_dfs = pd.Series([pd.DataFrame(row) for row in rows])

    converted = list(NestedExtensionArray._convert_df_value_to_pa(series_of_dfs, type=None))

    struct_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))])
    expected = [pa.scalar(row, type=struct_type) for row in rows]
    assert converted == expected
def test___setitem___single_df_different_size():
    """Tests nested_ext_array[i] = pd.DataFrame(...) where the df is shorter than the row it replaces."""
    original_storage = pa.StructArray.from_arrays(
        arrays=[
            pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])]),
            pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
        ],
        names=["a", "b"],
    )
    nested_array = NestedExtensionArray(original_storage)

    # The first row has three elements; replace it with a two-element frame.
    replacement = pd.DataFrame({"a": [5, 6], "b": [100.0, 200.0]})
    nested_array[0] = replacement

    expected_storage = pa.StructArray.from_arrays(
        arrays=[
            pa.array([np.array([5, 6]), np.array([1, 2, 1])]),
            pa.array([np.array([100.0, 200.0]), -np.array([3.0, 4.0, 5.0])]),
        ],
        names=["a", "b"],
    )
    expected = NestedExtensionArray(expected_storage)

    assert nested_array.equals(expected)
def test___setitem___list_of_dfs():
    """Tests nested_ext_array[:] = [pd.DataFrame(...), pd.DataFrame(...), ...]"""
    original_storage = pa.StructArray.from_arrays(
        arrays=[
            pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])]),
            pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
        ],
        names=["a", "b"],
    )
    nested_array = NestedExtensionArray(original_storage)

    # One replacement frame per row of the array.
    replacements = [
        pd.DataFrame({"a": [5, 6], "b": [100.0, 200.0]}),
        pd.DataFrame({"a": [7, 8], "b": [300.0, 400.0]}),
    ]
    nested_array[:] = replacements

    expected_storage = pa.StructArray.from_arrays(
        arrays=[
            pa.array([np.array([5, 6]), np.array([7, 8])]),
            pa.array([np.array([100.0, 200.0]), np.array([300.0, 400.0])]),
        ],
        names=["a", "b"],
    )
    expected = NestedExtensionArray(expected_storage)

    assert nested_array.equals(expected)
def test_list_offsets():
    """Check `list_offsets` for an array of two rows with three elements each."""
    a_lists = pa.array([np.array([1, 2, 3]), np.array([1, 2, 1])], type=pa.list_(pa.uint8()))
    b_lists = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])
    nested_array = NestedExtensionArray(storage)

    expected_offsets = pa.chunked_array([pa.array([0, 3, 6])])

    assert_array_equal(nested_array.list_offsets, expected_offsets)
def test_field_names():
    """Check that `field_names` of the extension array matches the struct field names."""
    columns = {
        "a": pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]),
        "b": pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]),
    }
    storage = pa.StructArray.from_arrays(arrays=list(columns.values()), names=list(columns))

    nested_array = NestedExtensionArray(storage)

    names = nested_array.field_names
    assert names == ["a", "b"]
def test_view_fields_raises_for_non_unique_fields():
    """Tests that we raise an error when trying to view multiple fields with the same name."""
    a_lists = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0, 4.0])])
    b_lists = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])])
    storage = pa.StructArray.from_arrays(arrays=[a_lists, b_lists], names=["a", "b"])
    nested_array = NestedExtensionArray(storage)

    duplicated_names = ["a", "a"]
    with pytest.raises(ValueError):
        nested_array.view_fields(duplicated_names)
a new field with a scalar value.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + ext_array.set_flat_field("c", "YES") + + desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array([["YES"] * 3, ["YES"] * 4]), + ], + names=["a", "b", "c"], + ) + desired = NestedExtensionArray(desired_struct_array) + + assert_series_equal(pd.Series(ext_array), pd.Series(desired)) + + +def test_set_flat_field_replace_field_array(): + """Tests replacing a float field with a new "flat" boolean array.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0, 4.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + ext_array.set_flat_field("b", [True, False, True, False, True, False, True]) + + desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0, 4.0])]), + pa.array([np.array([True, False, True]), np.array([False, True, False, True])]), + ], + names=["a", "b"], + ) + desired = NestedExtensionArray(desired_struct_array) + + assert_series_equal(pd.Series(ext_array), pd.Series(desired)) + + +def test_set_list_field_new_field(): + """Tests setting a new field with a new "list" array.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + 
ext_array.set_list_field("c", [["x", "y", "z"], ["x1", "x2", "x3", "x4"]]) + + desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array([np.array(["x", "y", "z"]), np.array(["x1", "x2", "x3", "x4"])]), + ], + names=["a", "b", "c"], + ) + desired = NestedExtensionArray(desired_struct_array) + + assert_series_equal(pd.Series(ext_array), pd.Series(desired)) + + +def test_set_list_field_replace_field(): + """Tests replacing a float field with a new "list" array of strings.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + ext_array.set_list_field("b", [["x", "y", "z"], ["x1", "x2", "x3", "x4"]]) + + desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([np.array(["x", "y", "z"]), np.array(["x1", "x2", "x3", "x4"])]), + ], + names=["a", "b"], + ) + desired = NestedExtensionArray(desired_struct_array) + + assert_series_equal(pd.Series(ext_array), pd.Series(desired)) + + +def test_pop_field(): + """Tests that pop_field() removes a field from the extension array in place.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array([np.array(["x", "y", "z"]), np.array(["x1", "x2", "x3", "x4"])]), + ], + names=["a", "b", "c"], + ) + ext_array = NestedExtensionArray(struct_array) + + ext_array.pop_field("c") + + desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + 
pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + desired = NestedExtensionArray(desired_struct_array) + + assert_series_equal(pd.Series(ext_array), pd.Series(desired)) + + +def test_delete_last_field_raises(): + """Tests that pop_field() raises ValueError when trying to delete the last field left.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array([np.array(["x", "y", "z"]), np.array(["x1", "x2", "x3", "x4"])]), + ], + names=["a", "b", "c"], + ) + ext_array = NestedExtensionArray(struct_array) + + ext_array.pop_field("a") + assert ext_array.field_names == ["b", "c"] + + ext_array.pop_field("c") + assert ext_array.field_names == ["b"] + + with pytest.raises(ValueError): + ext_array.pop_field("b") diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py new file mode 100644 index 0000000..5859a9d --- /dev/null +++ b/tests/nested_pandas/series/test_packer.py @@ -0,0 +1,236 @@ +import numpy as np +import pandas as pd +import pyarrow as pa +from nested_pandas import NestedDtype +from nested_pandas.series import packer +from numpy.testing import assert_array_equal +from pandas.testing import assert_frame_equal, assert_series_equal + + +def test_pack_flat_into_df(): + """Test that pack_flat_into_df() packs rows sharing an index value into list and nested columns.""" + df = pd.DataFrame( + data={ + "a": [7, 8, 9, 1, 2, 3, 4, 5, 6], + "b": [0, 1, 0, 0, 1, 0, 1, 0, 1], + }, + index=[4, 4, 4, 1, 1, 2, 2, 3, 3], + ) + actual = packer.pack_flat_into_df(df, name="struct") + + desired = pd.DataFrame( + data={ + "a": pd.Series( + data=[ + np.array([1, 2]), + np.array([3, 4]), + np.array([5, 6]), + np.array([7, 8, 9]), + ], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + index=[1, 2, 3, 4], + ), + "b": pd.Series( + data=[ + np.array([0, 1]), + np.array([0, 1]), + np.array([0, 1]), + np.array([0, 1, 
0]), + ], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + index=[1, 2, 3, 4], + ), + "struct": pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + (np.array([3, 4]), np.array([0, 1])), + (np.array([5, 6]), np.array([0, 1])), + (np.array([7, 8, 9]), np.array([0, 1, 0])), + ], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + index=[1, 2, 3, 4], + ), + }, + ) + + assert_frame_equal(actual, desired) + + +def test_pack_flat(): + """Test that pack_flat() packs an unsorted flat dataframe into a nested series with sorted index.""" + df = pd.DataFrame( + data={ + "a": [7, 8, 9, 1, 2, 3, 4, 5, 6], + "b": [0, 1, 0, 0, 1, 0, 1, 0, 1], + }, + index=[4, 4, 4, 1, 1, 2, 2, 3, 3], + ) + actual = packer.pack_flat(df) + + desired = pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + (np.array([3, 4]), np.array([0, 1])), + (np.array([5, 6]), np.array([0, 1])), + (np.array([7, 8, 9]), np.array([0, 1, 0])), + ], + index=[1, 2, 3, 4], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + ) + + assert_series_equal(actual, desired) + + +def test_pack_sorted_df_into_struct(): + """Test that pack_sorted_df_into_struct() packs a sorted flat dataframe into a nested series.""" + df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [0, 1, 0, 1, 0, 1, 0, 1, 0], + }, + index=[1, 1, 2, 2, 3, 3, 4, 4, 4], + ) + actual = packer.pack_sorted_df_into_struct(df) + + desired = pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + (np.array([3, 4]), np.array([0, 1])), + (np.array([5, 6]), np.array([0, 1])), + (np.array([7, 8, 9]), np.array([0, 1, 0])), + ], + index=[1, 2, 3, 4], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + ) + + assert_series_equal(actual, desired) + + +def test_pack_lists(): + """Test that pack_lists() keeps each list column retrievable as a struct field.""" + packed_df = pd.DataFrame( + data={ + "a": [ + np.array([1, 2]), + np.array([3, 4]), + np.array([5, 6]), + np.array([7, 8, 9]), + ], + "b": [ + np.array([0, 1]), + np.array([0, 1]), + np.array([0, 1]), + np.array([0, 1, 0]), + ], + }, + index=[1, 2, 3, 4], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ) + series = 
packer.pack_lists(packed_df) + + for field_name in packed_df.columns: + assert_series_equal(series.struct.field(field_name), packed_df[field_name]) + + +def test_pack_dfs(): + """Test that pack_dfs() packs a list of dataframes into a nested series with the given index.""" + dfs = [ + pd.DataFrame( + data={ + "a": [1, 2], + "b": [0, 1], + }, + index=[100, 100], + ), + pd.DataFrame( + data={ + "a": [3, 4], + "b": [0, 1], + }, + index=[101, 101], + ), + pd.DataFrame( + data={ + "a": [5, 6], + "b": [0, 1], + }, + index=[102, 102], + ), + pd.DataFrame( + data={ + "a": [7, 8, 9], + "b": [0, 1, 0], + }, + index=[103, 103, 103], + ), + ] + series = packer.pack_dfs(dfs, index=[100, 101, 102, 103]) + + desired = pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + (np.array([3, 4]), np.array([0, 1])), + (np.array([5, 6]), np.array([0, 1])), + (np.array([7, 8, 9]), np.array([0, 1, 0])), + ], + index=[100, 101, 102, 103], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + ) + assert_series_equal(series, desired) + + +def test_view_sorted_df_as_list_arrays(): + """Test view_sorted_df_as_list_arrays() on a dataframe with a sorted, repeating index.""" + flat_df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [0, 1, 0, 1, 0, 1, 0, 1, 0], + }, + index=[1, 1, 2, 2, 3, 3, 4, 4, 4], + ) + nested_df = packer.view_sorted_df_as_list_arrays(flat_df) + + assert_array_equal(nested_df.index, [1, 2, 3, 4]) + + desired_nested = pd.DataFrame( + data={ + "a": [ + np.array([1, 2]), + np.array([3, 4]), + np.array([5, 6]), + np.array([7, 8, 9]), + ], + "b": [ + np.array([0, 1]), + np.array([0, 1]), + np.array([0, 1]), + np.array([0, 1, 0]), + ], + }, + index=[1, 2, 3, 4], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ) + assert_frame_equal(nested_df, desired_nested) + + +def test_view_sorted_series_as_list_array(): + """Test view_sorted_series_as_list_array() on a series with a sorted, repeating index.""" + series = pd.Series( + data=[1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[1, 1, 2, 2, 3, 3, 4, 4, 4], + ) + nested = packer.view_sorted_series_as_list_array(series) + + assert_array_equal(nested.index, [1, 2, 3, 4]) 
+ + desired_nested = pd.Series( + data=[ + np.array([1, 2]), + np.array([3, 4]), + np.array([5, 6]), + np.array([7, 8, 9]), + ], + index=[1, 2, 3, 4], + dtype=pd.ArrowDtype(pa.list_(pa.int64())), + ) + assert_series_equal(nested, desired_nested)