Skip to content

Commit

Permalink
Add low-level create_dataframe_from_blocks helper function (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Apr 15, 2024
1 parent e7a96a4 commit ae246a6
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 2 deletions.
62 changes: 62 additions & 0 deletions pandas/api/internals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import numpy as np

from pandas._typing import ArrayLike

from pandas import (
DataFrame,
Index,
)
from pandas.core.internals.api import _make_block
from pandas.core.internals.managers import BlockManager as _BlockManager


def create_dataframe_from_blocks(
    blocks: list[tuple[ArrayLike, np.ndarray]], index: Index, columns: Index
) -> DataFrame:
    """
    Build a DataFrame directly from arrays that already follow the block layout.

    .. warning::

        This is an advanced, low-level entry point. Only use it when the
        assumptions listed below are guaranteed to hold. Data that violate
        them are not rejected here, and subsequent operations on the
        resulting DataFrame might lead to strange errors.

    For almost all use cases, the standard ``pd.DataFrame(..)`` constructor
    is the right tool. If you are planning to use this function, let us
    know by opening an issue at https://github.com/pandas-dev/pandas/issues.

    Assumptions:

    - Every block array is either a 2D numpy array or a pandas
      ExtensionArray.
    - A numpy array is assumed to already have the shape expected for
      Blocks: 2D with layout (cols, rows), i.e. transposed compared to the
      DataFrame columns.
    - All arrays are taken as is (no type inference) and are expected to
      have the correct size.
    - Each placement array has the correct length (equal to the number of
      columns its block array represents), and all placement arrays
      together form a complete set of 0 to n_columns - 1.

    Parameters
    ----------
    blocks : list of tuples of (block_array, block_placement)
        A list of tuples consisting of (block_array, block_placement),
        where:

        - block_array is a 2D numpy array or a 1D ExtensionArray, following
          the requirements listed above.
        - block_placement is a 1D integer numpy array

    index : Index
        The Index object for the `index` of the resulting DataFrame.
    columns : Index
        The Index object for the `columns` of the resulting DataFrame.

    Returns
    -------
    DataFrame
    """
    # Each entry is forwarded positionally, so a (values, placement) tuple
    # maps onto the two parameters of _make_block.
    block_objs = []
    for block_spec in blocks:
        block_objs.append(_make_block(*block_spec))

    # The manager's axes are ordered (columns, index).
    manager = _BlockManager(block_objs, [columns, index])
    return DataFrame._from_mgr(manager, manager.axes)
40 changes: 38 additions & 2 deletions pandas/core/internals/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
)

from pandas.core.arrays import DatetimeArray
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)
from pandas.core.construction import extract_array
from pandas.core.internals.blocks import (
check_ndim,
Expand All @@ -32,11 +36,43 @@
)

if TYPE_CHECKING:
from pandas._typing import Dtype
from pandas._typing import (
ArrayLike,
Dtype,
)

from pandas.core.internals.blocks import Block


def _make_block(values: ArrayLike, placement: np.ndarray) -> Block:
    """
    Wrap ``values`` in the matching Block, analogous to blocks.new_block(_2d).

    Compared to constructing the Block class directly, this ensures:

    1) the correct dimension for EAs that support 2D (`ensure_block_shape`),
    2) the correct EA class for datetime64/timedelta64
       (`maybe_coerce_values`).

    ``values`` is assumed to be either a numpy array or an ExtensionArray:

    - A numpy array is assumed to already be in the expected shape for
      Blocks (2D, (cols, rows)).
    - An ExtensionArray may be passed as 1D, also for EAs that are
      internally stored as 2D.

    Beyond that, no preprocessing or validation is done, except for those
    dtypes that are internally stored as EAs but have an exact numpy
    equivalent (and at the moment use that numpy dtype), i.e.
    datetime64/timedelta64.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        The data for the block.
    placement : np.ndarray
        Column positions of this block; wrapped in a ``BlockPlacement``.

    Returns
    -------
    Block
        Always constructed with ``ndim=2``.
    """
    # The Block class is chosen from the dtype of the values as passed in,
    # i.e. before any reshaping or coercion happens below.
    block_cls = get_block_type(values.dtype)
    locs = BlockPlacement(placement)

    is_2d_capable_ea = (
        isinstance(values.dtype, ExtensionDtype) and values.dtype._supports_2d
    )
    if is_2d_capable_ea or isinstance(values, (DatetimeArray, TimedeltaArray)):
        # A 1D input is brought into the 2D shape the Block expects.
        values = ensure_block_shape(values, ndim=2)

    return block_cls(maybe_coerce_values(values), ndim=2, placement=locs)


def make_block(
values, placement, klass=None, ndim=None, dtype: Dtype | None = None
) -> Block:
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ class TestApi(Base):
"indexers",
"interchange",
"typing",
"internals",
]
allowed_typing = [
"DataFrameGroupBy",
Expand Down
92 changes: 92 additions & 0 deletions pandas/tests/internals/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
in core.internals
"""

import datetime

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.api.internals import create_dataframe_from_blocks
from pandas.core import internals
from pandas.core.internals import api

Expand Down Expand Up @@ -71,3 +75,91 @@ def test_create_block_manager_from_blocks_deprecated():
)
with tm.assert_produces_warning(DeprecationWarning, match=msg):
internals.create_block_manager_from_blocks


def test_create_dataframe_from_blocks(float_frame):
    # Round-trip: hand the first block of the fixture's manager back to the
    # helper, together with copies of the original axes, and expect an
    # equal frame.
    first_block = float_frame._mgr.blocks[0]
    block_spec = (first_block.values, first_block.mgr_locs.as_array)

    result = create_dataframe_from_blocks(
        [block_spec],
        index=float_frame.index.copy(),
        columns=float_frame.columns.copy(),
    )
    tm.assert_frame_equal(result, float_frame)


def test_create_dataframe_from_blocks_types():
    # One column per dtype, covering numpy-backed as well as extension
    # dtypes; every block of the resulting frame is round-tripped.
    data = {
        "int": list(range(1, 4)),
        "uint": np.arange(3, 6).astype("uint8"),
        "float": [2.0, np.nan, 3.0],
        "bool": np.array([True, False, True]),
        "boolean": pd.array([True, False, None], dtype="boolean"),
        "string": list("abc"),
        "datetime": pd.date_range("20130101", periods=3),
        "datetimetz": pd.date_range("20130101", periods=3).tz_localize(
            "Europe/Brussels"
        ),
        "timedelta": pd.timedelta_range("1 day", periods=3),
        "period": pd.period_range("2012-01-01", periods=3, freq="D"),
        "categorical": pd.Categorical(["a", "b", "a"]),
        "interval": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]),
    }
    df = pd.DataFrame(data)

    block_specs = [(blk.values, blk.mgr_locs.as_array) for blk in df._mgr.blocks]
    result = create_dataframe_from_blocks(
        block_specs, index=df.index, columns=df.columns
    )
    tm.assert_frame_equal(result, df)


def test_create_dataframe_from_blocks_datetimelike():
    # Extension dtypes that have an exactly matching numpy dtype
    # (datetime64/timedelta64) can also be passed as a plain numpy array.
    index = pd.RangeIndex(3)
    columns = pd.Index(["a", "b", "c", "d"])
    one_day = datetime.timedelta(1)

    # Two (2, 3) blocks: the datetimes go to columns 0 and 2, the
    # timedeltas to columns 1 and 3.
    datetime_values = np.arange(
        datetime.datetime(2020, 1, 1),
        datetime.datetime(2020, 1, 7),
        step=one_day,
    ).reshape((2, 3))
    timedelta_values = np.arange(
        one_day, datetime.timedelta(7), step=one_day
    ).reshape((2, 3))

    result = create_dataframe_from_blocks(
        [
            (datetime_values, np.array([0, 2])),
            (timedelta_values, np.array([1, 3])),
        ],
        index=index,
        columns=columns,
    )

    expected = pd.DataFrame(
        {
            "a": pd.date_range("2020-01-01", periods=3, unit="us"),
            "b": pd.timedelta_range("1 days", periods=3, unit="us"),
            "c": pd.date_range("2020-01-04", periods=3, unit="us"),
            "d": pd.timedelta_range("4 days", periods=3, unit="us"),
        }
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "array",
    [
        pd.date_range("2020-01-01", periods=3),
        pd.date_range("2020-01-01", periods=3, tz="UTC"),
        pd.period_range("2012-01-01", periods=3, freq="D"),
        pd.timedelta_range("1 day", periods=3),
    ],
)
def test_create_dataframe_from_blocks_1dEA(array):
    # An ExtensionArray may be handed over as 1D, even when it is stored
    # as 2D under the hood.
    df = pd.DataFrame({"a": array})
    only_block = df._mgr.blocks[0]

    # Indexing the block values with [0] yields the 1D input under test.
    values_1d = only_block.values[0]
    result = create_dataframe_from_blocks(
        [(values_1d, only_block.mgr_locs.as_array)],
        index=df.index,
        columns=df.columns,
    )
    tm.assert_frame_equal(result, df)
1 change: 1 addition & 0 deletions scripts/validate_unwanted_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
# TODO(4.0): GH#55043 - remove upon removal of CoW option
"_get_option",
"_fill_limit_area_1d",
"_make_block",
}


Expand Down

0 comments on commit ae246a6

Please sign in to comment.