From 4bc49edfd07706f94c6818fa15276eab3b2a8f07 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:35:05 +0900 Subject: [PATCH 01/37] fixed comparison of string column to mixed object column (issue #60228) --- pandas/core/ops/array_ops.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..dc63a4fb04a3a 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -38,6 +38,7 @@ is_bool_dtype, is_list_like, is_numeric_v_string_like, + is_string_dtype, is_object_dtype, is_scalar, ) @@ -53,7 +54,7 @@ from pandas.core import roperator from pandas.core.computation import expressions -from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.construction import ensure_wrapped_if_datetimelike, array from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -321,6 +322,17 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) + if ( + (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or + (is_object_dtype(lvalues) and is_string_dtype(rvalues)) + ): + if lvalues.dtype.name == "string" and rvalues.dtype == object: + lvalues = lvalues.astype("string") + rvalues = array(rvalues, dtype="string") + elif rvalues.dtype.name == "string" and lvalues.dtype == object: + rvalues = rvalues.astype("string") + lvalues = array(lvalues, dtype="string") + if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 0def761f2da2f1a17ffe46048a6da2a50c7a08df Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:39:52 +0900 Subject: [PATCH 02/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/ops/array_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index dc63a4fb04a3a..f7dc17ae00ac7 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -333,6 +333,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: rvalues = rvalues.astype("string") lvalues = array(lvalues, dtype="string") + if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From c4da91933d8ca8b50d15eeca37e2757182136d11 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:41:24 +0900 Subject: [PATCH 03/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/ops/array_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index f7dc17ae00ac7..dc63a4fb04a3a 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -333,7 +333,6 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: rvalues = rvalues.astype("string") lvalues = array(lvalues, dtype="string") - if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 900f3b1070e0b0b533efcd5eb92d6e03f56ca6d9 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Fri, 22 Nov 2024 09:12:19 +0900 Subject: [PATCH 04/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/tests/series/methods/test_compare.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 2a57d5139b62c..74d2e35eaf502 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -138,3 +138,15 @@ def test_compare_datetime64_and_string(): tm.assert_series_equal(result_eq1, expected_eq) tm.assert_series_equal(result_eq2, expected_eq) tm.assert_series_equal(result_neq, expected_neq) + +def test_comparison_string_mixed_object(): + pd.options.future.infer_string = True + + ser_string = pd.Series(["a", "b"], dtype="string") + ser_mixed = pd.Series([1, "b"]) + + result = ser_string == ser_mixed + expected = pd.Series([False, True], dtype="boolean") + tm.assert_series_equal(result, expected) + + pd.options.future.infer_string = False \ No newline at end of file From 8db4edc1ac0e26aa3da38ca59092573dc1ea48ef Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Fri, 22 Nov 2024 09:46:41 +0900 Subject: [PATCH 05/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/ops/array_ops.py | 16 +++++++++------- pandas/tests/series/methods/test_compare.py | 4 +++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5f7aed8ed9786..a72525281df09 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -768,6 +768,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :func:`comparison_op` where comparing a ``string`` dtype array with an ``object`` dtype array containing mixed types would raise a ``TypeError`` when PyArrow-based strings are enabled. (:issue:`60228`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index dc63a4fb04a3a..255633786ede1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -38,9 +38,9 @@ is_bool_dtype, is_list_like, is_numeric_v_string_like, - is_string_dtype, is_object_dtype, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -54,7 +54,10 @@ from pandas.core import roperator from pandas.core.computation import expressions -from pandas.core.construction import ensure_wrapped_if_datetimelike, array +from pandas.core.construction import ( + array as pd_array, + ensure_wrapped_if_datetimelike, +) from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -322,16 +325,15 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if ( - (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or - (is_object_dtype(lvalues) and is_string_dtype(rvalues)) + if (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or ( + is_object_dtype(lvalues) and is_string_dtype(rvalues) ): if lvalues.dtype.name == "string" and rvalues.dtype == object: lvalues = lvalues.astype("string") - rvalues = array(rvalues, dtype="string") + rvalues = pd_array(rvalues, dtype="string") elif rvalues.dtype.name == "string" and lvalues.dtype == object: rvalues = rvalues.astype("string") - lvalues = array(lvalues, dtype="string") + lvalues = pd_array(lvalues, dtype="string") if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 74d2e35eaf502..93fef353457a7 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -139,7 +139,9 @@ def test_compare_datetime64_and_string(): tm.assert_series_equal(result_eq2, expected_eq) tm.assert_series_equal(result_neq, expected_neq) + def test_comparison_string_mixed_object(): + # Issue https://github.com/pandas-dev/pandas/issues/60228 pd.options.future.infer_string = True ser_string = pd.Series(["a", "b"], dtype="string") @@ -149,4 +151,4 @@ def test_comparison_string_mixed_object(): expected = pd.Series([False, True], dtype="boolean") tm.assert_series_equal(result, expected) - pd.options.future.infer_string = False \ No newline at end of file + pd.options.future.infer_string = False From d4ae654b18ec6a42b1bee9a7df8d786f02aca21b Mon Sep 17 00:00:00 2001 From: Kevin Amparado <109636487+KevsterAmp@users.noreply.github.com> Date: Sat, 23 Nov 2024 02:56:41 +0800 Subject: [PATCH 06/37] CI/BUG: Remove `trim()` function on `comment-commands.yml` (#60397) remove trim function on comment-commands.yml --- .github/workflows/comment-commands.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 45f3e911377c1..62956f5825782 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -11,7 +11,7 @@ permissions: jobs: issue_assign: runs-on: ubuntu-22.04 - if: (!github.event.issue.pull_request) && trim(github.event.comment.body) == 'take' + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' concurrency: group: ${{ github.actor }}-issue-assign steps: From eaa8b47ea5c0ce04f48557570574d42effd8fff2 Mon Sep 17 00:00:00 2001 From: Yuvraj Pradhan <151496266+Yuvraj-Pradhan-27@users.noreply.github.com> Date: Sat, 23 Nov 2024 01:45:04 +0530 Subject: [PATCH 07/37] DOC: Fixed spelling of 'behaviour' to 'behavior' (#60398) --- pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 35b576da87ed7..4fa8b86fa4c16 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -567,7 +567,7 @@ def __arrow_c_stream__(self, requested_schema=None): Export the pandas Series as an Arrow C stream PyCapsule. This relies on pyarrow to convert the pandas Series to the Arrow - format (and follows the default behaviour of ``pyarrow.Array.from_pandas`` + format (and follows the default behavior of ``pyarrow.Array.from_pandas`` in its handling of the index, i.e. to ignore it). This conversion is not necessarily zero-copy. @@ -2226,7 +2226,7 @@ def drop_duplicates( 5 hippo Name: animal, dtype: object - With the 'keep' parameter, the selection behaviour of duplicated values + With the 'keep' parameter, the selection behavior of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. @@ -3451,7 +3451,7 @@ def sort_values( 4 5.0 dtype: float64 - Sort values ascending order (default behaviour) + Sort values ascending order (default behavior) >>> s.sort_values(ascending=True) 1 1.0 @@ -4098,7 +4098,7 @@ def swaplevel( In the following example, we will swap the levels of the indices. Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. + in a similar manner. Note that column-wise is the default behavior. By not supplying any arguments for i and j, we swap the last and second to last indices. From ee0902a832b7fa3e5821ada176566301791e09ec Mon Sep 17 00:00:00 2001 From: ZKaoChi <1953542921@qq.com> Date: Sat, 23 Nov 2024 04:20:39 +0800 Subject: [PATCH 08/37] BUG: Convert output type in Excel for MultiIndex with period levels (#60182) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/formats/excel.py | 8 ++++++ pandas/tests/io/excel/test_style.py | 26 ++++++++++++++++++ pandas/tests/io/excel/test_writers.py | 38 +++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 120ee978292d6..1d55fc3ed7b84 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -690,6 +690,7 @@ I/O - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 6a3e215de3f96..5fde6577e9f95 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -37,6 +37,7 @@ DataFrame, Index, MultiIndex, + Period, PeriodIndex, ) import pandas.core.common as com @@ -803,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: allow_fill=levels._can_hold_na, fill_value=levels._na_value, ) + # GH#60099 + if isinstance(values[0], Period): + values = values.to_timestamp() for i, span_val in spans.items(): mergestart, mergeend = None, None @@ -827,6 +831,10 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # Format hierarchical rows with non-merged values. for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): + # GH#60099 + if isinstance(indexcolval, Period): + indexcolval = indexcolval.to_timestamp() + yield CssExcelCell( row=self.rowcounter + idx, col=gcolidx, diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index f70e65e34c584..71ef1201e523f 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -9,6 +9,9 @@ from pandas import ( DataFrame, + MultiIndex, + Timestamp, + period_range, read_excel, ) import pandas._testing as tm @@ -333,3 +336,26 @@ def test_styler_to_s3(s3_public_bucket, s3so): f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so ) tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("merge_cells", [True, False, "columns"]) +def test_format_hierarchical_rows_periodindex(merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + formatter = ExcelFormatter(df, merge_cells=merge_cells) + formatted_cells = formatter._format_hierarchical_rows() + + for cell in formatted_cells: + if cell.row != 0 and cell.col == 0: + assert isinstance( + cell.val, Timestamp + ), "Period should be converted to Timestamp" diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 19fe9855dbb85..18948de72200a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -23,6 +23,7 @@ MultiIndex, date_range, option_context, + period_range, ) import pandas._testing as tm @@ -335,6 +336,43 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_cells", [True, False, "columns"]) + def test_excel_round_trip_with_periodindex(self, tmp_excel, merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + df.to_excel(tmp_excel, merge_cells=merge_cells) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) + expected = DataFrame( + {"A": [1, 2]}, + MultiIndex.from_arrays( + [ + [ + pd.to_datetime("2006-10-06 00:00:00"), + pd.to_datetime("2006-10-07 00:00:00"), + ], + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + time_format = ( + "datetime64[s]" if tmp_excel.endswith(".ods") else "datetime64[us]" + ) + expected.index = expected.index.set_levels( + expected.index.levels[0].astype(time_format), level=0 + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "engine,ext", From a2ceb52a9b3f8a3bb1ec6ad9729acca3ff1f6707 Mon Sep 17 00:00:00 2001 From: partev Date: Mon, 25 Nov 2024 13:36:08 -0500 Subject: [PATCH 09/37] fix issue #60410 (#60412) --- doc/source/user_guide/window.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index e25c4c2441920..0581951d5bfad 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -567,9 +567,9 @@ One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass \alpha = \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 + \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ + \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ + 1 - e^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 \end{cases} One must specify precisely one of **span**, **center of mass**, **half-life** From e78df6f8f2ed2ca892e4caff61d8edfdfce2e981 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 26 Nov 2024 00:09:31 +0530 Subject: [PATCH 10/37] DOC: fix SA01 for pandas.errors.UnsortedIndexError (#60404) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 772793702f8b8..2a8b5f15d95f3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -119,7 +119,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ - -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 68bd70603abae..d6d2fd82858ed 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -100,6 +100,11 @@ class UnsortedIndexError(KeyError): Subclass of `KeyError`. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by its index. + DataFrame.set_index : Set the DataFrame index using existing columns. + Examples -------- >>> df = pd.DataFrame( From cbd90ba5c403dc5449ac3b3a821ddc442c5ddc7d Mon Sep 17 00:00:00 2001 From: lfffkh <167774581+lfffkh@users.noreply.github.com> Date: Tue, 26 Nov 2024 02:40:37 +0800 Subject: [PATCH 11/37] Fix BUG: Cannot shift Intervals that are not closed='right' (the default) (#60407) first --- pandas/core/arrays/interval.py | 4 +++- pandas/tests/frame/methods/test_shift.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f47ef095a8409..bbbf1d9ca60bd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1055,7 +1055,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = IntervalArray.from_breaks( + [fill_value] * (empty_len + 1), closed=self.closed + ) else: empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index a0f96ff111444..b52240c208493 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -757,3 +757,12 @@ def test_shift_with_offsets_freq_empty(self): df_shifted = DataFrame(index=shifted_dates) result = df.shift(freq=offset) tm.assert_frame_equal(result, df_shifted) + + def test_series_shift_interval_preserves_closed(self): + # GH#60389 + ser = Series( + [pd.Interval(1, 2, closed="right"), pd.Interval(2, 3, closed="right")] + ) + result = ser.shift(1) + expected = Series([np.nan, pd.Interval(1, 2, closed="right")]) + tm.assert_series_equal(result, expected) From bca4b1c0ccb3fe5a74bb945d01bc372a90cc0e11 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 26 Nov 2024 00:11:18 +0530 Subject: [PATCH 12/37] DOC: fix SA01,ES01 for pandas.errors.PossibleDataLossError (#60403) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2a8b5f15d95f3..03c6b8dc077b9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.NumbaUtilError SA01" \ -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ - -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index d6d2fd82858ed..5642b0d33b4f7 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -638,6 +638,15 @@ class PossibleDataLossError(Exception): """ Exception raised when trying to open a HDFStore file when already opened. + This error is triggered when there is a potential risk of data loss due to + conflicting operations on an HDFStore file. It serves to prevent unintended + overwrites or data corruption by enforcing exclusive access to the file. + + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + HDFStore.open : Open an HDFStore file in the specified mode. + Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP From 582740b3c0a1ef211b490abbbd94c192b0367af5 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 26 Nov 2024 00:11:50 +0530 Subject: [PATCH 13/37] DOC: fix SA01 for pandas.errors.OutOfBoundsTimedelta (#60402) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/np_datetime.pyx | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 03c6b8dc077b9..2817d84bad7b8 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 193556b2697a9..1b7f04fe17238 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -201,6 +201,10 @@ class OutOfBoundsTimedelta(ValueError): Representation should be within a timedelta64[ns]. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + Examples -------- >>> pd.date_range(start="1/1/1700", freq="B", periods=100000) From 9fab4eb5fb0132731a360fdd8ea3b31d95de187f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 26 Nov 2024 00:12:23 +0530 Subject: [PATCH 14/37] DOC: fix SA01,ES01 for pandas.errors.DuplicateLabelError (#60399) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2817d84bad7b8..8bafcb8944e14 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -109,7 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.std SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 5642b0d33b4f7..70e523688c644 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -393,6 +393,19 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. + This error is typically encountered when performing operations on objects + with `allows_duplicate_labels=False` and the operation would result in + duplicate labels in the index. Duplicate labels can lead to ambiguities + in indexing and reduce data integrity. + + See Also + -------- + Series.set_flags : Return a new ``Series`` object with updated flags. + DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags. + Series.reindex : Conform ``Series`` object to new index with optional filling logic. + DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling + logic. + Examples -------- >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( From 00c2207cbe8e429d11db5973794b604041cd74b2 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 26 Nov 2024 00:12:55 +0530 Subject: [PATCH 15/37] DOC: fix SA01,ES01 for pandas.errors.InvalidIndexError (#60400) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8bafcb8944e14..58b0d26f7e2f3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -110,7 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ - -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 70e523688c644..814feadfb06e4 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -425,6 +425,16 @@ class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. + This exception is triggered when a user attempts to access or manipulate + data in a pandas DataFrame or Series using an index key that is not valid + for the given object. This may occur in cases such as using a malformed + slice, a mismatched key for a ``MultiIndex``, or attempting to access an index + element that does not exist. + + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) From 39dcbb4a06beaee7dd584a28958db72b9bba7531 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 26 Nov 2024 00:20:15 +0530 Subject: [PATCH 16/37] DOC: fix SA01 for pandas.errors.NumExprClobberingError (#60401) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 58b0d26f7e2f3..246a907c5052c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -111,7 +111,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.var SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ - -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 814feadfb06e4..70d839d817114 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -538,6 +538,11 @@ class NumExprClobberingError(NameError): to 'numexpr'. 'numexpr' is the default engine value for these methods if the numexpr package is installed. + See Also + -------- + eval : Evaluate a Python expression as a string using various backends. + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + Examples -------- >>> df = pd.DataFrame({"abs": [1, 1, 1]}) From 0b6cece3acda1ae6e4f582d8276851b02aeac1ea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 11:35:37 -0800 Subject: [PATCH 17/37] TST: Avoid hashing np.timedelta64 without unit (#60416) --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d1177c23c612..611b92eb022d6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1254,7 +1254,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts_internal(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") result_td = algos.value_counts_internal(td) tm.assert_series_equal(result_td, exp_td) From 759874e4d4290f873cabc3eb525df203bd77b7e4 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 25 Nov 2024 16:01:47 -0800 Subject: [PATCH 18/37] BUG: Fix formatting of complex numbers with exponents (#60417) Fix formatting of complex numbers with exponents --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/formats/format.py | 2 +- pandas/tests/io/formats/test_to_string.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1d55fc3ed7b84..1b12735f0e7c1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -789,6 +789,7 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 861f5885f80c6..4f87b1a30ca61 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1749,7 +1749,7 @@ def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[st # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] # Therefore, the imaginary part is the 4th and 3rd last elements, # and the real part is everything before the imaginary part - trimmed = re.split(r"([j+-])", x) + trimmed = re.split(r"(? Date: Mon, 25 Nov 2024 16:03:56 -0800 Subject: [PATCH 19/37] Bump pypa/cibuildwheel from 2.21.3 to 2.22.0 (#60414) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.21.3 to 2.22.0. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.21.3...v2.22.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 354402c572ade..32ca5573ac08a 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -152,7 +152,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.3 + uses: pypa/cibuildwheel@v2.22.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From ab757ff8c352a0f02fbad22b463f0cfeaee88d3c Mon Sep 17 00:00:00 2001 From: sooooooing <126747506+sooooooing@users.noreply.github.com> Date: Wed, 27 Nov 2024 03:38:15 +0900 Subject: [PATCH 20/37] DOC: fix docstring api.types.is_re_compilable (#60419) * fix docstring api.types.is_re_compilable * fix lint error --- ci/code_checks.sh | 1 - pandas/core/dtypes/inference.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 246a907c5052c..9faa2a249613b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 6adb34ff0f777..918d107f2ce6c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool: Parameters ---------- obj : The object to check + The object to check if the object can be compiled into a regex pattern instance. Returns ------- bool Whether `obj` can be compiled as a regex pattern. + See Also + -------- + api.types.is_re : Check if the object is a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re_compilable From be41966198eebf2f56d32b7f0f8d6c3bc4283e61 Mon Sep 17 00:00:00 2001 From: "Olivier H." Date: Tue, 26 Nov 2024 19:41:31 +0100 Subject: [PATCH 21/37] DOC: Clarifying pandas.melt method documentation by replacing "massage" by "reshape" (#60420) Clarifying pandas.melt method documentation by replacing "massage" by "reshape" Meanwhile, "massage" is correct in a colloquial sense to mean transforming or reshaping data. This is far from accessible for a non-English speaker (as I am). Using the term `reshape` or `transform` is more meaningful while being accurate. --- doc/source/user_guide/reshaping.rst | 2 +- pandas/core/reshape/melt.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 3347f3a2534f4..8c5e98791a9ef 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` ar .. image:: ../_static/reshaping_melt.png The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt` -are useful to massage a :class:`DataFrame` into a format where one or more columns +are useful to reshape a :class:`DataFrame` into a format where one or more columns are *identifier variables*, while all other columns, considered *measured variables*, are "unpivoted" to the row axis, leaving just two non-identifier columns, "variable" and "value". The names of those columns can be customized diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bfd8e3ccd2f7c..f4cb82816bbcf 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,9 +51,9 @@ def melt( """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - This function is useful to massage a DataFrame into a format where one + This function is useful to reshape a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to + columns are considered measured variables (`value_vars`), and are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. From fd570f466e05f8944c67735d12c04eaab2d37478 Mon Sep 17 00:00:00 2001 From: partev Date: Tue, 26 Nov 2024 14:35:12 -0500 Subject: [PATCH 22/37] replace twitter->X (#60426) --- doc/source/conf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index ddbda0aa3bf65..677ee6274b093 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -242,7 +242,6 @@ "external_links": [], "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", - "twitter_url": "https://twitter.com/pandas_dev", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", @@ -258,6 +257,11 @@ # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) "show_version_warning_banner": False, "icon_links": [ + { + "name": "X", + "url": "https://x.com/pandas_dev", + "icon": "fa-brands fa-square-x-twitter", + }, { "name": "Mastodon", "url": "https://fosstodon.org/@pandas_dev", From 98f7e4deeff26a5ef993ee27104387a1a6e0d3d3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 26 Nov 2024 21:07:06 +0100 Subject: [PATCH 23/37] String dtype: use ObjectEngine for indexing for now correctness over performance (#60329) --- pandas/_libs/index.pyi | 3 + pandas/_libs/index.pyx | 25 +++++ pandas/core/indexes/base.py | 3 +- pandas/tests/indexes/string/test_indexing.py | 104 ++++++++++++++++-- .../io/parser/common/test_common_basic.py | 3 +- 5 files changed, 124 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index bf6d8ba8973d3..3af2856d2fbbf 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # np.ndarray[..., ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1506a76aa94a6..688f943760d1f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine): raise KeyError(val) return str(val) +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + bint uses_na + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif self._checknull(val): + return self.na_value + else: + raise KeyError(val) + cdef class DatetimeEngine(Int64Engine): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d4ba7e01ebfa9..165fe109c4c94 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -876,7 +876,7 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringEngine(target_values) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 511db2c6a33d8..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 From 106f33cfce16f4e08f6ca5bd0e6e440ec9a94867 Mon Sep 17 00:00:00 2001 From: Jason Mok <106209849+jasonmokk@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:28:39 -0600 Subject: [PATCH 24/37] DOC: Add type hint for squeeze method (#60415) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 039bdf9c36ee7..a6be17a654aa7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any: return result @final - def squeeze(self, axis: Axis | None = None): + def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: """ Squeeze 1 dimensional axis objects into scalars. From 1d809c3c45c5cd0b32211790fa84172e7f48b270 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Thu, 28 Nov 2024 02:46:42 +0800 Subject: [PATCH 25/37] BUG: fix NameError raised when specifying dtype with string having "[pyarrow]" while PyArrow is not installed (#60413) * Add test * Fix * Add note * Update pandas/tests/dtypes/test_common.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * update * Fix doc warning --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/dtypes.py | 2 ++ pandas/tests/dtypes/test_common.py | 7 +++++++ 3 files changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1b12735f0e7c1..4bd31de185bb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -761,6 +761,7 @@ ExtensionArray - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) +- Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`) - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 96b0aa16940a6..e5d1033de4457 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2344,6 +2344,8 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string == "string[pyarrow]": # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" try: diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e338fb1331734..5a59617ce5bd3 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -835,3 +835,10 @@ def test_pandas_dtype_string_dtypes(string_storage): with pd.option_context("string_storage", string_storage): result = pandas_dtype("string") assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +@td.skip_if_installed("pyarrow") +def test_construct_from_string_without_pyarrow_installed(): + # GH 57928 + with pytest.raises(ImportError, match="pyarrow>=10.0.1 is required"): + pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]") From 89e2efcd95beb68d381cccfb4a68c4d2ab421df3 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:35:05 +0900 Subject: [PATCH 26/37] fixed comparison of string column to mixed object column (issue #60228) --- pandas/core/ops/array_ops.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..dc63a4fb04a3a 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -38,6 +38,7 @@ is_bool_dtype, is_list_like, is_numeric_v_string_like, + is_string_dtype, is_object_dtype, is_scalar, ) @@ -53,7 +54,7 @@ from pandas.core import roperator from pandas.core.computation import expressions -from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.construction import ensure_wrapped_if_datetimelike, array from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -321,6 +322,17 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) + if ( + (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or + (is_object_dtype(lvalues) and is_string_dtype(rvalues)) + ): + if lvalues.dtype.name == "string" and rvalues.dtype == object: + lvalues = lvalues.astype("string") + rvalues = array(rvalues, dtype="string") + elif rvalues.dtype.name == "string" and lvalues.dtype == object: + rvalues = rvalues.astype("string") + lvalues = array(lvalues, dtype="string") + if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From a832418c25e676c59129dadda9aeafb97982dfb1 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:39:52 +0900 Subject: [PATCH 27/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/ops/array_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index dc63a4fb04a3a..f7dc17ae00ac7 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -333,6 +333,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: rvalues = rvalues.astype("string") lvalues = array(lvalues, dtype="string") + if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 7152b018f7cf0bf668e8a96bea4262091478ba34 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:41:24 +0900 Subject: [PATCH 28/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/ops/array_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index f7dc17ae00ac7..dc63a4fb04a3a 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -333,7 +333,6 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: rvalues = rvalues.astype("string") lvalues = array(lvalues, dtype="string") - if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 61ffbc0d14dc70919fc76e8a14476bd1f2df96aa Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Fri, 22 Nov 2024 09:12:19 +0900 Subject: [PATCH 29/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/tests/series/methods/test_compare.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 2a57d5139b62c..74d2e35eaf502 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -138,3 +138,15 @@ def test_compare_datetime64_and_string(): tm.assert_series_equal(result_eq1, expected_eq) tm.assert_series_equal(result_eq2, expected_eq) tm.assert_series_equal(result_neq, expected_neq) + +def test_comparison_string_mixed_object(): + pd.options.future.infer_string = True + + ser_string = pd.Series(["a", "b"], dtype="string") + ser_mixed = pd.Series([1, "b"]) + + result = ser_string == ser_mixed + expected = pd.Series([False, True], dtype="boolean") + tm.assert_series_equal(result, expected) + + pd.options.future.infer_string = False \ No newline at end of file From 104a60f97ab0748f99e008b944174ea7aa84d0af Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Fri, 22 Nov 2024 09:46:41 +0900 Subject: [PATCH 30/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/ops/array_ops.py | 16 +++++++++------- pandas/tests/series/methods/test_compare.py | 4 +++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4bd31de185bb4..1de45f4c5c140 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -771,6 +771,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :func:`comparison_op` where comparing a ``string`` dtype array with an ``object`` dtype array containing mixed types would raise a ``TypeError`` when PyArrow-based strings are enabled. (:issue:`60228`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index dc63a4fb04a3a..255633786ede1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -38,9 +38,9 @@ is_bool_dtype, is_list_like, is_numeric_v_string_like, - is_string_dtype, is_object_dtype, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -54,7 +54,10 @@ from pandas.core import roperator from pandas.core.computation import expressions -from pandas.core.construction import ensure_wrapped_if_datetimelike, array +from pandas.core.construction import ( + array as pd_array, + ensure_wrapped_if_datetimelike, +) from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -322,16 +325,15 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if ( - (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or - (is_object_dtype(lvalues) and is_string_dtype(rvalues)) + if (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or ( + is_object_dtype(lvalues) and is_string_dtype(rvalues) ): if lvalues.dtype.name == "string" and rvalues.dtype == object: lvalues = lvalues.astype("string") - rvalues = array(rvalues, dtype="string") + rvalues = pd_array(rvalues, dtype="string") elif rvalues.dtype.name == "string" and lvalues.dtype == object: rvalues = rvalues.astype("string") - lvalues = array(lvalues, dtype="string") + lvalues = pd_array(lvalues, dtype="string") if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 74d2e35eaf502..93fef353457a7 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -139,7 +139,9 @@ def test_compare_datetime64_and_string(): tm.assert_series_equal(result_eq2, expected_eq) tm.assert_series_equal(result_neq, expected_neq) + def test_comparison_string_mixed_object(): + # Issue https://github.com/pandas-dev/pandas/issues/60228 pd.options.future.infer_string = True ser_string = pd.Series(["a", "b"], dtype="string") @@ -149,4 +151,4 @@ def test_comparison_string_mixed_object(): expected = pd.Series([False, True], dtype="boolean") tm.assert_series_equal(result, expected) - pd.options.future.infer_string = False \ No newline at end of file + pd.options.future.infer_string = False From 658f7576937b28fa6447c57651adeca674ddbcbf Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 28 Nov 2024 10:27:41 +0900 Subject: [PATCH 31/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/arrays/arrow/array.py | 8 +++++++- pandas/core/ops/array_ops.py | 16 +--------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0c93db0afb07..0c56474141b8f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -726,7 +726,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): try: - result = pc_func(self._pa_array, self._box_pa(other)) + if pa.types.is_string(self._pa_array.type): + other_array = self._box_pa(other) + if pa.types.is_string(other_array.type): + other_array = other_array.cast(pa.large_string()) + result = pc_func(self._pa_array, other_array) + else: + result = pc_func(self._pa_array, self._box_pa(other)) except pa.ArrowNotImplementedError: # TODO: could this be wrong if other is object dtype? # in which case we need to operate pointwise? diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 255633786ede1..983a3df57e369 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -40,7 +40,6 @@ is_numeric_v_string_like, is_object_dtype, is_scalar, - is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -54,10 +53,7 @@ from pandas.core import roperator from pandas.core.computation import expressions -from pandas.core.construction import ( - array as pd_array, - ensure_wrapped_if_datetimelike, -) +from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -325,16 +321,6 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or ( - is_object_dtype(lvalues) and is_string_dtype(rvalues) - ): - if lvalues.dtype.name == "string" and rvalues.dtype == object: - lvalues = lvalues.astype("string") - rvalues = pd_array(rvalues, dtype="string") - elif rvalues.dtype.name == "string" and lvalues.dtype == object: - rvalues = rvalues.astype("string") - lvalues = pd_array(lvalues, dtype="string") - if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 0129c686b888fa6c5e3f4b550627db3c82c63b17 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:39:52 +0900 Subject: [PATCH 32/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/ops/array_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 255633786ede1..249dc6337a64b 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -335,6 +335,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: rvalues = rvalues.astype("string") lvalues = pd_array(lvalues, dtype="string") + if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 65ae2e2a507addf802f628e264d38ea058437755 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 21 Nov 2024 14:41:24 +0900 Subject: [PATCH 33/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/ops/array_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 249dc6337a64b..255633786ede1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -335,7 +335,6 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: rvalues = rvalues.astype("string") lvalues = pd_array(lvalues, dtype="string") - if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 497e8a6314aa5d8f1ff7549f266657ee2a9c379c Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 28 Nov 2024 10:27:41 +0900 Subject: [PATCH 34/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/arrays/arrow/array.py | 8 +++++++- pandas/core/ops/array_ops.py | 16 +--------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0c93db0afb07..0c56474141b8f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -726,7 +726,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): try: - result = pc_func(self._pa_array, self._box_pa(other)) + if pa.types.is_string(self._pa_array.type): + other_array = self._box_pa(other) + if pa.types.is_string(other_array.type): + other_array = other_array.cast(pa.large_string()) + result = pc_func(self._pa_array, other_array) + else: + result = pc_func(self._pa_array, self._box_pa(other)) except pa.ArrowNotImplementedError: # TODO: could this be wrong if other is object dtype? # in which case we need to operate pointwise? diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 255633786ede1..983a3df57e369 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -40,7 +40,6 @@ is_numeric_v_string_like, is_object_dtype, is_scalar, - is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -54,10 +53,7 @@ from pandas.core import roperator from pandas.core.computation import expressions -from pandas.core.construction import ( - array as pd_array, - ensure_wrapped_if_datetimelike, -) +from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -325,16 +321,6 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if (is_string_dtype(lvalues) and is_object_dtype(rvalues)) or ( - is_object_dtype(lvalues) and is_string_dtype(rvalues) - ): - if lvalues.dtype.name == "string" and rvalues.dtype == object: - lvalues = lvalues.astype("string") - rvalues = pd_array(rvalues, dtype="string") - elif rvalues.dtype.name == "string" and lvalues.dtype == object: - rvalues = rvalues.astype("string") - lvalues = pd_array(lvalues, dtype="string") - if should_extension_dispatch(lvalues, rvalues) or ( (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and lvalues.dtype != object From 56bc8b17a75fe542c2a8a0bcb611703b4bff8186 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 28 Nov 2024 14:19:01 +0900 Subject: [PATCH 35/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/arrays/arrow/array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0c56474141b8f..8f8372add8cf7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -728,9 +728,10 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: try: if pa.types.is_string(self._pa_array.type): other_array = self._box_pa(other) + self_array = self._pa_array.cast(pa.large_string()) if pa.types.is_string(other_array.type): other_array = other_array.cast(pa.large_string()) - result = pc_func(self._pa_array, other_array) + result = pc_func(self_array, other_array) else: result = pc_func(self._pa_array, self._box_pa(other)) except pa.ArrowNotImplementedError: From b301ac086185604d95e1c602420255d772e0c505 Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Thu, 28 Nov 2024 23:51:58 +0900 Subject: [PATCH 36/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/arrays/arrow/array.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8f8372add8cf7..c9ad25f174054 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -726,14 +726,21 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): try: - if pa.types.is_string(self._pa_array.type): - other_array = self._box_pa(other) - self_array = self._pa_array.cast(pa.large_string()) + other_array = self._box_pa(other) + if isinstance(other_array.type, pa.DictionaryType): + other_array = other_array.dictionary_decode() if pa.types.is_string(other_array.type): - other_array = other_array.cast(pa.large_string()) + other_array = other_array.cast(pa.string()) + if pa.types.is_string(self._pa_array.type): + self_array = self._pa_array + if not pa.types.is_string(other_array.type): + other_array = other_array.cast(pa.string()) result = pc_func(self_array, other_array) else: - result = pc_func(self._pa_array, self._box_pa(other)) + result = pc_func(self._pa_array, other_array) + if result.type == pa.string(): + result = result.cast(self._pa_array.type) + return type(self)(result) except pa.ArrowNotImplementedError: # TODO: could this be wrong if other is object dtype? # in which case we need to operate pointwise? From 01887f8f321928572c69a917c8d98a39093d79ec Mon Sep 17 00:00:00 2001 From: parkjaewon Date: Fri, 29 Nov 2024 00:15:13 +0900 Subject: [PATCH 37/37] BUG (string dtype): comparison of string column to mixed object column fails #60228 --- pandas/core/arrays/arrow/array.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c9ad25f174054..8f8372add8cf7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -726,21 +726,14 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): try: - other_array = self._box_pa(other) - if isinstance(other_array.type, pa.DictionaryType): - other_array = other_array.dictionary_decode() - if pa.types.is_string(other_array.type): - other_array = other_array.cast(pa.string()) if pa.types.is_string(self._pa_array.type): - self_array = self._pa_array - if not pa.types.is_string(other_array.type): - other_array = other_array.cast(pa.string()) + other_array = self._box_pa(other) + self_array = self._pa_array.cast(pa.large_string()) + if pa.types.is_string(other_array.type): + other_array = other_array.cast(pa.large_string()) result = pc_func(self_array, other_array) else: - result = pc_func(self._pa_array, other_array) - if result.type == pa.string(): - result = result.cast(self._pa_array.type) - return type(self)(result) + result = pc_func(self._pa_array, self._box_pa(other)) except pa.ArrowNotImplementedError: # TODO: could this be wrong if other is object dtype? # in which case we need to operate pointwise?