diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 20fe8cbab1c9f..4e55aedbb2845 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -897,3 +897,13 @@ def register_converter_cb(key: str) -> None:
         "(at which point this option will be deprecated).",
         validator=is_one_of_factory([True, False]),
     )
+
+    cf.register_option(
+        "usecols_use_order",
+        False,
+        ": bool\n "
+        "Whether usecols parameter will use order of input when "
+        "making a DataFrame. \n This feature will be default in pandas 3.0 "
+        "(at which point this option will be deprecated).",
+        validator=is_one_of_factory([True, False]),
+    )
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 4fbd71ed03662..7d345791b5a7d 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -26,6 +26,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas.errors import (
@@ -1516,8 +1518,10 @@ def read(self, nrows: int | None = None) -> DataFrame:
 
             if hasattr(self, "orig_options"):
                 dtype_arg = self.orig_options.get("dtype", None)
+                usecols = self.orig_options.get("usecols", None)
             else:
                 dtype_arg = None
+                usecols = None
 
             if isinstance(dtype_arg, dict):
                 dtype = defaultdict(lambda: None)  # type: ignore[var-annotated]
@@ -1530,6 +1534,27 @@ def read(self, nrows: int | None = None) -> DataFrame:
             else:
                 dtype = None
 
+            if (
+                get_option("future.usecols_use_order")
+                and usecols is not None
+                and not callable(usecols)
+            ):
+                # None and callables carry no ordering, so only list-likes apply.
+                requested = list(usecols)
+                # Reorder only when the selection maps 1:1 onto the parsed columns.
+                if len(requested) == len(columns) == len(set(requested)):
+                    if all(isinstance(col, str) for col in requested):
+                        # Labels: the requested order is the output order.
+                        reordered = requested
+                    else:
+                        # Positions: ``columns`` is in file order, i.e. the order
+                        # of ``sorted(requested)``, so rank each position.
+                        rank = {pos: i for i, pos in enumerate(sorted(requested))}
+                        reordered = [columns[rank[pos]] for pos in requested]
+                    if set(reordered) == set(columns):
+                        columns = reordered
+                        col_dict = {k: col_dict[k] for k in columns}
+
             if dtype is not None:
                 new_col_dict = {}
                 for k, v in col_dict.items():
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
index 82b42beb38ae0..e09f88ba3f113 100644
--- a/pandas/tests/io/parser/usecols/test_usecols_basic.py
+++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config.config import option_context
+
 from pandas.errors import ParserError
 
 from pandas import (
@@ -545,3 +547,42 @@ def test_usecols_dtype(all_parsers):
         {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("usecols", [(3, 0, 2), ("a", "d", "b")])
+@pytest.mark.parametrize("usecols_use_order", (True, False))
+def test_usecols_order(all_parsers, usecols, usecols_use_order):
+    # TODO: add portion in doc for 3.0 transition
+    parser = all_parsers
+    pyarrow_flag = False
+    # Header is deliberately non-alphabetical: file order != sorted label order
+    data = """\
+d,c,b,a
+1,2,3,0
+4,5,6,0
+7,8,9,0
+10,11,12,13"""
+
+    if parser.engine == "pyarrow":
+        if isinstance(usecols[0], int):
+            msg = "The pyarrow engine does not allow 'usecols' to be integer column"
+            with pytest.raises(ValueError, match=msg):
+                parser.read_csv(StringIO(data), usecols=usecols)
+            return
+        else:
+            # looks like pyarrow already considers column order by default.
+            # Modifies test to account for it in selecting expected df
+            pyarrow_flag = True
+
+    if usecols_use_order or pyarrow_flag:
+        expected = DataFrame(
+            {"a": [0, 0, 0, 13], "d": [1, 4, 7, 10], "b": [3, 6, 9, 12]}
+        )
+    else:
+        expected = DataFrame(
+            {"d": [1, 4, 7, 10], "b": [3, 6, 9, 12], "a": [0, 0, 0, 13]}
+        )
+
+    with option_context("future.usecols_use_order", usecols_use_order):
+        result = parser.read_csv(StringIO(data), usecols=usecols)
+    tm.assert_frame_equal(result, expected)