Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

row_to_names improvement #1379

Merged
merged 19 commits into from
Jul 13, 2024
187 changes: 141 additions & 46 deletions janitor/functions/row_to_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import warnings
from functools import singledispatch

import numpy as np
import pandas as pd
Expand All @@ -15,7 +15,7 @@
@deprecated_alias(row_number="row_numbers", remove_row="remove_rows")
def row_to_names(
df: pd.DataFrame,
row_numbers: int | list = 0,
row_numbers: int | list | slice = 0,
remove_rows: bool = False,
remove_rows_above: bool = False,
reset_index: bool = False,
Expand Down Expand Up @@ -47,7 +47,7 @@ def row_to_names(
1 9 y
>>> df.row_to_names([0,1], remove_rows=True, reset_index=True)
nums chars
6 x
6 x
0 9 y

Remove rows above the elevated row and the elevated row itself.
Expand All @@ -72,8 +72,7 @@ def row_to_names(
Args:
df: A pandas DataFrame.
row_numbers: Position of the row(s) containing the variable names.
Note that indexing starts from 0. It can also be a list,
in which case, a MultiIndex column is created.
It can be an integer, a list or a slice.
Defaults to 0 (first row).
remove_rows: Whether the row(s) should be removed from the DataFrame.
remove_rows_above: Whether the row(s) above the selected row should
Expand All @@ -83,53 +82,149 @@ def row_to_names(
Returns:
A pandas DataFrame with set column names.
""" # noqa: E501
if not pd.options.mode.copy_on_write:
df = df.copy()

check("row_numbers", row_numbers, [int, list])
if isinstance(row_numbers, list):
for entry in row_numbers:
check("entry in the row_numbers argument", entry, [int])

warnings.warn(
"The function row_to_names will, in the official 1.0 release, "
"change its behaviour to reset the dataframe's index by default. "
"You can prepare for this change right now by explicitly setting "
"`reset_index=True` when calling on `row_to_names`."

return _row_to_names(
row_numbers,
df=df,
remove_rows=remove_rows,
remove_rows_above=remove_rows_above,
reset_index=reset_index,
)


@singledispatch
def _row_to_names(
row_numbers, df, remove_rows, remove_rows_above, reset_index
) -> pd.DataFrame:
"""
Base function for row_to_names.
"""
raise TypeError(
"row_numbers should be either an integer, "
"a slice or a list; "
f"instead got type {type(row_numbers).__name__}"
)
# should raise if positional indexers are missing
# IndexError: positional indexers are out-of-bounds
headers = df.iloc[row_numbers]


@_row_to_names.register(int)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Elevate a single row, given by integer position, to the column names.

    Args:
        row_numbers: Integer position of the row holding the names.
            Negative positions count from the end, as with `iloc`.
        df: A pandas DataFrame.
        remove_rows: Whether the elevated row should be dropped.
        remove_rows_above: Whether the rows above the elevated row
            should be dropped.
        reset_index: Whether the returned DataFrame gets a fresh
            0..n-1 index.

    Returns:
        A pandas DataFrame with the new column names.

    Raises:
        IndexError: If `row_numbers` is out of bounds (raised by `iloc`).
    """
    # Operate on a sliced object rather than `df` itself, so the column
    # assignment below is not made on the caller's DataFrame object.
    df_ = df[:]
    # iloc validates the position and raises IndexError when out-of-bounds.
    headers = df_.iloc[row_numbers]
    df_.columns = headers
    df_.columns.name = None

    if not remove_rows and not remove_rows_above:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    # Normalise a negative position; otherwise np.arange below would start
    # from a negative number and select the wrong rows.
    if row_numbers < 0:
        row_numbers += len_df

    if remove_rows_above:
        # Keep everything after the header row, and the header row itself
        # unless it is to be removed as well.
        first_kept = row_numbers + 1 if remove_rows else row_numbers
        indexer = np.arange(first_kept, len_df)
    else:
        # Only remove_rows is set here: keep every row except the header.
        mask = np.ones(len_df, dtype=np.bool_)
        mask[row_numbers] = False
        indexer = np.arange(len_df)[mask]

    # Rebuild the frame column by column, keyed by position so that
    # duplicate labels cannot collide; the labels are restored afterwards.
    arrays = {
        num: arr._values[indexer] for num, (_, arr) in enumerate(df_.items())
    }
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df


@_row_to_names.register(slice)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Elevate the row(s) selected by a slice to the column names.

    A slice selecting exactly one row yields a flat Index; otherwise the
    selected rows are combined into a MultiIndex, one level per row.

    Args:
        row_numbers: A slice of row positions. Open-ended and negative
            bounds are resolved against the length of the DataFrame.
        df: A pandas DataFrame.
        remove_rows: Whether the elevated row(s) should be dropped.
        remove_rows_above: Whether the rows above the elevated row(s)
            should be dropped.
        reset_index: Whether the returned DataFrame gets a fresh
            0..n-1 index.

    Returns:
        A pandas DataFrame with the new column names.

    Raises:
        ValueError: If the slice has a step.
    """
    if row_numbers.step is not None:
        raise ValueError(
            "The step argument for slice is not supported in row_to_names."
        )
    # Operate on a sliced object rather than `df` itself, so the column
    # assignment below is not made on the caller's DataFrame object.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    if len(headers) == 1:
        # iloc[0] always returns a Series, even for a single-column frame
        # (squeeze() would collapse a 1x1 selection to a scalar).
        df_.columns = pd.Index(headers.iloc[0])
        df_.columns.name = None
    else:
        # One entry per column, each holding that column's header values.
        headers = [array._values for _, array in headers.items()]
        df_.columns = pd.MultiIndex.from_tuples(headers)

    if not remove_rows and not remove_rows_above:
        return df_.reset_index(drop=True) if reset_index else df_

    len_df = len(df_)
    # Resolve None / negative bounds into concrete positions so they can be
    # passed to np.arange.
    start, stop, _ = row_numbers.indices(len_df)

    if remove_rows_above:
        # Keep everything after the header rows, and the header rows
        # themselves unless they are to be removed as well.
        first_kept = stop if remove_rows else start
        indexer = np.arange(first_kept, len_df)
    else:
        # Only remove_rows is set here: keep every row outside the slice.
        mask = np.ones(len_df, dtype=np.bool_)
        mask[row_numbers] = False
        indexer = np.arange(len_df)[mask]

    # Rebuild the frame column by column, keyed by position so that
    # duplicate labels cannot collide; the labels are restored afterwards.
    arrays = {
        num: arr._values[indexer] for num, (_, arr) in enumerate(df_.items())
    }
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df

df.columns = headers
df.columns.name = None

df_index = df.index
@_row_to_names.register(list)  # noqa: F811
def _row_to_names_dispatch(  # noqa: F811
    row_numbers, df, remove_rows, remove_rows_above, reset_index
):
    """
    Elevate the row(s) given by a list of positions to the column names.

    A list selecting exactly one row yields a flat Index; otherwise the
    selected rows are combined into a MultiIndex, one level per row.

    Args:
        row_numbers: A list of integer row positions.
        df: A pandas DataFrame.
        remove_rows: Whether the elevated row(s) should be dropped.
        remove_rows_above: Not supported for a list; must be falsy.
        reset_index: Whether the returned DataFrame gets a fresh
            0..n-1 index.

    Returns:
        A pandas DataFrame with the new column names.

    Raises:
        ValueError: If `remove_rows_above` is set.
    """
    if remove_rows_above:
        raise ValueError(
            "The remove_rows_above argument is applicable "
            "only if the row_numbers argument is an integer "
            "or a slice."
        )

    # `check` is a project helper defined outside this block; presumably it
    # raises when an entry is not an int -- confirm against its definition.
    for entry in row_numbers:
        check("entry in the row_numbers argument", entry, [int])

    # Operate on a sliced object rather than `df` itself, so the column
    # assignment below is not made on the caller's DataFrame object.
    df_ = df[:]
    headers = df_.iloc[row_numbers]
    if len(headers) == 1:
        # iloc[0] always returns a Series, even for a single-column frame
        # (squeeze() would collapse a 1x1 selection to a scalar).
        df_.columns = headers.iloc[0]
        df_.columns.name = None
    else:
        # One entry per column, each holding that column's header values.
        headers = [array._values for _, array in headers.items()]
        df_.columns = pd.MultiIndex.from_tuples(headers)

    if not remove_rows:
        return df_.reset_index(drop=True) if reset_index else df_

    # Keep every row whose position is not listed in row_numbers.
    len_df = len(df_)
    mask = np.ones(len_df, dtype=np.bool_)
    mask[row_numbers] = False
    indexer = np.arange(len_df)[mask]

    # Rebuild the frame column by column, keyed by position so that
    # duplicate labels cannot collide; the labels are restored afterwards.
    arrays = {
        num: arr._values[indexer] for num, (_, arr) in enumerate(df_.items())
    }
    if reset_index:
        df_index = pd.RangeIndex(start=0, stop=indexer.size)
    else:
        df_index = df_.index[indexer]
    _df = pd.DataFrame(data=arrays, index=df_index, copy=False)
    _df.columns = df_.columns
    return _df
2 changes: 1 addition & 1 deletion janitor/polars/complete.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
try:
import polars as pl
import polars.selectors as cs
from polars.type_aliases import ColumnNameOrSelector
from polars._typing import ColumnNameOrSelector
except ImportError:
import_message(
submodule="polars",
Expand Down
2 changes: 1 addition & 1 deletion janitor/polars/pivot_longer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

try:
import polars as pl
from polars.type_aliases import ColumnNameOrSelector
from polars._typing import ColumnNameOrSelector
except ImportError:
import_message(
submodule="polars",
Expand Down
Loading
Loading