Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT Add make_series and make_dataframe #798

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,46 +101,6 @@ This page lists all available functions and classes of `skrub`.

deduplicate

.. raw:: html

<h2>Dataframes operations</h2>

.. autosummary::
:toctree: generated/
:template: function.rst
:nosignatures:
:caption: DataFrames operations

dataframe.get_df_namespace

.. raw:: html

<h3>Pandas</h3>

.. autosummary::
:toctree: generated/
:template: function.rst
:nosignatures:
:caption: Pandas operations

dataframe.is_pandas
dataframe.pd_aggregate
dataframe.pd_join

.. raw:: html

<h3>Polars</h3>

.. autosummary::
:toctree: generated/
:template: function.rst
:nosignatures:
:caption: Polars operations

dataframe.is_polars
dataframe.pl_aggregate
dataframe.pl_join

.. raw:: html

<h2>Data download and generation</h2>
Expand Down
56 changes: 19 additions & 37 deletions skrub/_agg_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
from typing import Iterable

import numpy as np
from numpy.typing import ArrayLike
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted

from skrub._dataframe._namespace import get_df_namespace
from skrub._dataframe._pandas import _parse_argument
from skrub._utils import atleast_1d_or_none, atleast_2d_or_none
from skrub.dataframe import DataFrameLike, SeriesLike
from skrub.dataframe._namespace import get_df_namespace
from skrub.dataframe._pandas import _parse_argument

NUM_OPERATIONS = ["sum", "mean", "std", "min", "max", "hist", "value_counts"]
CATEG_OPERATIONS = ["mode", "count", "value_counts"]
Expand Down Expand Up @@ -52,10 +50,10 @@ def split_num_categ_operations(operations: list[str]) -> tuple[list[str], list[s


def check_missing_columns(
X: DataFrameLike,
columns: list[str],
error_msg: str,
) -> None:
X,
columns,
error_msg,
):
"""All elements of main_key must belong to the columns of X.

Parameters
Expand Down Expand Up @@ -161,13 +159,13 @@ class AggJoiner(BaseEstimator, TransformerMixin):

def __init__(
self,
aux_table: DataFrameLike | Iterable[DataFrameLike] | str | Iterable[str],
aux_table,
*,
aux_key: str | Iterable[str],
main_key: str | Iterable[str],
cols: str | Iterable[str] | None = None,
operation: str | Iterable[str] | None = None,
suffix: str | Iterable[str] | None = None,
aux_key,
main_key,
cols=None,
operation=None,
suffix=None,
):
self.aux_table = aux_table
self.aux_key = aux_key
Expand All @@ -176,11 +174,7 @@ def __init__(
self.operation = operation
self.suffix = suffix

def fit(
self,
X: DataFrameLike,
y: ArrayLike | SeriesLike | None = None,
) -> "AggJoiner":
def fit(self, X, y=None):
"""Aggregate auxiliary tables based on the main keys.

Parameters
Expand Down Expand Up @@ -221,7 +215,7 @@ def fit(

return self

def transform(self, X: DataFrameLike) -> DataFrameLike:
def transform(self, X):
"""Left-join pre-aggregated tables on `X`.

Parameters
Expand All @@ -248,18 +242,14 @@ def transform(self, X: DataFrameLike) -> DataFrameLike:

return X

def _screen(
self,
aux_table: DataFrameLike,
y: DataFrameLike | SeriesLike | ArrayLike,
) -> DataFrameLike:
def _screen(self, aux_table, y):
"""Only keep aggregated features which correlation with
y is above some threshold.
"""
# TODO: Add logic
return aux_table

def check_input(self, X: DataFrameLike) -> None:
def check_input(self, X):
"""Perform a check on column names data type and suffixes.

Parameters
Expand Down Expand Up @@ -452,11 +442,7 @@ def __init__(
self.operation = operation
self.suffix = suffix

def fit(
self,
X: DataFrameLike,
y: DataFrameLike | SeriesLike | ArrayLike,
) -> "AggTarget":
def fit(self, X, y):
"""Aggregate the target ``y`` based on keys from ``X``.

Parameters
Expand Down Expand Up @@ -501,7 +487,7 @@ def fit(

return self

def transform(self, X: DataFrameLike) -> DataFrameLike:
def transform(self, X):
"""Left-join pre-aggregated tables on `X`.

Parameters
Expand All @@ -524,11 +510,7 @@ def transform(self, X: DataFrameLike) -> DataFrameLike:
right_on=self.main_key_,
)

def check_input(
self,
X: DataFrameLike,
y: DataFrameLike | SeriesLike | ArrayLike,
) -> DataFrameLike:
def check_input(self, X, y):
"""Perform a check on column names data type and suffixes.

Parameters
Expand Down
File renamed without changes.
14 changes: 5 additions & 9 deletions skrub/dataframe/_namespace.py → skrub/_dataframe/_namespace.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import sys
from types import ModuleType

import pandas as pd

import skrub.dataframe._pandas as skrub_pd
import skrub.dataframe._polars as skrub_pl
from skrub.dataframe._types import DataFrameLike
import skrub._dataframe._pandas as skrub_pd
import skrub._dataframe._polars as skrub_pl


def is_pandas(dataframe: DataFrameLike) -> bool:
def is_pandas(dataframe):
"""Check whether the input is a Pandas dataframe.

Parameters
Expand All @@ -24,7 +22,7 @@ def is_pandas(dataframe: DataFrameLike) -> bool:
return isinstance(dataframe, pd.DataFrame)


def is_polars(dataframe: DataFrameLike) -> bool:
def is_polars(dataframe):
"""Check whether the input is a Polars dataframe or lazyframe.

Parameters
Expand All @@ -45,9 +43,7 @@ def is_polars(dataframe: DataFrameLike) -> bool:
return isinstance(dataframe, (pl.DataFrame, pl.LazyFrame))


def get_df_namespace(
*dfs: DataFrameLike | list[DataFrameLike],
) -> tuple[ModuleType, ModuleType]:
def get_df_namespace(*dfs):
"""Get the namespaces of dataframes.

Introspects dataframes and returns their skrub namespace object
Expand Down
77 changes: 56 additions & 21 deletions skrub/dataframe/_pandas.py → skrub/_dataframe/_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,63 @@
Pandas specialization of the aggregate and join operation.
"""
import re
from collections.abc import Callable
from itertools import product
from typing import Iterable

import numpy as np
import pandas as pd

from skrub._utils import atleast_1d_or_none


def make_dataframe(X, index=None):
    """Convert a dictionary of columns into a Pandas dataframe.

    Parameters
    ----------
    X : mapping from column name to 1d iterable
        Input data to convert.

    index : 1d array-like, default=None
        The index of the dataframe. Passed through unchanged to
        ``pandas.DataFrame``.

    Returns
    -------
    X : Pandas dataframe
        Converted output.
    """
    return pd.DataFrame(X, index=index)


def make_series(X, index=None, name=None):
    """Convert a 1d array into a Pandas series.

    Parameters
    ----------
    X : 1d iterable
        Input data to convert.

    index : 1d array-like, default=None
        The index of the series. Passed through unchanged to
        ``pandas.Series``.

    name : str, default=None
        The name of the series.

    Returns
    -------
    X : Pandas series
        Converted output.
    """
    return pd.Series(X, index=index, name=name)


def aggregate(
table: pd.DataFrame,
key: str | Iterable[str],
cols_to_agg: str | Iterable[str],
num_operations: str | Iterable[str] = ("mean",),
categ_operations: str | Iterable[str] = ("mode",),
suffix: str | None = None,
) -> pd.DataFrame:
table,
key,
cols_to_agg,
num_operations=("mean",),
categ_operations=("mode",),
suffix=None,
):
"""Aggregates a :obj:`pandas.DataFrame`.

This function uses the ``dataframe.groupby(key).agg`` method from Pandas.
Expand Down Expand Up @@ -107,11 +146,11 @@ def aggregate(


def join(
left: pd.DataFrame,
right: pd.DataFrame,
left_on: str | Iterable[str],
right_on: str | Iterable[str],
) -> pd.DataFrame:
left,
right,
left_on,
right_on,
):
"""Left join two :obj:`pandas.DataFrame`.

This function uses the ``dataframe.merge`` method from Pandas.
Expand Down Expand Up @@ -148,9 +187,7 @@ def join(
)


def get_named_agg(
table: pd.DataFrame, cols: list[str], operations: list[str]
) -> tuple[dict, dict]:
def get_named_agg(table, cols, operations):
"""Map aggregation tuples to their output key.

The dictionary has the form: output_key = (column, aggfunc).
Expand Down Expand Up @@ -195,7 +232,7 @@ def get_named_agg(
return named_agg, value_counts


def _parse_argument(operation: str) -> tuple[str, int | None]:
def _parse_argument(operation):
"""Split a text input into a function name and its argument.

Parameters
Expand Down Expand Up @@ -237,9 +274,7 @@ def _parse_argument(operation: str) -> tuple[str, int | None]:
}


def _get_aggfunc(
serie: pd.Series, op_root: str, n_bins: int
) -> tuple[str | Callable, dict]:
def _get_aggfunc(serie, op_root, n_bins):
"""Map operation roots to their pandas agg functions.

When args is provided for histogram or value_counts,
Expand Down
Loading
Loading