Skip to content

Commit

Permalink
better handling of timedelta in tablereport
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromedockes committed Nov 21, 2024
1 parent ef9d0c1 commit 21333e8
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 8 deletions.
4 changes: 3 additions & 1 deletion skrub/_column_associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def _onehot_encode(df, n_bins):
output = np.zeros((n_cols, n_bins, n_rows), dtype=bool)
for col_idx in range(n_cols):
col = sbd.col_by_idx(df, col_idx)
if sbd.is_numeric(col):
if sbd.is_duration(col):
col = sbd.total_seconds(col)

Check warning on line 110 in skrub/_column_associations.py

View check run for this annotation

Codecov / codecov/patch

skrub/_column_associations.py#L110

Added line #L110 was not covered by tests
if sbd.is_numeric(col) or sbd.is_any_date(col):
col = sbd.to_float32(col)
if _CATEGORICAL_THRESHOLD <= sbd.n_unique(col):
_onehot_encode_numbers(sbd.to_numpy(col), n_bins, output[col_idx])
Expand Down
48 changes: 48 additions & 0 deletions skrub/_dataframe/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"is_pandas_object",
"is_any_date",
"to_datetime",
"is_duration",
"is_categorical",
"to_categorical",
"is_all_null",
Expand Down Expand Up @@ -101,6 +102,8 @@
"slice",
"replace",
"with_columns",
"abs",
"total_seconds",
]

#
Expand Down Expand Up @@ -809,6 +812,21 @@ def _to_datetime_polars(col, format, strict=True):
raise ValueError("Failed to convert to datetime") from e


@dispatch
def is_duration(col):
    """Tell whether ``col`` holds durations (time deltas).

    This is the generic fallback: it always raises ``NotImplementedError``.
    The actual checks are registered per backend through
    ``is_duration.specialize``.
    """
    raise NotImplementedError()


@is_duration.specialize("pandas", argument_type="Column")
def _is_duration_pandas(col):
    """pandas implementation of ``is_duration``.

    Returns the result of ``pd.api.types.is_timedelta64_dtype`` for ``col``.
    """
    has_timedelta_dtype = pd.api.types.is_timedelta64_dtype(col)
    return has_timedelta_dtype


@is_duration.specialize("polars", argument_type="Column")
def _is_duration_polars(col):
    """polars implementation of ``is_duration``.

    Compares the column's dtype against ``pl.Duration``.
    """
    column_dtype = col.dtype
    return column_dtype == pl.Duration

Check warning on line 827 in skrub/_dataframe/_common.py

View check run for this annotation

Codecov / codecov/patch

skrub/_dataframe/_common.py#L827

Added line #L827 was not covered by tests


@dispatch
def is_categorical(col):
    """Generic fallback for ``is_categorical``.

    Always raises ``NotImplementedError``; backend-specific versions are
    presumably registered via ``is_categorical.specialize`` further down the
    module (not visible in this excerpt).
    """
    raise NotImplementedError()
Expand Down Expand Up @@ -1226,3 +1244,33 @@ def with_columns(df, **new_cols):
cols = {col_name: col(df, col_name) for col_name in column_names(df)}
cols.update({n: make_column_like(df, c, n) for n, c in new_cols.items()})
return make_dataframe_like(df, cols)


@dispatch
def abs(col):
    """Absolute value of a column.

    This is the generic fallback: it always raises ``NotImplementedError``.
    The pandas and polars versions are registered through ``abs.specialize``.

    Note that inside this module the name shadows the builtin ``abs``.
    """
    raise NotImplementedError()


@abs.specialize("pandas", argument_type="Column")
def _abs_pandas(col):
    """pandas implementation of ``abs``: delegates to ``col.abs()``."""
    absolute_values = col.abs()
    return absolute_values

Check warning on line 1256 in skrub/_dataframe/_common.py

View check run for this annotation

Codecov / codecov/patch

skrub/_dataframe/_common.py#L1256

Added line #L1256 was not covered by tests


@abs.specialize("polars", argument_type="Column")
def _abs_polars(col):
    """polars implementation of ``abs``: delegates to ``col.abs()``."""
    absolute_values = col.abs()
    return absolute_values

Check warning on line 1261 in skrub/_dataframe/_common.py

View check run for this annotation

Codecov / codecov/patch

skrub/_dataframe/_common.py#L1261

Added line #L1261 was not covered by tests


@dispatch
def total_seconds(col):
    """Express a duration column as a number of seconds.

    This is the generic fallback: it always raises ``NotImplementedError``.
    The pandas and polars versions are registered through
    ``total_seconds.specialize``.
    """
    raise NotImplementedError()


@total_seconds.specialize("pandas")
def _total_seconds_pandas(col):
    """pandas implementation of ``total_seconds``.

    Delegates to the ``.dt.total_seconds()`` accessor method of ``col``.
    """
    timedelta_accessor = col.dt
    return timedelta_accessor.total_seconds()

Check warning on line 1271 in skrub/_dataframe/_common.py

View check run for this annotation

Codecov / codecov/patch

skrub/_dataframe/_common.py#L1271

Added line #L1271 was not covered by tests


@total_seconds.specialize("polars")
def _total_seconds_polars(col):
    """polars implementation of ``total_seconds``, returning float seconds.

    polars' ``dt.total_seconds()`` yields whole seconds as integers, which
    would silently turn every sub-second duration into 0 (and make the
    result differ from the pandas backend, whose ``dt.total_seconds()``
    returns floats). To keep the fractional part, the duration is first
    expressed in microseconds and then converted to floating-point seconds.

    Parameters
    ----------
    col : polars Series with a Duration dtype

    Returns
    -------
    polars Series of Float64
        The durations in seconds, with microsecond resolution.
    """
    microseconds = col.dt.total_microseconds()
    # Dividing by 1e6 (exactly representable) rather than multiplying by
    # 1e-6 avoids an extra rounding error on each value.
    return microseconds.cast(pl.Float64) / 1e6

Check warning on line 1276 in skrub/_dataframe/_common.py

View check run for this annotation

Codecov / codecov/patch

skrub/_dataframe/_common.py#L1276

Added line #L1276 was not covered by tests
14 changes: 13 additions & 1 deletion skrub/_reporting/_data/templates/column-summary.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,28 +31,39 @@ <h3 class="margin-r-m">
<dt>Unique values</dt>
<dd>{{ column.n_unique | format_number }} ({{ column.unique_proportion | format_percent }})</dd>
{% endif %}

{% if column["duration_unit"] %}
{% set unit = " {}s".format(column['duration_unit']) %}
{% else %}
{% set unit = "" %}
{% endif %}

{% if "mean" in column %}
<dt>Mean ± Std</dt>
<dd>{{ column["mean"] | format_number }} ±
{{ column["standard_deviation"] | format_number }}
{{ unit }}
</dd>
{% endif %}
{% if column.quantiles %}
<dt>Median ± IQR</dt>
<dd>{{ column.quantiles[0.5] | format_number }} ±
{{ column["inter_quartile_range"] | format_number}}
{{ unit }}
</dd>

<dt>Min | Max</dt>
<dd>
{{ column.quantiles[0.0] | format_number }} |
{{ column.quantiles[1.0] | format_number }}
{{ unit }}
</dd>
{% elif "min" in column %}
<dt>Min | Max</dt>
<dd>
{{ column.min | format_number }} |
{{ column.max | format_number }}
{{ unit }}
</dd>
{% endif %}
{% endif %}
Expand All @@ -70,7 +81,8 @@ <h3 class="margin-r-m">
<strong>Constant value:</strong>
<div class="copybutton-grid">
<div class="box">
<pre id="{{ val_id }}">{{ column.constant_value }}</pre>
<pre id="{{ val_id }}"
data-copy-text="{{ column.constant_value.__repr__() }}">{{ column.constant_value }}</pre>
{{ buttons.copybutton(val_id) }}
</div>
</div>
Expand Down
4 changes: 3 additions & 1 deletion skrub/_reporting/_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,15 @@ def _adjust_fig_size(fig, ax, target_w, target_h):


@_plot
def histogram(col, color=COLOR_0):
def histogram(col, duration_unit=None, color=COLOR_0):
"""Histogram for a numeric column."""
col = sbd.drop_nulls(col)
values = sbd.to_numpy(col)
fig, ax = plt.subplots()
_despine(ax)
ax.hist(values, color=color)
if duration_unit is not None:
ax.set_xlabel(f"{duration_unit.capitalize()}s")

Check warning on line 126 in skrub/_reporting/_plotting.py

View check run for this annotation

Codecov / codecov/patch

skrub/_reporting/_plotting.py#L126

Added line #L126 was not covered by tests
if sbd.is_any_date(col):
_rotate_ticklabels(ax)
_adjust_fig_size(fig, ax, 2.0, 1.0)
Expand Down
18 changes: 13 additions & 5 deletions skrub/_reporting/_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def _add_nulls_summary(summary, column, dataframe_summary):


def _add_value_counts(summary, column, *, dataframe_summary, with_plots):
if sbd.is_numeric(column) or sbd.is_any_date(column):
if sbd.is_numeric(column) or sbd.is_any_date(column) or sbd.is_duration(column):
summary["high_cardinality"] = True
return
n_unique, value_counts = _utils.top_k_value_counts(column, k=10)
Expand Down Expand Up @@ -208,24 +208,32 @@ def _add_numeric_summary(
summary, column, dataframe_summary, with_plots, order_by_column
):
del dataframe_summary
if not sbd.is_numeric(column):
return
first_value = sbd.to_list(sbd.head(column, 1))[0]
if sbd.is_duration(column):
summary["is_duration"] = True
column, duration_unit = _utils.duration_to_numeric(column)

Check warning on line 214 in skrub/_reporting/_summarize.py

View check run for this annotation

Codecov / codecov/patch

skrub/_reporting/_summarize.py#L213-L214

Added lines #L213 - L214 were not covered by tests
else:
summary["is_duration"] = False
if not sbd.is_numeric(column):
return
duration_unit = None
summary["duration_unit"] = duration_unit
std = sbd.std(column)
summary["standard_deviation"] = float("nan") if std is None else std
summary["mean"] = sbd.mean(column)
quantiles = _utils.quantiles(column)
summary["inter_quartile_range"] = quantiles[0.75] - quantiles[0.25]
if quantiles[0.0] == quantiles[1.0]:
summary["value_is_constant"] = True
summary["constant_value"] = quantiles[0.0]
summary["constant_value"] = first_value
return
summary["value_is_constant"] = False
summary["quantiles"] = quantiles
if not with_plots:
return
if order_by_column is None:
summary["histogram_plot"] = _plotting.histogram(
column, color=_plotting.COLORS[0]
column, duration_unit=duration_unit, color=_plotting.COLORS[0]
)
else:
summary["line_plot"] = _plotting.line(order_by_column, column)
19 changes: 19 additions & 0 deletions skrub/_reporting/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,22 @@ def default(self, value):
if isinstance(value, np.floating):
return float(value)
raise


def duration_to_numeric(col):
    """Convert a duration column to numbers expressed in a readable unit.

    The column is first converted to seconds with ``sbd.total_seconds``. The
    unit is then picked from the 0.9 quantile of the absolute values (in
    seconds), so that typical values have a convenient magnitude:

    - below 1e-3: microseconds
    - below 1: milliseconds
    - below one hour (3600 s): seconds
    - below one day: hours
    - below one year (365.2425 days): days
    - otherwise: years

    There is no "minute" unit; durations shorter than an hour are reported
    in seconds.

    Parameters
    ----------
    col : column
        A duration column accepted by ``sbd.total_seconds``.

    Returns
    -------
    tuple (column, str)
        The rescaled numeric column and the singular name of the chosen unit
        (e.g. ``"hour"``).
    """
    hour = 3600
    day = hour * 24
    year = day * 365.2425

    as_seconds = sbd.total_seconds(col)
    # Use a high quantile of the magnitudes rather than the max so that a
    # few extreme values do not dictate the unit.
    magnitude = sbd.quantile(sbd.abs(as_seconds), 0.9)

    if magnitude < 1e-3:
        converted, unit = as_seconds * 1e6, "microsecond"
    elif magnitude < 1.0:
        converted, unit = as_seconds * 1e3, "millisecond"
    elif magnitude < hour:
        converted, unit = as_seconds, "second"
    elif magnitude < day:
        converted, unit = as_seconds / hour, "hour"
    elif magnitude < year:
        converted, unit = as_seconds / day, "day"
    else:
        converted, unit = as_seconds / year, "year"
    return converted, unit

Check warning on line 133 in skrub/_reporting/_utils.py

View check run for this annotation

Codecov / codecov/patch

skrub/_reporting/_utils.py#L132-L133

Added lines #L132 - L133 were not covered by tests

0 comments on commit 21333e8

Please sign in to comment.