Skip to content

Commit

Permalink
clean up X, y data types in powerlift
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Sep 1, 2024
1 parent b024b4f commit f2f2a4c
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 45 deletions.
6 changes: 2 additions & 4 deletions docs/benchmarks/ebm-benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -294,12 +294,12 @@
" elif trial.method.name == \"knn\":\n",
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsClassifier(**knn_params))])\n",
" elif trial.method.name == \"aplr\":\n",
" fit_params[\"y\"] = fit_params[\"y\"].astype(str)\n",
" y_test = y_test.astype(str)\n",
" ct.sparse_threshold = 0 # APLR only handles dense\n",
" if trial.task.name in {\"CIFAR_10\", \"Fashion-MNIST\", \"Devnagari-Script\", \"mnist_784\"}:\n",
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
" est = Pipeline([(\"ct\", ct), (\"est\", APLRClassifier(**aplr_params))])\n",
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
" y_test = y_test.astype(str).to_numpy()\n",
" else:\n",
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
"\n",
Expand Down Expand Up @@ -350,8 +350,6 @@
" if trial.task.name in {\"Airlines_DepDelay_10M\"}:\n",
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
" est = Pipeline([(\"ct\", ct), (\"est\", APLRRegressor(**aplr_params))])\n",
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
" y_test = y_test.astype(str).to_numpy()\n",
" else:\n",
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
"\n",
Expand Down
5 changes: 4 additions & 1 deletion python/powerlift/powerlift/bench/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
"""

import pandas as pd
import numpy as np
from typing import Dict, Iterable
from typing import Type, TypeVar
from typing import Union, Optional, List
from dataclasses import dataclass
from numbers import Number
import time

from powerlift.bench.store import Store
from powerlift.bench.store import Store, MIMETYPE_SERIES


@dataclass(frozen=True)
Expand Down Expand Up @@ -105,6 +106,8 @@ def data(self, aliases: Iterable[str]) -> List[object]:
name = alias_map[alias]
asset = name_to_asset[name]
parsed = BytesParser.deserialize(asset.mimetype, asset.embedded)
if asset.mimetype == MIMETYPE_SERIES:
parsed = np.array(parsed)
outputs.append(parsed)
return outputs

Expand Down
96 changes: 56 additions & 40 deletions python/powerlift/powerlift/bench/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def serialize(cls, obj):
orig_close = bstream.close
bstream.close = lambda: None
try:
obj.astype(dtype=object).to_frame(name="Target").to_parquet(
obj.to_frame(name="Target").to_parquet(
bstream, compression="Brotli", index=False
)
finally:
Expand Down Expand Up @@ -1287,6 +1287,7 @@ def retrieve_openml(
suite = openml.study.get_suite(suite_id)
tasks = suite.tasks.copy()
random.Random(1337).shuffle(tasks)
cat_type = pd.CategoricalDtype(ordered=False)
for task_id in tqdm(tasks, desc=source):
task = openml.tasks.get_task(
task_id,
Expand All @@ -1313,48 +1314,23 @@ def retrieve_openml(
)

if task.task_type_id == openml.tasks.TaskType.SUPERVISED_CLASSIFICATION:
problem = (
"binary"
if dataset.qualities["NumberOfClasses"] == 2
else "multiclass"
)
classes, y = np.unique(y.values, return_inverse=True)
problem = "binary" if len(classes) == 2 else "multiclass"

# for benchmarking we do not care about the original target strings
y = pd.Series(np.unique(y, return_inverse=True)[1])
y = pd.Series(y, dtype=np.int16)
elif task.task_type_id == openml.tasks.TaskType.SUPERVISED_REGRESSION:
problem = "regression"
y = pd.Series(y, dtype=np.float64)
else:
raise Exception(f"Unrecognized task_type_id {task.task_type_id}.")

for col_name, cat in zip(X.columns, categorical_mask):
col = X[col_name]

if pd.api.types.is_sparse(col):
col = col.sparse.to_dense()
X[col_name] = col

if col.dtype.name == "category":
if not cat:
raise Exception(
f"Categorical type mismatch. Was CategoricalDtype but indicated non-categorical."
)
if col.cat.ordered:
# OpenML incorrectly indicates these as ordered
X[col_name] = col.cat.as_unordered()
elif col.dtype.name == "object":
if cat:
X[col_name] = col.astype(pd.CategoricalDtype(ordered=False))
else:
X[col_name] = col.astype(float)
elif np.issubdtype(col.dtype, np.floating) or np.issubdtype(
col.dtype, np.integer
):
if cat:
raise Exception(
f"Categorical type mismatch. Was continuous but indicated categorical."
)
if cat:
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
else:
raise Exception(f"Unrecognized data type {col.dtype.name}.")
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)

meta = {
"name": name,
Expand Down Expand Up @@ -1480,6 +1456,7 @@ def retrieve_catboost_50k(
if cache_dir is not None:
cache_dir = pathlib.Path(cache_dir, "catboost_50k")

cat_type = pd.CategoricalDtype(ordered=False)
for dataset in tqdm(datasets, desc="catboost_50k"):
name = dataset["name"]
X_name = f"{name}.X.parquet"
Expand All @@ -1492,14 +1469,34 @@ def retrieve_catboost_50k(
target = dataset["target"]
X = df.drop(target, axis=1)
y = df[target]
problem = dataset["problem"]
if dataset["problem"] == "classification":
problem = "binary" if len(y.unique()) == 2 else "multiclass"
problem_type = dataset["problem"]

if problem_type == "classification":
classes, y = np.unique(y.values, return_inverse=True)
problem = "binary" if len(classes) == 2 else "multiclass"

# for benchmarking we do not care about the original target strings
y = pd.Series(y, dtype=np.int16)
elif problem_type == "regression":
problem = "regression"
y = pd.Series(y, dtype=np.float64)
else:
raise Exception(f"Unrecognized problem {problem_type}.")

categorical_mask = [dt.kind == "O" for dt in X.dtypes]

for col_name, cat in zip(X.columns, categorical_mask):
col = X[col_name]
if cat:
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
else:
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)

meta = {
"name": name,
"problem": problem,
"source": "catboost_50k",
"categorical_mask": [dt.kind == "O" for dt in X.dtypes],
"categorical_mask": categorical_mask,
"feature_names": list(X.columns),
}
supervised = SupervisedDataset(X, y, meta)
Expand Down Expand Up @@ -1531,6 +1528,7 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
)
dataset_names.extend([("regression", name) for name in regression_dataset_names])

cat_type = pd.CategoricalDtype(ordered=False)
for problem_type, dataset_name in tqdm(dataset_names, desc="pmlb"):
name = dataset_name
X_name = f"{name}.X.parquet"
Expand All @@ -1542,14 +1540,32 @@ def retrieve_pmlb(cache_dir: str = None) -> Generator[SupervisedDataset, None, N
df = fetch_data(dataset_name)
X = df.drop("target", axis=1)
y = df["target"]
problem = problem_type
if problem_type == "classification":
problem = "binary" if len(y.unique()) == 2 else "multiclass"
classes, y = np.unique(y.values, return_inverse=True)
problem = "binary" if len(classes) == 2 else "multiclass"

# for benchmarking we do not care about the original target strings
y = pd.Series(y, dtype=np.int16)
elif problem_type == "regression":
problem = "regression"
y = pd.Series(y, dtype=np.float64)
else:
raise Exception(f"Unrecognized problem_type {problem_type}.")

categorical_mask = [dt.kind == "O" for dt in X.dtypes]

for col_name, cat in zip(X.columns, categorical_mask):
col = X[col_name]
if cat:
X[col_name] = pd.Series(col, dtype=cat_type, name=col.name)
else:
X[col_name] = pd.Series(col, dtype=np.float64, name=col.name)

meta = {
"name": name,
"problem": problem,
"source": "pmlb",
"categorical_mask": [dt.kind == "O" for dt in X.dtypes],
"categorical_mask": categorical_mask,
"feature_names": list(X.columns),
}
supervised = SupervisedDataset(X, y, meta)
Expand Down

0 comments on commit f2f2a4c

Please sign in to comment.