MAINT update pre-commit-config #929

Merged · 5 commits · Jun 7, 2024
Changes from all commits
24 changes: 7 additions & 17 deletions .pre-commit-config.yaml
@@ -3,25 +3,15 @@ repos:
rev: v4.3.0
hooks:
- id: check-yaml
exclude: doc/
- id: end-of-file-fixer
exclude: doc/
- id: trailing-whitespace
exclude: doc/
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
hooks:
- id: ruff
args: ["--fix", "--output-format=full"]
- repo: https://github.com/psf/black
rev: 22.8.0
rev: 23.3.0
hooks:
- id: black
exclude: doc/
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
- id: flake8
files: skrub/
types: [file, python]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
files: skrub/
args: ["--profile", "black", "--filter-files"]
22 changes: 13 additions & 9 deletions benchmarks/bench_fuzzy_join_sparse_vs_dense.py
@@ -9,13 +9,11 @@
"""

import math
from pathlib import Path
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data
from argparse import ArgumentParser
import numbers
import warnings
from argparse import ArgumentParser
from collections.abc import Iterable
from pathlib import Path
from time import perf_counter
from typing import Literal

@@ -32,6 +30,8 @@
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data


def _numeric_encoding(
@@ -355,8 +355,10 @@ def fuzzy_join(

if numerical_match not in ["string", "number"]:
raise ValueError(
"Parameter 'numerical_match' should be either 'string' or 'number', "
f"got {numerical_match!r}. ",
(
"Parameter 'numerical_match' should be either 'string' or 'number', "
f"got {numerical_match!r}. "
),
)

for param in [on, left_on, right_on]:
@@ -401,9 +403,11 @@ def fuzzy_join(
# Warn if presence of missing values
if main_table[main_cols].isna().any().any():
warnings.warn(
"You are merging on missing values. "
"The output correspondence will be random or missing. "
"To avoid unexpected errors you can drop them. ",
(
"You are merging on missing values. "
"The output correspondence will be random or missing. "
"To avoid unexpected errors you can drop them. "
),
UserWarning,
stacklevel=2,
)
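The two hunks above only add parentheses so black can keep the concatenated message strings within the line limit, but they document real behaviour of this benchmark's local fuzzy_join: numerical_match accepts only 'string' or 'number', and merging on a column that contains missing values emits a UserWarning. A sketch of both code paths, assuming the signature defined in this file:

import pandas as pd

left = pd.DataFrame({"city": ["Paris", "Londn", None]})
right = pd.DataFrame({"city": ["Paris", "London", "Rome"]})

# numerical_match must be "string" or "number"; anything else raises
# the ValueError shown above.
joined = fuzzy_join(left, right, on="city", numerical_match="string")
# The None in left["city"] triggers the "merging on missing values" warning.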
37 changes: 19 additions & 18 deletions benchmarks/bench_gap_divergence.py
@@ -21,37 +21,37 @@
Commit: dc77f610e240d2613c99436d01f98db4e4e7922c
"""

import scipy as sp
from argparse import ArgumentParser
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

from argparse import ArgumentParser
from skrub._gap_encoder import (
GapEncoder,
GapEncoderColumn,
_multiplicative_update_h,
_multiplicative_update_w,
batch_lookup,
check_input,
)
from joblib import Parallel, delayed
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from skrub import TableVectorizer
from pathlib import Path

from sklearn.pipeline import Pipeline
from utils import (
monitor,
default_parser,
find_result,
get_classification_datasets,
get_regression_datasets,
monitor,
)

from skrub import TableVectorizer
from skrub._gap_encoder import (
GapEncoder,
GapEncoderColumn,
_multiplicative_update_h,
_multiplicative_update_w,
batch_lookup,
check_input,
)


@@ -182,7 +182,8 @@ def fit(self, X, y=None):
"employee_salaries",
# "road_safety", # https://github.com/skrub-data/skrub/issues/622
"drug_directory",
# "traffic_violations", # Takes way too long and seems to cause memory leaks
# Takes way too long and seems to cause memory leaks
# "traffic_violations",
],
},
save_as=benchmark_name,
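Beyond the regrouping, the imports spell out this benchmark's evaluation loop: a TableVectorizer feeding a histogram gradient-boosting estimator, cross-validated over the dataset list above. A sketch of that shape, as an assumed reconstruction rather than the benchmark's exact code:

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from skrub import TableVectorizer

# Vectorize heterogeneous columns, then classify.
pipeline = Pipeline(
    [
        ("vectorizer", TableVectorizer()),
        ("classifier", HistGradientBoostingClassifier()),
    ]
)
# results = cross_validate(pipeline, X, y)  # X, y: one of the datasets above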
16 changes: 9 additions & 7 deletions benchmarks/bench_gap_encoder_hp.py
@@ -2,18 +2,20 @@
Benchmark hyperparameters of GapEncoder on traffic_violations dataset
"""

from utils import default_parser, find_result, monitor
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skrub.datasets import fetch_traffic_violations
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from skrub import GapEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from loguru import logger
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from utils import default_parser, find_result, monitor

from skrub import GapEncoder
from skrub.datasets import fetch_traffic_violations

#######################################################
# Benchmarking accuracy and speed on traffic_violations
16 changes: 9 additions & 7 deletions benchmarks/bench_gap_es_score.py
@@ -2,24 +2,26 @@
Benchmark hyperparameters of GapEncoder on traffic_violations dataset
"""

from utils import default_parser, find_result, monitor
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skrub.datasets import fetch_traffic_violations
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from utils import default_parser, find_result, monitor

from skrub import GapEncoder
from skrub._gap_encoder import (
GapEncoderColumn,
_beta_divergence,
batch_lookup,
_multiplicative_update_h,
_multiplicative_update_w,
batch_lookup,
)
import seaborn as sns
import matplotlib.pyplot as plt
from skrub.datasets import fetch_traffic_violations


class ModifiedGapEncoderColumn(GapEncoderColumn):
23 changes: 12 additions & 11 deletions benchmarks/bench_minhash_batch_number.py
@@ -64,13 +64,11 @@ class MinHashEncoder(BaseEstimator, TransformerMixin):
batch_per_job: int, default=1
Number of batches to be processed in each job.
n_jobs : int, default=None
The number of jobs to run in parallel.
The hash computations for all unique elements are parallelized.
None means 1 unless in a
`joblib.parallel_backend context <https://joblib.readthedocs.io/en/latest/parallel.html>`_.
-1 means using all processors.
See `Scikit-learn Glossary <https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_
for more details.
The number of jobs to run in parallel. The hash computations for all unique
elements are parallelized. None means 1 unless in a `joblib.parallel_backend
context <https://joblib.readthedocs.io/en/latest/parallel.html>`_. -1 means
using all processors. See `Scikit-learn Glossary
<https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_ for more details.

Attributes
----------
@@ -319,7 +317,8 @@ def transform(self, X) -> np.array:
# Handle missing values
missing_mask = (
~(X == X) # Find np.nan
| (X == None) # Find None. Note: `X is None` doesn't work.
# Find None. Note: `X is None` doesn't work.
| (X == None) # noqa: E711
| (X == "") # Find empty strings
)

@@ -406,9 +405,11 @@ def plot(df: pd.DataFrame):
# Create a new column merging batched and batch_per_job
# If batch is False, ignore batch_per_job
df["config"] = df.apply(
lambda row: f"batched={row['batched']}, batch_per_job={row['batch_per_job']}"
if row["batched"]
else "batched=False",
lambda row: (
f"batched={row['batched']}, batch_per_job={row['batch_per_job']}"
if row["batched"]
else "batched=False"
),
axis=1,
)
sns.boxplot(x="n_jobs", y="time", hue="config", data=df)
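The docstring and mask above pin down two behaviours of this benchmark's MinHashEncoder: hash computations for unique elements are parallelized with joblib semantics for n_jobs, and NaN, None, and empty strings are all treated as missing. A construction sketch, assuming the class and defaults defined in this file:

import numpy as np

# n_jobs=-1 uses all processors; batch_per_job sets batches handled per job.
encoder = MinHashEncoder(n_jobs=-1, batch_per_job=2)

# None and "" both hit the missing-value mask shown above, as would np.nan.
X = np.array([["london"], [None], [""]], dtype=object)
X_enc = encoder.fit_transform(X)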
18 changes: 10 additions & 8 deletions benchmarks/run_on_openml_datasets.py
@@ -4,20 +4,22 @@
It can also be used to compare our scores to OpenML scores uploaded by other users,
using the `--compare_scores` flag (this is slow).
"""
from collections import Counter
import openml
import argparse
import os
from collections import Counter

import numpy as np
from benchmarks.utils import default_parser
from skrub import TableVectorizer, MinHashEncoder
from sklearn.pipeline import Pipeline
import openml
from loguru import logger
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.model_selection import cross_val_score
import argparse
from loguru import logger
from sklearn.pipeline import Pipeline

from benchmarks.utils import default_parser
from skrub import MinHashEncoder, TableVectorizer

# argparse
parser = argparse.ArgumentParser(parents=[default_parser])
@@ -178,7 +180,7 @@
errors[task_id] = str(e)
continue

logger.info(f"Finished! ")
logger.info("Finished! ")
logger.error(f"{len(errors)} tasks with errors: {set(errors.keys())}")
# print all unique errors
errors_counter = Counter(errors.values())
2 changes: 1 addition & 1 deletion benchmarks/utils/_various.py
@@ -3,6 +3,7 @@
import pandas as pd

from skrub.datasets import (
DatasetAll,
fetch_drug_directory,
fetch_employee_salaries,
fetch_medical_charge,
@@ -11,7 +12,6 @@
fetch_road_safety,
fetch_traffic_violations,
)
from skrub.datasets import DatasetAll


def find_result(bench_name: str) -> Path:
15 changes: 12 additions & 3 deletions benchmarks/utils/join.py
@@ -1,6 +1,7 @@
import pandas as pd
from pathlib import Path

import pandas as pd

from skrub.datasets._utils import get_data_dir


@@ -30,7 +31,7 @@ def fetch_data(
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark # noqa
"""Fetch datasets from [1]_.

Parameters
----------
@@ -57,6 +58,10 @@

gt: pd.DataFrame
Ground truth dataset.

References
----------
.. [1] https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark
"""
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
@@ -89,7 +94,7 @@ def fetch_big_data(
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md # noqa
"""Fetch datasets from [1]_.

Parameters
----------
@@ -120,6 +125,10 @@

gt: pd.DataFrame
Ground truth dataset.

References
----------
.. [1] https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md
"""
link = "https://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/"
left_path, right_path, gt_path, file_paths = get_local_data(
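With the source URLs moved into numpydoc References sections, the summary lines fit the length limit without the # noqa escape. Per the docstrings, both fetchers return a (left, right, gt) tuple; a hedged call sketch (the dataset name is a placeholder, not taken from the source):

# left and right are the two tables to join; gt is the ground truth.
left, right, gt = fetch_data("some_autojoin_dataset")  # placeholder name
print(gt.head())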
6 changes: 3 additions & 3 deletions benchmarks/utils/monitor.py
@@ -1,14 +1,14 @@
import tracemalloc
import os

import tracemalloc
from collections import defaultdict
from collections.abc import Callable, Collection, Mapping
from datetime import datetime
from itertools import product
from pathlib import Path
from random import choice
from string import ascii_letters, digits
from time import perf_counter, time as get_time
from time import perf_counter
from time import time as get_time
from typing import Any
from warnings import warn

2 changes: 1 addition & 1 deletion doc/_static/skrub-cover.svg
(Binary/SVG change; diff not rendered.)
2 changes: 1 addition & 1 deletion doc/_templates/index.html
@@ -96,7 +96,7 @@ <h4 class="card-title"><a href="{{ pathto('cleaning') }}">Cleaning</a></h4><p><a
</article>
<h2 style="margin-top: max(4rem, 7vh);text-align: center;margin-bottom: .1rem;font-size: max(1.3vw, 4ex);">Less data wrangling, more machine learning</h2>

{# So far, no article footer. Maybe this is brutal #}
{# So far, no article footer. Maybe this is brutal #}
</div>
</div>
{% endblock docs_main %}
2 changes: 0 additions & 2 deletions doc/about.rst
@@ -15,5 +15,3 @@ empirical work.

skrub received funding from `project DirtyData
<https://project.inria.fr/dirtydata/>`_ (ANR-17-CE23-0018).

