MAINT update pre-commit-config #929

Merged · 5 commits · Jun 7, 2024
Changes from all commits
24 changes: 7 additions & 17 deletions .pre-commit-config.yaml
@@ -3,25 +3,15 @@ repos:
rev: v4.3.0
hooks:
- id: check-yaml
exclude: doc/
- id: end-of-file-fixer
exclude: doc/
- id: trailing-whitespace
exclude: doc/
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
hooks:
- id: ruff
args: ["--fix", "--output-format=full"]
- repo: https://github.com/psf/black
rev: 22.8.0
rev: 23.3.0
hooks:
- id: black
exclude: doc/
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
- id: flake8
files: skrub/
types: [file, python]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
files: skrub/
args: ["--profile", "black", "--filter-files"]
22 changes: 13 additions & 9 deletions benchmarks/bench_fuzzy_join_sparse_vs_dense.py
@@ -9,13 +9,11 @@
"""

import math
from pathlib import Path
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data
from argparse import ArgumentParser
import numbers
import warnings
from argparse import ArgumentParser
from collections.abc import Iterable
from pathlib import Path
from time import perf_counter
from typing import Literal

@@ -32,6 +30,8 @@
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data


def _numeric_encoding(
@@ -355,8 +355,10 @@ def fuzzy_join(

if numerical_match not in ["string", "number"]:
raise ValueError(
"Parameter 'numerical_match' should be either 'string' or 'number', "
f"got {numerical_match!r}. ",
(
"Parameter 'numerical_match' should be either 'string' or 'number', "
f"got {numerical_match!r}. "
),
)

for param in [on, left_on, right_on]:
@@ -401,9 +403,11 @@ def fuzzy_join(
# Warn if presence of missing values
if main_table[main_cols].isna().any().any():
warnings.warn(
"You are merging on missing values. "
"The output correspondence will be random or missing. "
"To avoid unexpected errors you can drop them. ",
(
"You are merging on missing values. "
"The output correspondence will be random or missing. "
"To avoid unexpected errors you can drop them. "
),
UserWarning,
stacklevel=2,
)
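The two hunks above only add parentheses so black can keep the concatenated message strings within the line limit, but they document real behaviour of this benchmark's local fuzzy_join: numerical_match accepts only 'string' or 'number', and merging on a column that contains missing values emits a UserWarning. A sketch of both code paths, assuming the signature defined in this file:

import pandas as pd

left = pd.DataFrame({"city": ["Paris", "Londn", None]})
right = pd.DataFrame({"city": ["Paris", "London", "Rome"]})

# numerical_match must be "string" or "number"; anything else raises
# the ValueError shown above.
joined = fuzzy_join(left, right, on="city", numerical_match="string")
# The None in left["city"] triggers the "merging on missing values" warning.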
37 changes: 19 additions & 18 deletions benchmarks/bench_gap_divergence.py
@@ -21,37 +21,37 @@
Commit: dc77f610e240d2613c99436d01f98db4e4e7922c
"""

import scipy as sp
from argparse import ArgumentParser
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

from argparse import ArgumentParser
from skrub._gap_encoder import (
GapEncoder,
GapEncoderColumn,
_multiplicative_update_h,
_multiplicative_update_w,
batch_lookup,
check_input,
)
from joblib import Parallel, delayed
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from skrub import TableVectorizer
from pathlib import Path

from sklearn.pipeline import Pipeline
from utils import (
monitor,
default_parser,
find_result,
get_classification_datasets,
get_regression_datasets,
monitor,
)

from skrub import TableVectorizer
from skrub._gap_encoder import (
GapEncoder,
GapEncoderColumn,
_multiplicative_update_h,
_multiplicative_update_w,
batch_lookup,
check_input,
)


@@ -182,7 +182,8 @@ def fit(self, X, y=None):
"employee_salaries",
# "road_safety", # https://github.com/skrub-data/skrub/issues/622
"drug_directory",
# "traffic_violations", # Takes way too long and seems to cause memory leaks
# Takes way too long and seems to cause memory leaks
# "traffic_violations",
],
},
save_as=benchmark_name,
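Beyond the regrouping, the imports spell out this benchmark's evaluation loop: a TableVectorizer feeding a histogram gradient-boosting estimator, cross-validated over the dataset list above. A sketch of that shape, as an assumed reconstruction rather than the benchmark's exact code:

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from skrub import TableVectorizer

# Vectorize heterogeneous columns, then classify.
pipeline = Pipeline(
    [
        ("vectorizer", TableVectorizer()),
        ("classifier", HistGradientBoostingClassifier()),
    ]
)
# results = cross_validate(pipeline, X, y)  # X, y: one of the datasets above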
16 changes: 9 additions & 7 deletions benchmarks/bench_gap_encoder_hp.py
@@ -2,18 +2,20 @@
Benchmark hyperparameters of GapEncoder on traffic_violations dataset
"""

from utils import default_parser, find_result, monitor
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skrub.datasets import fetch_traffic_violations
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from skrub import GapEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from loguru import logger
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from utils import default_parser, find_result, monitor

from skrub import GapEncoder
from skrub.datasets import fetch_traffic_violations

#######################################################
# Benchmarking accuracy and speed on traffic_violations
16 changes: 9 additions & 7 deletions benchmarks/bench_gap_es_score.py
@@ -2,24 +2,26 @@
Benchmark hyperparameters of GapEncoder on traffic_violations dataset
"""

from utils import default_parser, find_result, monitor
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skrub.datasets import fetch_traffic_violations
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from utils import default_parser, find_result, monitor

from skrub import GapEncoder
from skrub._gap_encoder import (
GapEncoderColumn,
_beta_divergence,
batch_lookup,
_multiplicative_update_h,
_multiplicative_update_w,
batch_lookup,
)
import seaborn as sns
import matplotlib.pyplot as plt
from skrub.datasets import fetch_traffic_violations


class ModifiedGapEncoderColumn(GapEncoderColumn):
23 changes: 12 additions & 11 deletions benchmarks/bench_minhash_batch_number.py
@@ -64,13 +64,11 @@ class MinHashEncoder(BaseEstimator, TransformerMixin):
batch_per_job: int, default=1
Number of batches to be processed in each job.
n_jobs : int, default=None
The number of jobs to run in parallel.
The hash computations for all unique elements are parallelized.
None means 1 unless in a
`joblib.parallel_backend context <https://joblib.readthedocs.io/en/latest/parallel.html>`_.
-1 means using all processors.
See `Scikit-learn Glossary <https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_
for more details.
The number of jobs to run in parallel. The hash computations for all unique
elements are parallelized. None means 1 unless in a `joblib.parallel_backend
context <https://joblib.readthedocs.io/en/latest/parallel.html>`_. -1 means
using all processors. See `Scikit-learn Glossary
<https://scikit-learn.org/stable/glossary.html#term-n_jobs>`_ for more details.

Attributes
----------
@@ -319,7 +317,8 @@ def transform(self, X) -> np.array:
# Handle missing values
missing_mask = (
~(X == X) # Find np.nan
| (X == None) # Find None. Note: `X is None` doesn't work.
# Find None. Note: `X is None` doesn't work.
| (X == None) # noqa: E711
| (X == "") # Find empty strings
)

@@ -406,9 +405,11 @@ def plot(df: pd.DataFrame):
# Create a new column merging batched and batch_per_job
# If batch is False, ignore batch_per_job
df["config"] = df.apply(
lambda row: f"batched={row['batched']}, batch_per_job={row['batch_per_job']}"
if row["batched"]
else "batched=False",
lambda row: (
f"batched={row['batched']}, batch_per_job={row['batch_per_job']}"
if row["batched"]
else "batched=False"
),
axis=1,
)
sns.boxplot(x="n_jobs", y="time", hue="config", data=df)
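The docstring and mask above pin down two behaviours of this benchmark's MinHashEncoder: hash computations for unique elements are parallelized with joblib semantics for n_jobs, and NaN, None, and empty strings are all treated as missing. A construction sketch, assuming the class and defaults defined in this file:

import numpy as np

# n_jobs=-1 uses all processors; batch_per_job sets batches handled per job.
encoder = MinHashEncoder(n_jobs=-1, batch_per_job=2)

# None and "" both hit the missing-value mask shown above, as would np.nan.
X = np.array([["london"], [None], [""]], dtype=object)
X_enc = encoder.fit_transform(X)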
18 changes: 10 additions & 8 deletions benchmarks/run_on_openml_datasets.py
@@ -4,20 +4,22 @@
It can also be used to compare our scores to OpenML scores uploaded by other users,
using the `--compare_scores` flag (this is slow).
"""
from collections import Counter
import openml
import argparse
import os
from collections import Counter

import numpy as np
from benchmarks.utils import default_parser
from skrub import TableVectorizer, MinHashEncoder
from sklearn.pipeline import Pipeline
import openml
from loguru import logger
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
)
from sklearn.model_selection import cross_val_score
import argparse
from loguru import logger
from sklearn.pipeline import Pipeline

from benchmarks.utils import default_parser
from skrub import MinHashEncoder, TableVectorizer

# argparse
parser = argparse.ArgumentParser(parents=[default_parser])
@@ -178,7 +180,7 @@
errors[task_id] = str(e)
continue

logger.info(f"Finished! ")
logger.info("Finished! ")
logger.error(f"{len(errors)} tasks with errors: {set(errors.keys())}")
# print all unique errors
errors_counter = Counter(errors.values())
2 changes: 1 addition & 1 deletion benchmarks/utils/_various.py
@@ -3,6 +3,7 @@
import pandas as pd

from skrub.datasets import (
DatasetAll,
fetch_drug_directory,
fetch_employee_salaries,
fetch_medical_charge,
@@ -11,7 +12,6 @@
fetch_road_safety,
fetch_traffic_violations,
)
from skrub.datasets import DatasetAll


def find_result(bench_name: str) -> Path:
15 changes: 12 additions & 3 deletions benchmarks/utils/join.py
@@ -1,6 +1,7 @@
import pandas as pd
from pathlib import Path

import pandas as pd

from skrub.datasets._utils import get_data_dir


@@ -30,7 +31,7 @@ def fetch_data(
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark # noqa
"""Fetch datasets from [1]_.

Parameters
----------
@@ -57,6 +58,10 @@

gt: pd.DataFrame
Ground truth dataset.

References
----------
.. [1] https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark
"""
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
@@ -89,7 +94,7 @@ def fetch_big_data(
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md # noqa
"""Fetch datasets from [1]_.

Parameters
----------
@@ -120,6 +125,10 @@

gt: pd.DataFrame
Ground truth dataset.

References
----------
.. [1] https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md
"""
link = "https://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/"
left_path, right_path, gt_path, file_paths = get_local_data(
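With the source URLs moved into numpydoc References sections, the summary lines fit the length limit without the # noqa escape. Per the docstrings, both fetchers return a (left, right, gt) tuple; a hedged call sketch (the dataset name is a placeholder, not taken from the source):

# left and right are the two tables to join; gt is the ground truth.
left, right, gt = fetch_data("some_autojoin_dataset")  # placeholder name
print(gt.head())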
6 changes: 3 additions & 3 deletions benchmarks/utils/monitor.py
@@ -1,14 +1,14 @@
import tracemalloc
import os

import tracemalloc
from collections import defaultdict
from collections.abc import Callable, Collection, Mapping
from datetime import datetime
from itertools import product
from pathlib import Path
from random import choice
from string import ascii_letters, digits
from time import perf_counter, time as get_time
from time import perf_counter
from time import time as get_time
from typing import Any
from warnings import warn

2 changes: 1 addition & 1 deletion doc/_static/skrub-cover.svg
(Binary/SVG change; diff not rendered.)
2 changes: 1 addition & 1 deletion doc/_templates/index.html
@@ -96,7 +96,7 @@ <h4 class="card-title"><a href="{{ pathto('cleaning') }}">Cleaning</a></h4><p><a
</article>
<h2 style="margin-top: max(4rem, 7vh);text-align: center;margin-bottom: .1rem;font-size: max(1.3vw, 4ex);">Less data wrangling, more machine learning</h2>

{# So far, no article footer. Maybe this is brutal #}
{# So far, no article footer. Maybe this is brutal #}
</div>
</div>
{% endblock docs_main %}
2 changes: 0 additions & 2 deletions doc/about.rst
@@ -15,5 +15,3 @@ empirical work.

skrub received funding from `project DirtyData
<https://project.inria.fr/dirtydata/>`_ (ANR-17-CE23-0018).

