Merge remote-tracking branch 'upstream/main' into add-3.13
jeromedockes committed Dec 2, 2024
2 parents (d25c5b4 + c74b0c0) · commit 08467bb
Showing 29 changed files with 53 additions and 38 deletions.
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -15,3 +15,11 @@ repos:
    rev: 23.3.0
    hooks:
      - id: black
+  - repo: https://github.com/codespell-project/codespell
+    # Configuration for codespell is in pyproject.toml
+    rev: v2.3.0
+    hooks:
+      - id: codespell
+        exclude: .*/package-lock.json
+        additional_dependencies:
+          - tomli
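With this hook in place, the spelling check can be run locally over the whole repository (a minimal sketch, assuming pre-commit is installed and the hook id above is unchanged):

    pip install pre-commit
    pre-commit run codespell --all-files

The tomli listed under additional_dependencies lets codespell read its configuration from pyproject.toml on Python versions that lack the built-in tomllib parser.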
2 changes: 1 addition & 1 deletion CHANGES.rst
@@ -472,7 +472,7 @@ Minor changes
* :class:`TableVectorizer` never output a sparse matrix by default. This can be changed by
increasing the `sparse_threshold` parameter. :pr:`646` by :user:`Leo Grinsztajn <LeoGrin>`

-* :class:`TableVectorizer` doesn't fail anymore if an infered type doesn't work during transform.
+* :class:`TableVectorizer` doesn't fail anymore if an inferred type doesn't work during transform.
The new entries not matching the type are replaced by missing values. :pr:`666` by :user:`Leo Grinsztajn <LeoGrin>`

- Dataset fetcher :func:`datasets.fetch_employee_salaries` now has a parameter
2 changes: 1 addition & 1 deletion benchmarks/bench_fuzzy_join_count_vs_hash.py
@@ -98,7 +98,7 @@ def fuzzy_join(
If False, the order of the join keys depends on the join type
(`how` keyword).
suffixes : typing.Tuple[str, str], default=('_x', '_y')
-A list of strings indicating the suffix to add when overlaping
+A list of strings indicating the suffix to add when overlapping
column names.
Returns
2 changes: 1 addition & 1 deletion benchmarks/bench_fuzzy_join_sparse_vs_dense.py
@@ -258,7 +258,7 @@ def fuzzy_join(
If False, the order of the join keys depends on the join type
(`how` keyword).
suffixes : str 2-tuple, default=('_x', '_y')
-A list of strings indicating the suffix to add when overlaping
+A list of strings indicating the suffix to add when overlapping
column names.
sparse : boolean, default=True
Use sparse or dense arrays for nearest neighbor search.
2 changes: 1 addition & 1 deletion benchmarks/bench_fuzzy_join_vs_others.py
@@ -42,7 +42,7 @@ def thefuzz_merge(
high to low
Return:
-Dataframe with boths keys and matches.
+Dataframe with both keys and matches.
"""
s = df_2[right_on].tolist()
m = df_1[left_on].apply(lambda x: process.extract(x, s, limit=limit, scorer=scorer))
4 changes: 2 additions & 2 deletions benchmarks/utils/join.py
@@ -37,7 +37,7 @@ def fetch_data(
The name of the dataset to download.
save: bool, default=true
-Wheter to save the datasets locally.
+Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
@@ -104,7 +104,7 @@ def fetch_big_data(
Options are {'Dirty', 'Structured', 'Textual'}.
save: bool, default=true
-Wheter to save the datasets locally.
+Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
4 changes: 2 additions & 2 deletions benchmarks/utils/monitor.py
@@ -27,7 +27,7 @@ def monitor(
"""Decorator used to monitor the execution of a function.
The decorated function should return either:
-- ``None``, when the goal is only to monitor time of exection and/or memory
+- ``None``, when the goal is only to monitor time of execution and/or memory
(parameters ``time`` and/or ``memory`` should be ``True`` (the default));
- a mapping (dict), which will be added to the results. The keys are going
to be the columns of the resulting pandas DataFrame.
@@ -79,7 +79,7 @@ def monitor(
execution without the memory monitoring.
hot_load : str, optional
Name of the file to hot-load (meaning, recovering partial results
-from a previous run that was interupted).
+from a previous run that was interrupted).
The name of the file is random (created at runtime), and printed before
the run. Grab it from the stdout of your interrupted run.
repeat : int, default=1
2 changes: 1 addition & 1 deletion doc/assembling.rst
@@ -31,7 +31,7 @@ has no need for pre-cleaning.
Joining external tables for machine learning
--------------------------------------------

-Joining is straigthforward for two tables because you only need to identify
+Joining is straightforward for two tables because you only need to identify
the common key.

In addition, skrub also enable more advanced analysis:
6 changes: 3 additions & 3 deletions examples/04_fuzzy_joining.py
@@ -143,7 +143,7 @@

###############################################################################
#
-# We see that our |fj| succesfully identified the countries,
+# We see that our |fj| successfully identified the countries,
# even though some country names differ between tables.
#
# For instance, "Egypt" and "Egypt, Arab Rep." are correctly matched, as are
@@ -167,7 +167,7 @@
augmented_df.sort_values("skrub_Joiner_rescaled_distance").tail(10)

###############################################################################
-# We see that some matches were unsuccesful
+# We see that some matches were unsuccessful
# (e.g "Palestinian Territories*" and "Palau"),
# because there is simply no match in the two tables.

@@ -343,7 +343,7 @@
# many ways to clean a table as there are errors. |fj|
# method is generalizable across all datasets.
#
-# Data transformation is also often very costly in both time and ressources.
+# Data transformation is also often very costly in both time and resources.
# |fj| is fast and easy-to-use.
#
# Now up to you, try improving our model by adding information into it and
4 changes: 2 additions & 2 deletions examples/06_ken_embeddings.py
@@ -6,7 +6,7 @@
companies or famous people), bringing new information assembled from external
sources may be the key to improving the analysis.
-Embeddings, or vectorial representations of entities, are a conveniant way to
+Embeddings, or vectorial representations of entities, are a convenient way to
capture and summarize the information on an entity.
Relational data embeddings capture all common entities from Wikipedia. [#]_
These will be called `KEN embeddings` in the following example.
@@ -204,7 +204,7 @@
# The |Pipeline| can now be readily applied to the dataframe for prediction:
from sklearn.model_selection import cross_validate

-# We will save the results in a dictionnary:
+# We will save the results in a dictionary:
all_r2_scores = dict()
all_rmse_scores = dict()

6 changes: 3 additions & 3 deletions examples/07_multiple_key_join.py
@@ -14,7 +14,7 @@
|joiner| is a scikit-learn compatible transformer that enables
performing joins across multiple keys,
-independantly of the data type (numerical, string or mixed).
+independently of the data type (numerical, string or mixed).
The following example uses US domestic flights data
to illustrate how space and time information from a
@@ -106,7 +106,7 @@
aux.head()

###############################################################################
-# Then we join this table with the airports so that we get all auxilliary
+# Then we join this table with the airports so that we get all auxiliary
# tables into one.

from skrub import Joiner
@@ -119,7 +119,7 @@

###############################################################################
# Joining airports with flights data:
-# Let's instanciate another multiple key joiner on the date and the airport:
+# Let's instantiate another multiple key joiner on the date and the airport:

joiner = Joiner(
aux_augmented,
4 changes: 2 additions & 2 deletions examples/FIXME/08_join_aggregation_full.py
@@ -520,7 +520,7 @@ def get_X_y(data):
plot_gain_tradeoff(results)

# %%
-# We see that the agg-joiner model is slighly more calibrated, with a lower (better)
+# We see that the agg-joiner model is slightly more calibrated, with a lower (better)
# log loss.

plot_calibration_curve(results)
@@ -545,4 +545,4 @@ def get_X_y(data):
# auxiliary data, you would need to replace the auxiliary table in the AggJoiner that
# was used during ``fit`` with the updated data, which is a rather hacky approach.
#
-# These limitations will be addresssed later in skrub.
+# These limitations will be addressed later in skrub.
2 changes: 1 addition & 1 deletion pixi.lock

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions pyproject.toml
@@ -272,3 +272,10 @@ filterwarnings = [
]
addopts = "--doctest-modules"
doctest_optionflags = "NORMALIZE_WHITESPACE ELLIPSIS"

+[tool.codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = '.git*,*.svg,package-lock.json,*.lock,*.css,*-min.*'
+check-hidden = true
+# ignore-regex = ''
+ignore-words-list = 'ans,serie'
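Because the configuration lives under [tool.codespell], invoking codespell directly from the repository root picks up the same skip list and ignored words (a sketch assuming the codespell version pinned above; the [toml] extra pulls in tomli for reading pyproject.toml on older Pythons):

    pip install 'codespell[toml]'
    codespell  # reads [tool.codespell] from pyproject.toml automatically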
4 changes: 2 additions & 2 deletions skrub/_column_associations.py
@@ -29,7 +29,7 @@ def column_associations(df):
columns are binned with 10 bins. For categorical columns, only the 10 most
frequent categories are considered. In both cases, nulls are treated as a
separate category, ie a separate row in the contingency table. Thus
-associations betwen the values of 2 columns or between their missingness
+associations between the values of 2 columns or between their missingness
patterns may be captured.
Parameters
@@ -178,7 +178,7 @@ def _compute_cramer(table, n_samples):
The input is the table computed by ``_contingency_table`` with shape
(n cols, n cols, n bins, n bins).
-This returs the symmetric matrix with shape (n cols, n cols) where entry
+This returns the symmetric matrix with shape (n cols, n cols) where entry
i, j contains the statistic for column i x column j.
"""
marginal_0 = table.sum(axis=-2)
2 changes: 1 addition & 1 deletion skrub/_fuzzy_join.py
@@ -60,7 +60,7 @@ def fuzzy_join(
in the right table.
'self_join_neighbor'
-Once the match candidate (i.e. the nearest neigbor from the right
+Once the match candidate (i.e. the nearest neighbor from the right
table) has been found, we find its nearest neighbor in the right
table (excluding itself). The reference distance is the distance that
separates those 2 right rows.
2 changes: 1 addition & 1 deletion skrub/_gap_encoder.py
@@ -159,7 +159,7 @@ class GapEncoder(TransformerMixin, SingleColumnTransformer):
>>> enc.get_feature_names_out()
['city: england, london, uk', 'city: france, paris, pqris']
-It got it right, reccuring topics are "London" and "England" on the
+It got it right, reoccurring topics are "London" and "England" on the
one side and "Paris" and "France" on the other.
As this is a continuous encoding, we can look at the level of
2 changes: 1 addition & 1 deletion skrub/_joiner.py
@@ -126,7 +126,7 @@ class Joiner(TransformerMixin, BaseEstimator):
in the auxiliary table.
'self_join_neighbor'
-Once the match candidate (i.e. the nearest neigbor from the auxiliary
+Once the match candidate (i.e. the nearest neighbor from the auxiliary
table) has been found, we find its nearest neighbor in the auxiliary
table (excluding itself). The reference distance is the distance that
separates those 2 auxiliary rows.
2 changes: 1 addition & 1 deletion skrub/_on_each_column.py
@@ -250,7 +250,7 @@ class OnEachColumn(TransformerMixin, BaseEstimator):
``cols``) are passed through.
rename_columns : str, default='{}'
-Format string applied to all transformation ouput column names. For
+Format string applied to all transformation output column names. For
example pass ``'transformed_{}'`` to prepend ``'transformed_'`` to all
output column names. The default value does not modify the names.
Renaming is not applied to columns not selected by ``cols``.
2 changes: 1 addition & 1 deletion skrub/_on_subframe.py
@@ -43,7 +43,7 @@ class OnSubFrame(TransformerMixin, BaseEstimator):
passed through.
rename_columns : str, default='{}'
-Format strings applied to all transformation ouput column names. For
+Format strings applied to all transformation output column names. For
example pass ``'transformed_{}'`` to prepend ``'transformed_'`` to all
output column names. The default value does not modify the names.
Renaming is not applied to columns not selected by ``cols``.
2 changes: 1 addition & 1 deletion skrub/_reporting/_plotting.py
@@ -231,7 +231,7 @@ def value_counts(value_counts, n_unique, n_rows, color=COLOR_0):
n_unique : int
Cardinality of the plotted column, used to determine if all unique
values are plotted or if there are too many and some have been
-ommitted. The figure's title is adjusted accordingly.
+omitted. The figure's title is adjusted accordingly.
n_rows : int
Total length of the column, used to convert the counts to proportions.
2 changes: 1 addition & 1 deletion skrub/_reporting/_sample_table.py
@@ -329,7 +329,7 @@ class _PandasTable:
i=0 | my house | 1st | df.iloc[0, 0] | df.iloc[0, 1] | ... | ... |
i=1 | | 2nd | df.iloc[1, 0] | df.iloc[1, 1] | ... | ... |
i=2 | your house | 1st | ... | ... | ... | ... |
-i=3 | | 2st | ... | ... | ... | ... |
+i=3 | | 2nd | ... | ... | ... | ... |
start_i, start_j are the first i, j coords (here -3, -2)
6 changes: 3 additions & 3 deletions skrub/_text_encoder.py
@@ -67,12 +67,12 @@ class TextEncoder(SingleColumnTransformer, TransformerMixin):
You can find more options on the `sentence-transformers documentation
<https://www.sbert.net/docs/pretrained_models.html#model-overview>`_.
-The default model is a shrinked version of e5-v2, which has shown good
+The default model is a shrunk version of e5-v2, which has shown good
performance in the benchmark of [1]_.
n_components : int or None, default=30,
The number of embedding dimensions. As the number of dimensions is different
-accross embedding models, this class uses a :class:`~sklearn.decomposition.PCA`
+across embedding models, this class uses a :class:`~sklearn.decomposition.PCA`
to set the number of embedding to ``n_components`` during ``transform``.
Set ``n_components=None`` to skip the PCA dimension reduction mechanism.
@@ -395,7 +395,7 @@ def get_feature_names_out(self):
def __getstate__(self):
state = self.__dict__.copy()
# Always dump self._cache_folder because it is overwritten when the model
-# is loaded, and it shows an absolut path on the user machine.
+# is loaded, and it shows an absolute path on the user machine.
# However, we have to include self.cache_folder in the serialized object
# because that is a parameter provided by the user.
remove_props = ["_cache_folder"]
2 changes: 1 addition & 1 deletion skrub/_to_datetime.py
@@ -98,7 +98,7 @@ class ToDatetime(SingleColumnTransformer):
format : str or None, optional, default=None
Format to use for parsing dates that are stored as strings, e.g.
``"%Y-%m-%dT%H:%M%S"``.
-If not specfied, the format is inferred from the data when possible.
+If not specified, the format is inferred from the data when possible.
When doing so, for dates presented as 01/02/2003, it is usually
possible to infer from the data whether the month comes first (USA
convention) or the day comes first, ie ``"%m/%d/%Y"`` vs
2 changes: 1 addition & 1 deletion skrub/_wrap_transformer.py
@@ -24,7 +24,7 @@ def wrap_transformer(
``OnEachColumn`` instance. Otherwise it is wrapped in a ``OnSubFrame``
instance.
-This default choice can be overriden by passing ``columnwise=True`` to
+This default choice can be overridden by passing ``columnwise=True`` to
force the use of ``OnEachColumn`` or ``columnwise=False`` to force the use
of ``OnSubFrame``.
2 changes: 1 addition & 1 deletion skrub/datasets/_fetching.py
@@ -63,7 +63,7 @@

MOVIELENS_URL = "https://files.grouplens.org/datasets/movielens/{zip_directory}.zip"

-# A dictionnary storing the sha256 hashes of the figshare files
+# A dictionary storing the sha256 hashes of the figshare files
figshare_id_to_hash = {
39142985: "47d73381ef72b050002a8642194c6718a4954ec9e6c556f4c4ddc6ed84ceec92",
39149066: "e479cf9741a90c40401697e7fa54409e3b9cfa09f27502877382e64e86fbfcd0",
2 changes: 1 addition & 1 deletion skrub/datasets/_generating.py
@@ -28,7 +28,7 @@ def make_deduplication_data(
Number of duplications per example.
prob_mistake_per_letter : float in [0, 1], default=0.2
Probability of misspelling a character in duplications.
-By default, 1/5 of the characters will be misspeled.
+By default, 1/5 of the characters will be misspelled.
random_state : int, RandomState instance, optional
Determines random number generation for dataset noise. Pass an int
for reproducible output across multiple function calls.
2 changes: 1 addition & 1 deletion skrub/tests/test_column_associations.py
@@ -25,7 +25,7 @@ def test_column_associations(df_module):

def test_infinite(df_module):
# non-regression test for https://github.com/skrub-data/skrub/issues/1133
-# (colum associations would raise an exception on low-cardinality float
+# (column associations would raise an exception on low-cardinality float
# column with infinite values)
with warnings.catch_warnings():
# pandas convert_dtypes() emits a spurious warning while trying to decide if
2 changes: 1 addition & 1 deletion skrub/tests/test_multi_agg_joiner.py
@@ -303,7 +303,7 @@ def test_wrong_keys_length(main_table, df_module):
"Check that providing wrong key lengths in the `MultiAggJoiner` raise an error."
main_table = df_module.DataFrame(main_table)

-# Check wrong main_keys lenght
+# Check wrong main_keys length
multi_agg_joiner = MultiAggJoiner(
aux_tables=[main_table, main_table],
operations=[["count"], ["count"]],
