Add codespell support (skrub-data#1126)

jeromedockes · Dec 2, 2024 · c74b0c0
1 parent f100059
commit c74b0c0
Show file tree

Hide file tree

Showing 29 changed files with 3,329 additions and 2,556 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,3 +15,11 @@ repos:
     rev: 23.3.0
     hooks:
     -   id: black
+-   repo: https://github.com/codespell-project/codespell
+    # Configuration for codespell is in pyproject.toml
+    rev: v2.3.0
+    hooks:
+    -   id: codespell
+        exclude: .*/package-lock.json
+        additional_dependencies:
+        -   tomli
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -472,7 +472,7 @@ Minor changes
 * :class:`TableVectorizer` never output a sparse matrix by default. This can be changed by
   increasing the `sparse_threshold` parameter. :pr:`646` by :user:`Leo Grinsztajn <LeoGrin>`
 
-* :class:`TableVectorizer` doesn't fail anymore if an infered type doesn't work during transform.
+* :class:`TableVectorizer` doesn't fail anymore if an inferred type doesn't work during transform.
   The new entries not matching the type are replaced by missing values. :pr:`666` by :user:`Leo Grinsztajn <LeoGrin>`
 
 - Dataset fetcher :func:`datasets.fetch_employee_salaries` now has a parameter

diff --git a/benchmarks/bench_fuzzy_join_count_vs_hash.py b/benchmarks/bench_fuzzy_join_count_vs_hash.py
@@ -98,7 +98,7 @@ def fuzzy_join(
         If False, the order of the join keys depends on the join type
         (`how` keyword).
     suffixes : typing.Tuple[str, str], default=('_x', '_y')
-        A list of strings indicating the suffix to add when overlaping
+        A list of strings indicating the suffix to add when overlapping
         column names.
 
     Returns

diff --git a/benchmarks/bench_fuzzy_join_sparse_vs_dense.py b/benchmarks/bench_fuzzy_join_sparse_vs_dense.py
@@ -258,7 +258,7 @@ def fuzzy_join(
         If False, the order of the join keys depends on the join type
         (`how` keyword).
     suffixes : str 2-tuple, default=('_x', '_y')
-        A list of strings indicating the suffix to add when overlaping
+        A list of strings indicating the suffix to add when overlapping
         column names.
     sparse : boolean, default=True
         Use sparse or dense arrays for nearest neighbor search.

diff --git a/benchmarks/bench_fuzzy_join_vs_others.py b/benchmarks/bench_fuzzy_join_vs_others.py
@@ -42,7 +42,7 @@ def thefuzz_merge(
                high to low
 
     Return:
-        Dataframe with boths keys and matches.
+        Dataframe with both keys and matches.
     """
     s = df_2[right_on].tolist()
     m = df_1[left_on].apply(lambda x: process.extract(x, s, limit=limit, scorer=scorer))

diff --git a/benchmarks/utils/join.py b/benchmarks/utils/join.py
@@ -37,7 +37,7 @@ def fetch_data(
         The name of the dataset to download.
 
     save: bool, default=true
-        Wheter to save the datasets locally.
+        Whether to save the datasets locally.
 
     data_home: Path or str, optional
         The path to the root data directory.
@@ -104,7 +104,7 @@ def fetch_big_data(
         Options are {'Dirty', 'Structured', 'Textual'}.
 
     save: bool, default=true
-        Wheter to save the datasets locally.
+        Whether to save the datasets locally.
 
     data_home: Path or str, optional
         The path to the root data directory.

diff --git a/benchmarks/utils/monitor.py b/benchmarks/utils/monitor.py
@@ -27,7 +27,7 @@ def monitor(
     """Decorator used to monitor the execution of a function.
 
     The decorated function should return either:
-    - ``None``, when the goal is only to monitor time of exection and/or memory
+    - ``None``, when the goal is only to monitor time of execution and/or memory
       (parameters ``time`` and/or ``memory`` should be ``True`` (the default));
     - a mapping (dict), which will be added to the results. The keys are going
       to be the columns of the resulting pandas DataFrame.
@@ -79,7 +79,7 @@ def monitor(
         execution without the memory monitoring.
     hot_load : str, optional
         Name of the file to hot-load (meaning, recovering partial results
-        from a previous run that was interupted).
+        from a previous run that was interrupted).
         The name of the file is random (created at runtime), and printed before
         the run. Grab it from the stdout of your interrupted run.
     repeat : int, default=1

diff --git a/doc/assembling.rst b/doc/assembling.rst
@@ -31,7 +31,7 @@ has no need for pre-cleaning.
 Joining external tables for machine learning
 --------------------------------------------
 
-Joining is straigthforward for two tables because you only need to identify
+Joining is straightforward for two tables because you only need to identify
 the common key.
 
 In addition, skrub also enable more advanced analysis:

diff --git a/examples/04_fuzzy_joining.py b/examples/04_fuzzy_joining.py
@@ -143,7 +143,7 @@
 
 ###############################################################################
 #
-# We see that our |fj| succesfully identified the countries,
+# We see that our |fj| successfully identified the countries,
 # even though some country names differ between tables.
 #
 # For instance, "Egypt" and "Egypt, Arab Rep." are correctly matched, as are
@@ -167,7 +167,7 @@
 augmented_df.sort_values("skrub_Joiner_rescaled_distance").tail(10)
 
 ###############################################################################
-# We see that some matches were unsuccesful
+# We see that some matches were unsuccessful
 # (e.g "Palestinian Territories*" and "Palau"),
 # because there is simply no match in the two tables.
 
@@ -343,7 +343,7 @@
 # many ways to clean a table as there are errors. |fj|
 # method is generalizable across all datasets.
 #
-# Data transformation is also often very costly in both time and ressources.
+# Data transformation is also often very costly in both time and resources.
 # |fj| is fast and easy-to-use.
 #
 # Now up to you, try improving our model by adding information into it and

diff --git a/examples/06_ken_embeddings.py b/examples/06_ken_embeddings.py
@@ -6,7 +6,7 @@
 companies or famous people), bringing new information assembled from external
 sources may be the key to improving the analysis.
 
-Embeddings, or vectorial representations of entities, are a conveniant way to
+Embeddings, or vectorial representations of entities, are a convenient way to
 capture and summarize the information on an entity.
 Relational data embeddings capture all common entities from Wikipedia. [#]_
 These will be called `KEN embeddings` in the following example.
@@ -204,7 +204,7 @@
 # The |Pipeline| can now be readily applied to the dataframe for prediction:
 from sklearn.model_selection import cross_validate
 
-# We will save the results in a dictionnary:
+# We will save the results in a dictionary:
 all_r2_scores = dict()
 all_rmse_scores = dict()
 

diff --git a/examples/07_multiple_key_join.py b/examples/07_multiple_key_join.py
@@ -14,7 +14,7 @@
 
 |joiner| is a scikit-learn compatible transformer that enables
 performing joins across multiple keys,
-independantly of the data type (numerical, string or mixed).
+independently of the data type (numerical, string or mixed).
 
 The following example uses US domestic flights data
 to illustrate how space and time information from a
@@ -106,7 +106,7 @@
 aux.head()
 
 ###############################################################################
-# Then we join this table with the airports so that we get all auxilliary
+# Then we join this table with the airports so that we get all auxiliary
 # tables into one.
 
 from skrub import Joiner
@@ -119,7 +119,7 @@
 
 ###############################################################################
 # Joining airports with flights data:
-# Let's instanciate another multiple key joiner on the date and the airport:
+# Let's instantiate another multiple key joiner on the date and the airport:
 
 joiner = Joiner(
     aux_augmented,

diff --git a/examples/FIXME/08_join_aggregation_full.py b/examples/FIXME/08_join_aggregation_full.py
@@ -520,7 +520,7 @@ def get_X_y(data):
 plot_gain_tradeoff(results)
 
 # %%
-# We see that the agg-joiner model is slighly more calibrated, with a lower (better)
+# We see that the agg-joiner model is slightly more calibrated, with a lower (better)
 # log loss.
 
 plot_calibration_curve(results)
@@ -545,4 +545,4 @@ def get_X_y(data):
 # auxiliary data, you would need to replace the auxiliary table in the AggJoiner that
 # was used during ``fit`` with the updated data, which is a rather hacky approach.
 #
-# These limitations will be addresssed later in skrub.
+# These limitations will be addressed later in skrub.