Merge remote-tracking branch 'upstream/main' into add-3.13
jeromedockes committed Dec 2, 2024
2 parents (d25c5b4 + c74b0c0) · commit 08467bb
Showing 29 changed files with 53 additions and 38 deletions.
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -15,3 +15,11 @@ repos:
    rev: 23.3.0
    hooks:
      - id: black
+  - repo: https://github.com/codespell-project/codespell
+    # Configuration for codespell is in pyproject.toml
+    rev: v2.3.0
+    hooks:
+      - id: codespell
+        exclude: .*/package-lock.json
+        additional_dependencies:
+          - tomli
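With this hook in place, the spelling check can be run locally over the whole repository (a minimal sketch, assuming pre-commit is installed and the hook id above is unchanged):

    pip install pre-commit
    pre-commit run codespell --all-files

The tomli listed under additional_dependencies lets codespell read its configuration from pyproject.toml on Python versions that lack the built-in tomllib parser.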
2 changes: 1 addition & 1 deletion CHANGES.rst
@@ -472,7 +472,7 @@ Minor changes
* :class:`TableVectorizer` never output a sparse matrix by default. This can be changed by
increasing the `sparse_threshold` parameter. :pr:`646` by :user:`Leo Grinsztajn <LeoGrin>`

-* :class:`TableVectorizer` doesn't fail anymore if an infered type doesn't work during transform.
+* :class:`TableVectorizer` doesn't fail anymore if an inferred type doesn't work during transform.
The new entries not matching the type are replaced by missing values. :pr:`666` by :user:`Leo Grinsztajn <LeoGrin>`

- Dataset fetcher :func:`datasets.fetch_employee_salaries` now has a parameter
2 changes: 1 addition & 1 deletion benchmarks/bench_fuzzy_join_count_vs_hash.py
@@ -98,7 +98,7 @@ def fuzzy_join(
If False, the order of the join keys depends on the join type
(`how` keyword).
suffixes : typing.Tuple[str, str], default=('_x', '_y')
-A list of strings indicating the suffix to add when overlaping
+A list of strings indicating the suffix to add when overlapping
column names.
Returns
2 changes: 1 addition & 1 deletion benchmarks/bench_fuzzy_join_sparse_vs_dense.py
@@ -258,7 +258,7 @@ def fuzzy_join(
If False, the order of the join keys depends on the join type
(`how` keyword).
suffixes : str 2-tuple, default=('_x', '_y')
-A list of strings indicating the suffix to add when overlaping
+A list of strings indicating the suffix to add when overlapping
column names.
sparse : boolean, default=True
Use sparse or dense arrays for nearest neighbor search.
2 changes: 1 addition & 1 deletion benchmarks/bench_fuzzy_join_vs_others.py
@@ -42,7 +42,7 @@ def thefuzz_merge(
high to low
Return:
-Dataframe with boths keys and matches.
+Dataframe with both keys and matches.
"""
s = df_2[right_on].tolist()
m = df_1[left_on].apply(lambda x: process.extract(x, s, limit=limit, scorer=scorer))
4 changes: 2 additions & 2 deletions benchmarks/utils/join.py
@@ -37,7 +37,7 @@ def fetch_data(
The name of the dataset to download.
save: bool, default=true
-Wheter to save the datasets locally.
+Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
@@ -104,7 +104,7 @@ def fetch_big_data(
Options are {'Dirty', 'Structured', 'Textual'}.
save: bool, default=true
-Wheter to save the datasets locally.
+Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
4 changes: 2 additions & 2 deletions benchmarks/utils/monitor.py
@@ -27,7 +27,7 @@ def monitor(
"""Decorator used to monitor the execution of a function.
The decorated function should return either:
-- ``None``, when the goal is only to monitor time of exection and/or memory
+- ``None``, when the goal is only to monitor time of execution and/or memory
(parameters ``time`` and/or ``memory`` should be ``True`` (the default));
- a mapping (dict), which will be added to the results. The keys are going
to be the columns of the resulting pandas DataFrame.
@@ -79,7 +79,7 @@ def monitor(
execution without the memory monitoring.
hot_load : str, optional
Name of the file to hot-load (meaning, recovering partial results
-from a previous run that was interupted).
+from a previous run that was interrupted).
The name of the file is random (created at runtime), and printed before
the run. Grab it from the stdout of your interrupted run.
repeat : int, default=1
2 changes: 1 addition & 1 deletion doc/assembling.rst
@@ -31,7 +31,7 @@ has no need for pre-cleaning.
Joining external tables for machine learning
--------------------------------------------

-Joining is straigthforward for two tables because you only need to identify
+Joining is straightforward for two tables because you only need to identify
the common key.

In addition, skrub also enable more advanced analysis:
6 changes: 3 additions & 3 deletions examples/04_fuzzy_joining.py
@@ -143,7 +143,7 @@

###############################################################################
#
-# We see that our |fj| succesfully identified the countries,
+# We see that our |fj| successfully identified the countries,
# even though some country names differ between tables.
#
# For instance, "Egypt" and "Egypt, Arab Rep." are correctly matched, as are
@@ -167,7 +167,7 @@
augmented_df.sort_values("skrub_Joiner_rescaled_distance").tail(10)

###############################################################################
-# We see that some matches were unsuccesful
+# We see that some matches were unsuccessful
# (e.g "Palestinian Territories*" and "Palau"),
# because there is simply no match in the two tables.

@@ -343,7 +343,7 @@
# many ways to clean a table as there are errors. |fj|
# method is generalizable across all datasets.
#
-# Data transformation is also often very costly in both time and ressources.
+# Data transformation is also often very costly in both time and resources.
# |fj| is fast and easy-to-use.
#
# Now up to you, try improving our model by adding information into it and
4 changes: 2 additions & 2 deletions examples/06_ken_embeddings.py
@@ -6,7 +6,7 @@
companies or famous people), bringing new information assembled from external
sources may be the key to improving the analysis.
-Embeddings, or vectorial representations of entities, are a conveniant way to
+Embeddings, or vectorial representations of entities, are a convenient way to
capture and summarize the information on an entity.
Relational data embeddings capture all common entities from Wikipedia. [#]_
These will be called `KEN embeddings` in the following example.
@@ -204,7 +204,7 @@
# The |Pipeline| can now be readily applied to the dataframe for prediction:
from sklearn.model_selection import cross_validate

-# We will save the results in a dictionnary:
+# We will save the results in a dictionary:
all_r2_scores = dict()
all_rmse_scores = dict()

6 changes: 3 additions & 3 deletions examples/07_multiple_key_join.py
@@ -14,7 +14,7 @@
|joiner| is a scikit-learn compatible transformer that enables
performing joins across multiple keys,
-independantly of the data type (numerical, string or mixed).
+independently of the data type (numerical, string or mixed).
The following example uses US domestic flights data
to illustrate how space and time information from a
@@ -106,7 +106,7 @@
aux.head()

###############################################################################
-# Then we join this table with the airports so that we get all auxilliary
+# Then we join this table with the airports so that we get all auxiliary
# tables into one.

from skrub import Joiner
@@ -119,7 +119,7 @@

###############################################################################
# Joining airports with flights data:
-# Let's instanciate another multiple key joiner on the date and the airport:
+# Let's instantiate another multiple key joiner on the date and the airport:

joiner = Joiner(
aux_augmented,
4 changes: 2 additions & 2 deletions examples/FIXME/08_join_aggregation_full.py
@@ -520,7 +520,7 @@ def get_X_y(data):
plot_gain_tradeoff(results)

# %%
-# We see that the agg-joiner model is slighly more calibrated, with a lower (better)
+# We see that the agg-joiner model is slightly more calibrated, with a lower (better)
# log loss.

plot_calibration_curve(results)
@@ -545,4 +545,4 @@ def get_X_y(data):
# auxiliary data, you would need to replace the auxiliary table in the AggJoiner that
# was used during ``fit`` with the updated data, which is a rather hacky approach.
#
-# These limitations will be addresssed later in skrub.
+# These limitations will be addressed later in skrub.
2 changes: 1 addition & 1 deletion pixi.lock

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions pyproject.toml
@@ -272,3 +272,10 @@ filterwarnings = [
]
addopts = "--doctest-modules"
doctest_optionflags = "NORMALIZE_WHITESPACE ELLIPSIS"

+[tool.codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = '.git*,*.svg,package-lock.json,*.lock,*.css,*-min.*'
+check-hidden = true
+# ignore-regex = ''
+ignore-words-list = 'ans,serie'
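Because the configuration lives under [tool.codespell], invoking codespell directly from the repository root picks up the same skip list and ignored words (a sketch assuming the codespell version pinned above; the [toml] extra pulls in tomli for reading pyproject.toml on older Pythons):

    pip install 'codespell[toml]'
    codespell  # reads [tool.codespell] from pyproject.toml automatically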
4 changes: 2 additions & 2 deletions skrub/_column_associations.py
@@ -29,7 +29,7 @@ def column_associations(df):
columns are binned with 10 bins. For categorical columns, only the 10 most
frequent categories are considered. In both cases, nulls are treated as a
separate category, ie a separate row in the contingency table. Thus
-associations betwen the values of 2 columns or between their missingness
+associations between the values of 2 columns or between their missingness
patterns may be captured.
Parameters
@@ -178,7 +178,7 @@ def _compute_cramer(table, n_samples):
The input is the table computed by ``_contingency_table`` with shape
(n cols, n cols, n bins, n bins).
-This returs the symmetric matrix with shape (n cols, n cols) where entry
+This returns the symmetric matrix with shape (n cols, n cols) where entry
i, j contains the statistic for column i x column j.
"""
marginal_0 = table.sum(axis=-2)
2 changes: 1 addition & 1 deletion skrub/_fuzzy_join.py
@@ -60,7 +60,7 @@ def fuzzy_join(
in the right table.
'self_join_neighbor'
-Once the match candidate (i.e. the nearest neigbor from the right
+Once the match candidate (i.e. the nearest neighbor from the right
table) has been found, we find its nearest neighbor in the right
table (excluding itself). The reference distance is the distance that
separates those 2 right rows.
2 changes: 1 addition & 1 deletion skrub/_gap_encoder.py
@@ -159,7 +159,7 @@ class GapEncoder(TransformerMixin, SingleColumnTransformer):
>>> enc.get_feature_names_out()
['city: england, london, uk', 'city: france, paris, pqris']
-It got it right, reccuring topics are "London" and "England" on the
+It got it right, reoccurring topics are "London" and "England" on the
one side and "Paris" and "France" on the other.
As this is a continuous encoding, we can look at the level of
2 changes: 1 addition & 1 deletion skrub/_joiner.py
@@ -126,7 +126,7 @@ class Joiner(TransformerMixin, BaseEstimator):
in the auxiliary table.
'self_join_neighbor'
-Once the match candidate (i.e. the nearest neigbor from the auxiliary
+Once the match candidate (i.e. the nearest neighbor from the auxiliary
table) has been found, we find its nearest neighbor in the auxiliary
table (excluding itself). The reference distance is the distance that
separates those 2 auxiliary rows.
2 changes: 1 addition & 1 deletion skrub/_on_each_column.py
@@ -250,7 +250,7 @@ class OnEachColumn(TransformerMixin, BaseEstimator):
``cols``) are passed through.
rename_columns : str, default='{}'
-Format string applied to all transformation ouput column names. For
+Format string applied to all transformation output column names. For
example pass ``'transformed_{}'`` to prepend ``'transformed_'`` to all
output column names. The default value does not modify the names.
Renaming is not applied to columns not selected by ``cols``.
2 changes: 1 addition & 1 deletion skrub/_on_subframe.py
@@ -43,7 +43,7 @@ class OnSubFrame(TransformerMixin, BaseEstimator):
passed through.
rename_columns : str, default='{}'
-Format strings applied to all transformation ouput column names. For
+Format strings applied to all transformation output column names. For
example pass ``'transformed_{}'`` to prepend ``'transformed_'`` to all
output column names. The default value does not modify the names.
Renaming is not applied to columns not selected by ``cols``.
2 changes: 1 addition & 1 deletion skrub/_reporting/_plotting.py
@@ -231,7 +231,7 @@ def value_counts(value_counts, n_unique, n_rows, color=COLOR_0):
n_unique : int
Cardinality of the plotted column, used to determine if all unique
values are plotted or if there are too many and some have been
-ommitted. The figure's title is adjusted accordingly.
+omitted. The figure's title is adjusted accordingly.
n_rows : int
Total length of the column, used to convert the counts to proportions.
2 changes: 1 addition & 1 deletion skrub/_reporting/_sample_table.py
@@ -329,7 +329,7 @@ class _PandasTable:
i=0 | my house | 1st | df.iloc[0, 0] | df.iloc[0, 1] | ... | ... |
i=1 | | 2nd | df.iloc[1, 0] | df.iloc[1, 1] | ... | ... |
i=2 | your house | 1st | ... | ... | ... | ... |
-i=3 | | 2st | ... | ... | ... | ... |
+i=3 | | 2nd | ... | ... | ... | ... |
start_i, start_j are the first i, j coords (here -3, -2)
6 changes: 3 additions & 3 deletions skrub/_text_encoder.py
@@ -67,12 +67,12 @@ class TextEncoder(SingleColumnTransformer, TransformerMixin):
You can find more options on the `sentence-transformers documentation
<https://www.sbert.net/docs/pretrained_models.html#model-overview>`_.
-The default model is a shrinked version of e5-v2, which has shown good
+The default model is a shrunk version of e5-v2, which has shown good
performance in the benchmark of [1]_.
n_components : int or None, default=30,
The number of embedding dimensions. As the number of dimensions is different
-accross embedding models, this class uses a :class:`~sklearn.decomposition.PCA`
+across embedding models, this class uses a :class:`~sklearn.decomposition.PCA`
to set the number of embedding to ``n_components`` during ``transform``.
Set ``n_components=None`` to skip the PCA dimension reduction mechanism.
@@ -395,7 +395,7 @@ def get_feature_names_out(self):
def __getstate__(self):
state = self.__dict__.copy()
# Always dump self._cache_folder because it is overwritten when the model
-# is loaded, and it shows an absolut path on the user machine.
+# is loaded, and it shows an absolute path on the user machine.
# However, we have to include self.cache_folder in the serialized object
# because that is a parameter provided by the user.
remove_props = ["_cache_folder"]
2 changes: 1 addition & 1 deletion skrub/_to_datetime.py
@@ -98,7 +98,7 @@ class ToDatetime(SingleColumnTransformer):
format : str or None, optional, default=None
Format to use for parsing dates that are stored as strings, e.g.
``"%Y-%m-%dT%H:%M%S"``.
-If not specfied, the format is inferred from the data when possible.
+If not specified, the format is inferred from the data when possible.
When doing so, for dates presented as 01/02/2003, it is usually
possible to infer from the data whether the month comes first (USA
convention) or the day comes first, ie ``"%m/%d/%Y"`` vs
2 changes: 1 addition & 1 deletion skrub/_wrap_transformer.py
@@ -24,7 +24,7 @@ def wrap_transformer(
``OnEachColumn`` instance. Otherwise it is wrapped in a ``OnSubFrame``
instance.
-This default choice can be overriden by passing ``columnwise=True`` to
+This default choice can be overridden by passing ``columnwise=True`` to
force the use of ``OnEachColumn`` or ``columnwise=False`` to force the use
of ``OnSubFrame``.
2 changes: 1 addition & 1 deletion skrub/datasets/_fetching.py
@@ -63,7 +63,7 @@

MOVIELENS_URL = "https://files.grouplens.org/datasets/movielens/{zip_directory}.zip"

-# A dictionnary storing the sha256 hashes of the figshare files
+# A dictionary storing the sha256 hashes of the figshare files
figshare_id_to_hash = {
39142985: "47d73381ef72b050002a8642194c6718a4954ec9e6c556f4c4ddc6ed84ceec92",
39149066: "e479cf9741a90c40401697e7fa54409e3b9cfa09f27502877382e64e86fbfcd0",
2 changes: 1 addition & 1 deletion skrub/datasets/_generating.py
@@ -28,7 +28,7 @@ def make_deduplication_data(
Number of duplications per example.
prob_mistake_per_letter : float in [0, 1], default=0.2
Probability of misspelling a character in duplications.
-By default, 1/5 of the characters will be misspeled.
+By default, 1/5 of the characters will be misspelled.
random_state : int, RandomState instance, optional
Determines random number generation for dataset noise. Pass an int
for reproducible output across multiple function calls.
2 changes: 1 addition & 1 deletion skrub/tests/test_column_associations.py
@@ -25,7 +25,7 @@ def test_column_associations(df_module):

def test_infinite(df_module):
# non-regression test for https://github.com/skrub-data/skrub/issues/1133
-# (colum associations would raise an exception on low-cardinality float
+# (column associations would raise an exception on low-cardinality float
# column with infinite values)
with warnings.catch_warnings():
# pandas convert_dtypes() emits a spurious warning while trying to decide if
2 changes: 1 addition & 1 deletion skrub/tests/test_multi_agg_joiner.py
@@ -303,7 +303,7 @@ def test_wrong_keys_length(main_table, df_module):
"Check that providing wrong key lengths in the `MultiAggJoiner` raise an error."
main_table = df_module.DataFrame(main_table)

-# Check wrong main_keys lenght
+# Check wrong main_keys length
multi_agg_joiner = MultiAggJoiner(
aux_tables=[main_table, main_table],
operations=[["count"], ["count"]],
