FIX failing sphinx-gallery CI (skrub-data#1145)

jeromedockes · Nov 20, 2024 · 2b8d68b
1 parent b2c4f82
commit 2b8d68b
Show file tree

Hide file tree

Showing 4 changed files with 177 additions and 159 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -16,7 +16,7 @@ jobs:
           key: saved-cache
       - run:
           command: ./build_tools/circle/build_doc.sh
-          no_output_timeout: 40m
+          no_output_timeout: 30m
       - store_artifacts:
           path: doc/_build/html
           destination: doc

diff --git a/examples/02_text_with_string_encoders.py b/examples/02_text_with_string_encoders.py
@@ -209,8 +209,12 @@ def plot_box_results(named_results):
 # column in a dataframe. By default, |TextEncoder| uses the e5-small-v2 model.
 from skrub import TextEncoder
 
+text_encoder = TextEncoder(
+    "sentence-transformers/paraphrase-albert-small-v2",
+    device="cpu",
+)
 text_encoder_pipe = clone(gap_pipe).set_params(
-    **{"tablevectorizer__high_cardinality": TextEncoder()}
+    **{"tablevectorizer__high_cardinality": text_encoder}
 )
 text_encoder_results = cross_validate(text_encoder_pipe, X, y, scoring="roc_auc")
 results.append(("TextEncoder", text_encoder_results))

diff --git a/examples/07_multiple_key_join.py b/examples/07_multiple_key_join.py
@@ -47,9 +47,11 @@
 
 from skrub.datasets import fetch_figshare
 
+seed = 1
 flights = fetch_figshare("41771418").X
+
 # Sampling for faster computation.
-flights = flights.sample(20_000, random_state=1, ignore_index=True)
+flights = flights.sample(5_000, random_state=seed, ignore_index=True)
 flights.head()
 
 ###############################################################################
@@ -85,7 +87,7 @@
 
 weather = fetch_figshare("41771457").X
 # Sampling for faster computation.
-weather = weather.sample(100_000, random_state=1, ignore_index=True)
+weather = weather.sample(10_000, random_state=seed, ignore_index=True)
 weather.head()
 
 ########################################################################
@@ -164,10 +166,10 @@
 ###############################################################################
 # The results:
 
-from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import train_test_split
 
-scores = cross_val_score(pipeline_hgb, X, y)
-scores.mean()
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
+pipeline_hgb.fit(X_train, y_train).score(X_test, y_test)
 
 ###############################################################################
 # Conclusion
@@ -177,4 +179,4 @@
 # on imprecise and multiple-key correspondences.
 # This is made easy by skrub's |Joiner| transformer.
 #
-# Our final cross-validated accuracy score is 0.58.
+# Our final accuracy score on the held-out test set is 0.55.