-
Notifications
You must be signed in to change notification settings - Fork 109
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding the StringEncoder transformer #1159
Changes from 39 commits
ec37e13
b3dae47
99e5450
4f7e46e
583250b
4a39f36
ee2f739
30ad689
d7f1cd7
8686d7f
eb4de97
96423ba
e01637c
3a1f6eb
398f9db
3a45f19
38a9f2d
51856b3
58a3559
8e4fce2
afdb361
6c6d884
9366d90
6b474c6
e8f308e
8ea92d8
c999abf
8411a83
190ce2a
a43488e
cdfaf1a
c0c066f
887e047
af3b087
09b55a1
2bb353d
bfb8c55
7783565
50b6e14
171db27
3ff3f1a
ba6ace7
144ab11
ffc0d73
c5c3a73
b103ca6
d9242fa
b8ee33d
64c43c3
eb0a131
9268331
49553d9
a0afc68
463b8a4
0daec3f
757a22f
57ca040
c24dcc7
5007b99
002ca3b
b9a0074
41c29b5
f6b8631
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
from sklearn.decomposition import TruncatedSVD | ||
from sklearn.feature_extraction.text import ( | ||
HashingVectorizer, | ||
TfidfTransformer, | ||
TfidfVectorizer, | ||
) | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.utils.validation import check_is_fitted | ||
|
||
from . import _dataframe as sbd | ||
from ._on_each_column import SingleColumnTransformer | ||
|
||
|
||
class StringEncoder(SingleColumnTransformer):
    """Generate a lightweight string encoding of a given column using tf-idf \
    vectorization and truncated SVD.

    First, apply a tf-idf vectorization of the text, then reduce the
    dimensionality with a truncated singular value decomposition (SVD)
    with the given number of components.

    New features will be named ``{col_name}_{component}`` if the series has a
    name, and ``tsvd_{component}`` if it does not.

    Parameters
    ----------
    n_components : int, default=30
        Number of components to be used for the truncated SVD decomposition.
        Must be a positive integer.
    vectorizer : str, "tfidf" or "hashing"
        Vectorizer to apply to the strings, either ``tfidf`` or ``hashing``
        for scikit-learn ``TfidfVectorizer`` or ``HashingVectorizer``
        respectively.
    ngram_range : tuple of (int, int) pairs, default=(3, 4)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that
        ``min_n <= n <= max_n`` will be used. For example an ``ngram_range``
        of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and
        bigrams, and ``(2, 2)`` means only bigrams.
    analyzer : str, "char", "word" or "char_wb", default="char_wb"
        Whether the feature should be made of word or character n-grams.
        Option ``char_wb`` creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

    See Also
    --------
    MinHashEncoder :
        Encode string columns as a numeric array with the minhash method.
    GapEncoder :
        Encode string columns by constructing latent topics.
    SimilarityEncoder :
        Encode string columns as a numeric array with n-gram string similarity.
    TextEncoder :
        Encode string columns using pre-trained language models.

    Examples
    --------
    >>> import pandas as pd
    >>> from skrub import StringEncoder

    We will encode the comments using 2 components:

    >>> enc = StringEncoder(n_components=2)
    >>> X = pd.Series([
    ...     "The professor snatched a good interview out of the jaws of these questions.",
    ...     "Bookmarking this to watch later.",
    ...     "When you don't know the lyrics of the song except the chorus",
    ... ], name='video comments')

    >>> enc.fit_transform(X) # doctest: +SKIP
       video comments_0  video comments_1
    0      8.218069e-01      4.557474e-17
    1      6.971618e-16      1.000000e+00
    2      8.218069e-01     -3.046564e-16
    """

    def __init__(
        self,
        n_components=30,
        vectorizer="tfidf",
        ngram_range=(3, 4),
        analyzer="char_wb",
    ):
        self.n_components = n_components
        self.vectorizer = vectorizer
        self.ngram_range = ngram_range
        self.analyzer = analyzer

    def get_feature_names_out(self):
        """Get output feature names for transformation.

        Returns
        -------
        feature_names_out : list of str objects
            Transformed feature names.
        """
        return list(self.all_outputs_)

    def fit_transform(self, X, y=None):
        """Fit the encoder and transform a column.

        Parameters
        ----------
        X : Pandas or Polars series
            The column to transform.
        y : None
            Unused. Here for compatibility with scikit-learn.

        Returns
        -------
        X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components)
            The embedding representation of the input.
        """
        del y

        # Validate parameters before building the pipeline so users get a
        # clear error instead of a scikit-learn failure deep in fitting.
        if self.analyzer not in ["char_wb", "char", "word"]:
            raise ValueError(f"Unknown analyzer {self.analyzer}")

        if self.vectorizer == "tfidf":
            self.pipe = Pipeline(
                [
                    (
                        "tfidf",
                        TfidfVectorizer(
                            ngram_range=self.ngram_range, analyzer=self.analyzer
                        ),
                    ),
                    ("tsvd", TruncatedSVD(n_components=self.n_components)),
                ]
            )
        elif self.vectorizer == "hashing":
            # HashingVectorizer is stateless; chain a TfidfTransformer after
            # it to recover idf weighting before the SVD step.
            pipe_elements = [
                (
                    "hashing",
                    HashingVectorizer(
                        ngram_range=self.ngram_range, analyzer=self.analyzer
                    ),
                ),
            ]
            pipe_elements.append(("tfidf", TfidfTransformer()))
            pipe_elements.append(("tsvd", TruncatedSVD(n_components=self.n_components)))
            self.pipe = Pipeline(pipe_elements)
        else:
            raise ValueError(f"Unknown vectorizer {self.vectorizer}.")

        # Output columns are "{series_name}_{i}", falling back to "tsvd_{i}"
        # when the input series has no name.
        name = sbd.name(X)
        if not name:
            name = "tsvd"
        self.all_outputs_ = [f"{name}_{idx}" for idx in range(self.n_components)]

        result = self.pipe.fit_transform(sbd.to_numpy(X))

        self._is_fitted = True

        return self._transform(X, result)

    def transform(self, X):
        """Transform a column.

        Parameters
        ----------
        X : Pandas or Polars series
            The column to transform.

        Returns
        -------
        X_out: Pandas or Polars dataframe with shape (len(X), tsvd_n_components)
            The embedding representation of the input.
        """
        check_is_fitted(self)

        result = self.pipe.transform(sbd.to_numpy(X))
        return self._transform(X, result)

    def _transform(self, X, result):
        # Wrap the dense SVD output in a dataframe of the same flavor
        # (pandas/polars) as the input, preserving the original index.
        result = sbd.make_dataframe_like(X, dict(zip(self.all_outputs_, result.T)))
        result = sbd.copy_index(X, result)

        return result

    def __sklearn_is_fitted__(self):
        """
        Check fitted status and return a Boolean value.
        """
        return hasattr(self, "_is_fitted") and self._is_fitted
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
to keep the computation time for this ...