TF-IDF (1500 Features Max) (#26)
Use fewer terms in the TF-IDF document-term matrix, to make the data more manageable and help the models train faster.

* Generate smaller embeddings file

* Classification using smaller features

* Classification results
s2t2 authored Dec 2, 2023
1 parent de1e096 commit 1b8372d
Showing 153 changed files with 63,819 additions and 29 deletions.
19 changes: 19 additions & 0 deletions app/tfidf_embeddings/README.md
@@ -28,3 +28,22 @@ FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```

This is taking a while. There are so many columns. We should consider using fewer features, perhaps 1500 max, to be in line with OpenAI text embeddings.
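
For context, here is a minimal sketch (separate from this project's pipeline) of what capping `max_features` does in scikit-learn: the vectorizer keeps only the top N terms by frequency across the corpus, so the document-term matrix is at most N columns wide.

```python
# Minimal sketch, not the project pipeline: scikit-learn's TfidfVectorizer
# caps the vocabulary at the top `max_features` terms by corpus frequency.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "a tiny example document",
    "another example document",
    "some more example text",
]

vectorizer = TfidfVectorizer(max_features=1500)
matrix = vectorizer.fit_transform(corpus)
print(matrix.shape)  # (number of documents, at most 1500 terms)
```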

<hr>


## TF-IDF (Max 1500 Features)

Let's try setting a max features limit, to help the models train faster and make the data easier to save:

```sh
TFIDF_MAX_FEATURES=1500 python -m app.tfidf_embeddings.pipeline
```

```sh
TFIDF_MAX_FEATURES=1500 FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.reduction
```

```sh
TFIDF_MAX_FEATURES=1500 FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```
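
After running the pipeline with the limit set, a quick sanity check (assuming `RESULTS_DIRPATH` resolves to the repo's `results/` folder, as the links in `index.html` suggest) is that the saved terms file now lists at most 1500 terms:

```python
# Quick sanity check, assuming results land in results/tfidf_embeddings_1500/
# (the "_1500" suffix comes from the TFIDF_MAX_FEATURES env var, per pipeline.py):
import pandas as pd

terms_df = pd.read_csv("results/tfidf_embeddings_1500/terms.csv")
print(len(terms_df))  # expected to be at most 1500, if the file lists the full vocabulary
```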
48 changes: 24 additions & 24 deletions app/tfidf_embeddings/classification.py
@@ -10,9 +10,10 @@
from app.classification.random_forest import RandomForestPipeline
from app.classification.xgboost import XGBoostPipeline

from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline
from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline, TFIDF_RESULTS_DIRPATH_SUFFIX

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_classification")

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, f"tfidf_classification{TFIDF_RESULTS_DIRPATH_SUFFIX}")

class TextDataset():
"""The original dataset interface assumes a CSV file and that's too opinionated"""
@@ -50,28 +50,27 @@ def __init__(self, df, x):
will_upload = False
for y_col in Y_COLS:
results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "logistic_regression")
#pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
#
# # C (float), default=1.0
# # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
# "classifier__C": [
# 1, #2, 5,
# 10, #25, 50,
# #100
# ],
#
# # default max_iter is 100
# "classifier__max_iter": [#10, 25,
# 50,
# 100,
# #250,
# 500,
# #1_000, #5_000, 10_000
# ],
#})
#pipeline.perform()

#continue
pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={

# C (float), default=1.0
# Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
"classifier__C": [
1, 2, 5,
10, #25, 50,
100
],

# default max_iter is 100
"classifier__max_iter": [#10, 25,
50,
100,
#250,
500,
1_000, #5_000, 10_000
],
})
pipeline.perform()


results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "xgboost")
pipeline = XGBoostPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
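For a rough sense of how much work the re-enabled logistic regression grid adds, the search space above is small; this back-of-the-envelope count (not part of the repo) tallies the candidate combinations:

```python
# Back-of-the-envelope count of the grid re-enabled above:
# 5 values of C x 4 values of max_iter = 20 candidates per target column,
# before any cross-validation repeats the pipeline may add.
from itertools import product

param_grid = {
    "classifier__C": [1, 2, 5, 10, 100],
    "classifier__max_iter": [50, 100, 500, 1_000],
}
print(len(list(product(*param_grid.values()))))  # 20
```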
22 changes: 18 additions & 4 deletions app/tfidf_embeddings/pipeline.py
@@ -19,26 +19,33 @@
from app.nlp import convert_non_ascii
from app.classification import Y_COLS

TFIDF_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_embeddings")
TFIDF_MAX_FEATURES = os.getenv("TFIDF_MAX_FEATURES") # default of None
TFIDF_RESULTS_DIRPATH_SUFFIX = f"_{TFIDF_MAX_FEATURES}" if TFIDF_MAX_FEATURES else ""
TFIDF_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, f"tfidf_embeddings{TFIDF_RESULTS_DIRPATH_SUFFIX}")


#def remove_non_ascii(my_text):
# return my_text.encode('ascii', 'ignore').decode('ascii')


class TextEmbeddingPipeline:
def __init__(self, corpus, tokenizer=tokenizer, stopwords=list(SKLEARN_STOPWORDS), results_dirpath=TFIDF_RESULTS_DIRPATH): # save_results=True, # destructive=WORD2VEC_DESTRUCTIVE
def __init__(self, corpus, tokenizer=tokenizer, stopwords=list(SKLEARN_STOPWORDS), results_dirpath=TFIDF_RESULTS_DIRPATH, max_features=TFIDF_MAX_FEATURES): # save_results=True, # destructive=WORD2VEC_DESTRUCTIVE
"""Param corpus a pandas series of texts (text for each document)"""

self.corpus = corpus
self.tokenizer = tokenizer
self.stopwords = stopwords
self.max_features = max_features
if self.max_features:
self.max_features = int(self.max_features)

#self.save_results = bool(save_results)
#self.destructive = bool(destructive)

self.corpus = self.corpus.apply(convert_non_ascii) # 72_854

self.results_dirpath = results_dirpath
os.makedirs(self.results_dirpath, exist_ok=True)
self.model_filepath = os.path.join(self.results_dirpath, f"tfidf.model")
self.results_json_filepath = os.path.join(self.results_dirpath, "results.json")
self.terms_csv_filepath = os.path.join(self.results_dirpath, "terms.csv")
@@ -92,7 +99,7 @@ def perform(self):
print("----------------")
print("INITIALIZING NEW MODEL...")

self.model = TfidfVectorizer(tokenizer=self.tokenizer, stop_words=self.stopwords)
self.model = TfidfVectorizer(tokenizer=self.tokenizer, stop_words=self.stopwords, max_features=self.max_features)
print(self.model)

print("----------------")
@@ -133,7 +140,14 @@ def perform(self):
self.top_words_df.to_csv(self.terms_csv_filepath, index=True)
#print("...DOCUMENT EMBeDDINGS...")
#self.embeddings_df.to_csv(self.document_embeddings_csv_filepath, index=True) # TAKeS TOO LONG? tOO SPArSE? tOO MANY COlS?
self.embeddings_df.to_hdf(self.document_embeddings_hd5_filepath, index=True, key="document_embeddings")
#self.embeddings_df.to_hdf(self.document_embeddings_hd5_filepath, index=True, key="document_embeddings")
if self.max_features and self.max_features < 5_000: # guard: max_features is None when TFIDF_MAX_FEATURES is not set
# let's save a smaller version of the file
self.embeddings_df.to_csv(self.document_embeddings_csv_filepath, index=True)
else:
# let's save the large file? but actually it's like 5GB so nevermind
#self.embeddings_df.to_hdf(self.document_embeddings_hd5_filepath, index=True, key="document_embeddings")
pass

#print("... MODEL...")
#joblib.dump(self.model, self.model_filepath)
2 changes: 1 addition & 1 deletion index.html
@@ -59,7 +59,7 @@ <h3><a href="results/tfidf_embeddings/index.html">Dimensionality Reduction Results</a></h3>


<section>
<h3><a href="results/tfidf_classification/index.html">Classification Results</a></h3>
<h3><a href="results/tfidf_classification_1500/index.html">Classification Results</a></h3>

</section>

