TF-IDF (1500 Features Max) (#26)
Use fewer terms in the TF-IDF document-term matrix, to make the data more manageable and help the models train faster.

* Generate smaller embeddings file

* Classification using smaller features

* Classification results
s2t2 authored Dec 2, 2023
1 parent de1e096 commit 1b8372d
Showing 153 changed files with 63,819 additions and 29 deletions.
19 changes: 19 additions & 0 deletions app/tfidf_embeddings/README.md
@@ -28,3 +28,22 @@ FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```

This is taking a while. There are so many columns. We should consider using fewer features, perhaps 1500 max, to be in line with OpenAI text embeddings.
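
For context, here is a minimal sketch (separate from this project's pipeline) of what capping `max_features` does in scikit-learn: the vectorizer keeps only the top N terms by frequency across the corpus, so the document-term matrix is at most N columns wide.

```python
# Minimal sketch, not the project pipeline: scikit-learn's TfidfVectorizer
# caps the vocabulary at the top `max_features` terms by corpus frequency.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "a tiny example document",
    "another example document",
    "some more example text",
]

vectorizer = TfidfVectorizer(max_features=1500)
matrix = vectorizer.fit_transform(corpus)
print(matrix.shape)  # (number of documents, at most 1500 terms)
```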

<hr>


## TF-IDF (Max 1500 Features)

Let's try setting a max features limit, to help the models train faster and make the data easier to save:

```sh
TFIDF_MAX_FEATURES=1500 python -m app.tfidf_embeddings.pipeline
```

```sh
TFIDF_MAX_FEATURES=1500 FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.reduction
```

```sh
TFIDF_MAX_FEATURES=1500 FIG_SAVE=true FIG_SHOW=false python -m app.tfidf_embeddings.classification
```
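
After running the pipeline with the limit set, a quick sanity check (assuming `RESULTS_DIRPATH` resolves to the repo's `results/` folder, as the links in `index.html` suggest) is that the saved terms file now lists at most 1500 terms:

```python
# Quick sanity check, assuming results land in results/tfidf_embeddings_1500/
# (the "_1500" suffix comes from the TFIDF_MAX_FEATURES env var, per pipeline.py):
import pandas as pd

terms_df = pd.read_csv("results/tfidf_embeddings_1500/terms.csv")
print(len(terms_df))  # expected to be at most 1500, if the file lists the full vocabulary
```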
48 changes: 24 additions & 24 deletions app/tfidf_embeddings/classification.py
@@ -10,9 +10,10 @@
from app.classification.random_forest import RandomForestPipeline
from app.classification.xgboost import XGBoostPipeline

from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline
from app.tfidf_embeddings.pipeline import TextEmbeddingPipeline, TFIDF_RESULTS_DIRPATH_SUFFIX

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_classification")

CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, f"tfidf_classification{TFIDF_RESULTS_DIRPATH_SUFFIX}")

class TextDataset():
"""The original dataset interface assumes a CSV file and that's too opinionated"""
@@ -50,28 +50,27 @@ def __init__(self, df, x):
will_upload = False
for y_col in Y_COLS:
results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "logistic_regression")
#pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
#
# # C (float), default=1.0
# # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
# "classifier__C": [
# 1, #2, 5,
# 10, #25, 50,
# #100
# ],
#
# # default max_iter is 100
# "classifier__max_iter": [#10, 25,
# 50,
# 100,
# #250,
# 500,
# #1_000, #5_000, 10_000
# ],
#})
#pipeline.perform()

#continue
pipeline = LogisticRegressionPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={

# C (float), default=1.0
# Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
"classifier__C": [
1, 2, 5,
10, #25, 50,
100
],

# default max_iter is 100
"classifier__max_iter": [#10, 25,
50,
100,
#250,
500,
1_000, #5_000, 10_000
],
})
pipeline.perform()


results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, "xgboost")
pipeline = XGBoostPipeline(ds=text_ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload, param_grid={
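For a rough sense of how much work the re-enabled logistic regression grid adds, the search space above is small; this back-of-the-envelope count (not part of the repo) tallies the candidate combinations:

```python
# Back-of-the-envelope count of the grid re-enabled above:
# 5 values of C x 4 values of max_iter = 20 candidates per target column,
# before any cross-validation repeats the pipeline may add.
from itertools import product

param_grid = {
    "classifier__C": [1, 2, 5, 10, 100],
    "classifier__max_iter": [50, 100, 500, 1_000],
}
print(len(list(product(*param_grid.values()))))  # 20
```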
22 changes: 18 additions & 4 deletions app/tfidf_embeddings/pipeline.py
@@ -19,26 +19,33 @@
from app.nlp import convert_non_ascii
from app.classification import Y_COLS

TFIDF_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "tfidf_embeddings")
TFIDF_MAX_FEATURES = os.getenv("TFIDF_MAX_FEATURES") # default of None
TFIDF_RESULTS_DIRPATH_SUFFIX = f"_{TFIDF_MAX_FEATURES}" if TFIDF_MAX_FEATURES else ""
TFIDF_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, f"tfidf_embeddings{TFIDF_RESULTS_DIRPATH_SUFFIX}")


#def remove_non_ascii(my_text):
# return my_text.encode('ascii', 'ignore').decode('ascii')


class TextEmbeddingPipeline:
def __init__(self, corpus, tokenizer=tokenizer, stopwords=list(SKLEARN_STOPWORDS), results_dirpath=TFIDF_RESULTS_DIRPATH): # save_results=True, # destructive=WORD2VEC_DESTRUCTIVE
def __init__(self, corpus, tokenizer=tokenizer, stopwords=list(SKLEARN_STOPWORDS), results_dirpath=TFIDF_RESULTS_DIRPATH, max_features=TFIDF_MAX_FEATURES): # save_results=True, # destructive=WORD2VEC_DESTRUCTIVE
"""Param corpus a pandas series of texts (text for each document)"""

self.corpus = corpus
self.tokenizer = tokenizer
self.stopwords = stopwords
self.max_features = max_features
if self.max_features:
self.max_features = int(self.max_features)

#self.save_results = bool(save_results)
#self.destructive = bool(destructive)

self.corpus = self.corpus.apply(convert_non_ascii) # 72_854

self.results_dirpath = results_dirpath
os.makedirs(self.results_dirpath, exist_ok=True)
self.model_filepath = os.path.join(self.results_dirpath, f"tfidf.model")
self.results_json_filepath = os.path.join(self.results_dirpath, "results.json")
self.terms_csv_filepath = os.path.join(self.results_dirpath, "terms.csv")
@@ -92,7 +99,7 @@ def perform(self):
print("----------------")
print("INITIALIZING NEW MODEL...")

self.model = TfidfVectorizer(tokenizer=self.tokenizer, stop_words=self.stopwords)
self.model = TfidfVectorizer(tokenizer=self.tokenizer, stop_words=self.stopwords, max_features=self.max_features)
print(self.model)

print("----------------")
@@ -133,7 +140,14 @@ def perform(self):
self.top_words_df.to_csv(self.terms_csv_filepath, index=True)
#print("...DOCUMENT EMBeDDINGS...")
#self.embeddings_df.to_csv(self.document_embeddings_csv_filepath, index=True) # TAKeS TOO LONG? tOO SPArSE? tOO MANY COlS?
self.embeddings_df.to_hdf(self.document_embeddings_hd5_filepath, index=True, key="document_embeddings")
#self.embeddings_df.to_hdf(self.document_embeddings_hd5_filepath, index=True, key="document_embeddings")
if self.max_features and self.max_features < 5_000: # guard: max_features is None when TFIDF_MAX_FEATURES is not set
# let's save a smaller version of the file
self.embeddings_df.to_csv(self.document_embeddings_csv_filepath, index=True)
else:
# let's save the large file? but actually it's like 5GB so nevermind
#self.embeddings_df.to_hdf(self.document_embeddings_hd5_filepath, index=True, key="document_embeddings")
pass

#print("... MODEL...")
#joblib.dump(self.model, self.model_filepath)
2 changes: 1 addition & 1 deletion index.html
@@ -59,7 +59,7 @@ <h3><a href="results/tfidf_embeddings/index.html">Dimensionality Reduction Results</a></h3>


<section>
<h3><a href="results/tfidf_classification/index.html">Classification Results</a></h3>
<h3><a href="results/tfidf_classification_1500/index.html">Classification Results</a></h3>

</section>

