Word2Vec Embeddings (#21)

* Word2Vec Word Embeddings 

* Word2Vec User Embeddings

* Word2Vec Results Page
s2t2 authored Nov 18, 2023
1 parent a164ea8 commit 0d3d87f
Showing 110 changed files with 128,249 additions and 9 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -15,6 +15,11 @@ results/*/*.csv.gz
results/*/*.json
results/*/*.json

results/embeddings/*/*.model
results/embeddings/*/*.kv
results/embeddings/*/*.csv


#results/*/*/*.png
#results/*/*/*.html
results/*/*/*.json
24 changes: 24 additions & 0 deletions app/embeddings/README.md
@@ -0,0 +1,24 @@
## Text Embeddings Comparison


### OpenAI

See notebooks.

### Word2Vec

```sh
python -m app.embeddings.word2vec

# WORD2VEC_DESTRUCTIVE=true python -m app.embeddings.word2vec

# FIG_SAVE=true FIG_SHOW=true python -m app.embeddings.word2vec
```

Perform dimensionality reduction on the resulting word and document embeddings:

```sh
python -m app.embeddings.word2vec_reduction

# FIG_SAVE=true FIG_SHOW=true python -m app.embeddings.word2vec_reduction
```
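
Both scripts write their outputs under the word2vec results directory. A minimal sketch of loading the exported vectors for downstream analysis, assuming `RESULTS_DIRPATH` resolves to the local `results/` folder (as the `.gitignore` entries suggest):

```python
# a minimal sketch, assuming RESULTS_DIRPATH resolves to the local results/ folder
import os
from pandas import read_csv

word2vec_dirpath = os.path.join("results", "embeddings", "word2vec")

# word vectors: one row per vocabulary token (100 embedding columns plus a "word_count" column)
words_df = read_csv(os.path.join(word2vec_dirpath, "word_vectors.csv"), index_col="token")

# document vectors: one row per user (the mean of that user's in-vocabulary word vectors)
docs_df = read_csv(os.path.join(word2vec_dirpath, "document_vectors.csv"), index_col="user_id")

print(words_df.shape, docs_df.shape)
```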
189 changes: 189 additions & 0 deletions app/embeddings/word2vec.py
@@ -0,0 +1,189 @@

import os
import shutil
from functools import cached_property
from pprint import pprint

#from datetime import datetime
from itertools import chain
from collections import Counter

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess as tokenizer
from pandas import DataFrame, Series
import numpy as np

from app import RESULTS_DIRPATH
from app.classification import Y_COLS

WORD2VEC_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "embeddings", "word2vec")
#WORD2VEC_DESTRUCTIVE = bool(os.getenv("WORD2VEC_DESTRUCTIVE", default="false") == 'true')

#VECTOR_LENGTH = 100


class WordPipe:
def __init__(self, corpus, tokenizer=tokenizer, results_dirpath=WORD2VEC_RESULTS_DIRPATH): # destructive=WORD2VEC_DESTRUCTIVE
"""Param corpus a pandas series of arrays (tokens for each document)"""

self.corpus = corpus
self.tokenizer = tokenizer

#self.destructive = bool(destructive)
self.results_dirpath = results_dirpath
        self.model_filepath = os.path.join(self.results_dirpath, "w2v.model")
#self.kv_filepath = os.path.join(self.results_dirpath, f"w2v.kv")
self.word_vectors_csv_filepath = os.path.join(self.results_dirpath, "word_vectors.csv")
self.document_vectors_csv_filepath = os.path.join(self.results_dirpath, "document_vectors.csv")


@cached_property
def corpus_tokens(self):
        return self.corpus.apply(self.tokenizer)

@cached_property
def word_counts(self):
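        # flatten the per-document token lists into one list and count occurrences of each token corpus-wide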
all_words = list(chain.from_iterable(self.corpus_tokens)) # h/t chat gpt for this one
word_counter = Counter(all_words)
return Series(word_counter.values(), index=word_counter.keys(), name="word_count")


def perform(self):
        # TOKEN ANALYSIS (SIDE QUEST)
print(self.word_counts.sort_values(ascending=False).head())

self.load_or_train_model()
print("WORDS:", len(self.words))

print("WORD VECTORS:", self.word_vectors_df.shape) # 100 columns, default vector_size=100
self.save_word_vectors()

print("DOCUMENT VECTORS:", self.document_vectors.shape)
self.save_document_vectors()


def load_or_train_model(self, vector_size=100, window=10, min_count=2, workers=4):
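        # gensim Word2Vec hyperparameters:
        #   vector_size: dimensionality of the word vectors
        #   window: max distance between the current and the predicted word within a document
        #   min_count: ignore tokens with total corpus frequency lower than this
        #   workers: number of worker threads used during training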
#if self.destructive:
# print("----------------")
# print("DESTRUCTIVE MODE...")
# #shutil.rmtree(self.results_dirpath)
# os.removedirs()

os.makedirs(self.results_dirpath, exist_ok=True)

if os.path.exists(self.model_filepath):
print("----------------")
print("LOADING MODEL FROM FILE...")
print(self.model_filepath)
            self.model = Word2Vec.load(self.model_filepath)
            print(self.model)
            #print(type(self.model))
            return self.model # model already trained; skip vocab building and re-training
else:
print("----------------")
print("INITIALIZING NEW MODEL...")
self.model = Word2Vec(window=window, min_count=min_count, workers=workers, vector_size=vector_size)
print(self.model)

print("----------------")
print("VOCAB...")
self.model.build_vocab(self.corpus_tokens) # progress_per=1000
#print("N SAMPLES:", model.corpus_count)
#print("EPOCHS:", model.epochs)

print("----------------")
print("TRAINING...")
self.model.train(self.corpus_tokens, total_examples=self.model.corpus_count, epochs=self.model.epochs)
print(round(self.model.total_train_time, 0), "seconds")

print("----------------")
print("SAVING...")
self.model.save(self.model_filepath)
#self.model.wv.save(self.vectors_filepath)

return self.model

# AVAILABLE AFTER TRAINING:

    # WORD ANALYSIS

@property
def words(self):
return self.model.wv.index_to_key

@property
def word_vectors(self):
return self.model.wv.vectors

@property
def word_vectors_df(self):
return DataFrame(self.word_vectors, index=self.words)

@cached_property
def words_df(self):
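        # join each word's embedding with its corpus frequency; the inner merge keeps only
        # tokens that made it into the model's vocabulary (i.e. passed the min_count filter)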
words_df = self.word_vectors_df.merge(self.word_counts, how="inner", left_index=True, right_index=True)
words_df.index.name = "token"
return words_df

def save_word_vectors(self):
self.words_df.to_csv(self.word_vectors_csv_filepath, index=True)

# DOCUMENT ANALYSIS

def infer_document_vector(self, tokens):
""""Gets average vector for each set of tokens."""
# Filter tokens that are in the model's vocabulary
tokens = [token for token in tokens if token in self.model.wv.key_to_index]
if any(tokens):
# Calculate the average vector for the tokens in the document
doc_vector = np.mean([self.model.wv[token] for token in tokens], axis=0)
else:
# If none of the tokens are in the model's vocabulary, return a zero vector
doc_vector = np.zeros(self.model.vector_size)
return doc_vector

@cached_property
def document_vectors(self):
return self.corpus_tokens.apply(self.infer_document_vector)

@cached_property
def document_vectors_df(self, index_name="user_id"):
        # UNPACK EMBEDDINGS TO THEIR OWN COLUMNS
docs_df = DataFrame(self.document_vectors.values.tolist())
docs_df.columns = [str(i) for i in range(0, len(docs_df.columns))]
docs_df.index = self.corpus_tokens.index
docs_df.index.name = index_name
return docs_df

def save_document_vectors(self):
self.document_vectors_df.to_csv(self.document_vectors_csv_filepath, index=True)


if __name__ == "__main__":


from app.dataset import Dataset

ds = Dataset()
df = ds.df

#df["tokens"] = df["tweet_texts"].apply(tokenizer)
#print(df["tokens"].head())

wp = WordPipe(corpus=df["tweet_texts"])
wp.perform()

    # INVESTIGATION
# https://radimrehurek.com/gensim/models/keyedvectors.html
wv = wp.model.wv #> gensim.models.keyedvectors.KeyedVectors
    print(len(wv)) #> 34,729 ORIGINAL ( ______ STOPWORD-REMOVED)

#breakpoint()
trumplike = wv.most_similar("realdonaldtrump", topn=10)
pprint(trumplike)
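    #> most_similar returns a list of (token, cosine_similarity) tuples for the 10 nearest neighbors, e.g. [("trump", 0.87), ...] (illustrative values)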

#wv.similarity(w1="impeachment", w2="sham")
    #wv.similarity(w1="impeachment", w2="just")
    #wv.similarity(w1="impeachment", w2="fair")
    #wv.similarity(w1="impeachment", w2="unfair")
    #wv.similarity(w1="realdonaldtrump", w2="guilty")
    #wv.similarity(w1="realdonaldtrump", w2="innocent")
125 changes: 125 additions & 0 deletions app/embeddings/word2vec_reduction.py
@@ -0,0 +1,125 @@
import os
from pandas import DataFrame

from app.reduction.pipeline import ReductionPipeline, REDUCER_TYPE, N_COMPONENTS
from app.embeddings.word2vec import WORD2VEC_RESULTS_DIRPATH, WordPipe


class WordVectorReductionPipeline(ReductionPipeline):

def __init__(self, x, results_dirpath,
reducer_type=REDUCER_TYPE, n_components=N_COMPONENTS,
labels_df=None, #x_scale=X_SCALE,
):

self.x = x
self.labels_df = labels_df

self.reducer_type = reducer_type
self.n_components = n_components
self.results_dirpath = results_dirpath
os.makedirs(self.results_dirpath, exist_ok=True)

self.reducer_name = {"PCA": "pca", "T-SNE": "tsne", "UMAP": "umap"}[self.reducer_type]

self.reducer = None
self.embeddings = None
self.embeddings_df = None
self.loadings = None
self.loadings_df = None


def save_embeddings(self):
"""
Save a slim copy of the embeddings to CSV (just user_id and component values).
With the goal of merging all the results into a single file later.
"""
csv_filepath = os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}_embeddings.csv")
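        # e.g. "pca_2_embeddings.csv" when reducer_type="PCA" and n_components=2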

results_df = self.embeddings_df.copy()
#results_df.index = self.x.index
results_df.index.name = "token"
#results_df["token"] = self.x.index

for colname in self.component_names:
# rename column to include info about which method produced it:
results_df.rename(columns={colname: f"{self.reducer_name}_{self.n_components}_{colname}"}, inplace=True)
results_df.to_csv(csv_filepath, index=True)



if __name__ == "__main__":

from app.dataset import Dataset
from app.colors import COLORS_MAP, CATEGORY_ORDERS

ds = Dataset()
df = ds.df

wp = WordPipe(corpus=df["tweet_texts"])
wp.load_or_train_model()

print("------------")
print("WORD EMBEDDINGS...")

    word_results_dirpath = os.path.join(wp.results_dirpath, "word_reduction")
    word_labels_df = wp.words_df[["word_count"]]
    for reducer_type in ["PCA", "T-SNE", "UMAP"]:
        print(reducer_type)

        drp = WordVectorReductionPipeline(x=wp.word_vectors_df, labels_df=word_labels_df, reducer_type=reducer_type, n_components=2, results_dirpath=word_results_dirpath)
drp.perform()
drp.save_embeddings()

drp.embeddings_df["token"] = drp.embeddings_df.index
drp.plot_embeddings(hover_data=["token", "word_count"], subtitle="Word2Vec Word Embeddings")
# oh this is not that interesting unless we perform stopword removal
#TOP_N = 250
#drp.embeddings_df.sort_values(by=["word_count"], ascending=False, inplace=True) # it is already sorted, but just to be sure
#drp.embeddings_df = drp.embeddings_df.head(TOP_N)
#drp.plot_embeddings(size="word_count", hover_data=["token", "word_count"]) # subtitle=f"Top {TOP_N} Words"

print("------------")
print("DOCUMENT EMBEDDINGS...")

doc_results_dirpath = os.path.join(wp.results_dirpath, "doc_reduction")
doc_labels_df = ds.labels.copy()
for reducer_type in ["PCA", "T-SNE", "UMAP"]:
print(reducer_type)

drp = WordVectorReductionPipeline(x=wp.document_vectors_df, labels_df=doc_labels_df, reducer_type=reducer_type, n_components=2, results_dirpath=doc_results_dirpath)
drp.perform()
drp.save_embeddings()

subtitle = "Word2Vec Document Embeddings (User Tweet Timelines)"
drp.plot_embeddings(subtitle=subtitle)
#drp.plot_embeddings(subtitle=subtitle, color="bot_label")
#drp.plot_embeddings(subtitle=subtitle, color="opinion_community")
#drp.plot_embeddings(subtitle=subtitle, color="toxic_label")
#drp.plot_embeddings(subtitle=subtitle, color="fact_label")
#drp.plot_embeddings(subtitle=subtitle, color="fourway_label")

for groupby_col in [
"bot_label", "opinion_label", "bom_overall_label", "bom_astroturf_label",
"toxic_label", "factual_label",
"fourway_label", #"sixway_label",
]:
color_map = COLORS_MAP[groupby_col]
category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}

results_dirpath = os.path.join(doc_results_dirpath, groupby_col)
os.makedirs(results_dirpath, exist_ok=True)

drp.plot_embeddings(color=groupby_col, color_map=color_map,
category_orders=category_orders,
#hover_data=["user_id", "bot_label"],
#fig_show=True, fig_save=True,
results_dirpath=results_dirpath, subtitle=subtitle
)

drp.plot_centroids(groupby_col=groupby_col, color_map=color_map,
category_orders=category_orders,
#hover_data=["user_id", "bot_label"],
#fig_show=True, fig_save=True
results_dirpath=results_dirpath, subtitle=subtitle
)