Word2Vec Embeddings (#21)

* Word2Vec Word Embeddings 

* Word2Vec User Embeddings

* Word2Vec Results Page
s2t2 authored Nov 18, 2023
1 parent a164ea8 commit 0d3d87f
Showing 110 changed files with 128,249 additions and 9 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -15,6 +15,11 @@ results/*/*.csv.gz
results/*/*.json
results/*/*.json

results/embeddings/*/*.model
results/embeddings/*/*.kv
results/embeddings/*/*.csv


#results/*/*/*.png
#results/*/*/*.html
results/*/*/*.json
24 changes: 24 additions & 0 deletions app/embeddings/README.md
@@ -0,0 +1,24 @@
## Text Embeddings Comparison


### OpenAI

See notebooks.

### Word2Vec

```sh
python -m app.embeddings.word2vec

# WORD2VEC_DESTRUCTIVE=true python -m app.embeddings.word2vec

# FIG_SAVE=true FIG_SHOW=true python -m app.embeddings.word2vec
```

Perform dimensionality reduction on the resulting word and document embeddings:

```sh
python -m app.embeddings.word2vec_reduction

# FIG_SAVE=true FIG_SHOW=true python -m app.embeddings.word2vec_reduction
```
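
Both scripts write their outputs under the word2vec results directory. A minimal sketch of loading the exported vectors for downstream analysis, assuming `RESULTS_DIRPATH` resolves to the local `results/` folder (as the `.gitignore` entries suggest):

```python
# a minimal sketch, assuming RESULTS_DIRPATH resolves to the local results/ folder
import os
from pandas import read_csv

word2vec_dirpath = os.path.join("results", "embeddings", "word2vec")

# word vectors: one row per vocabulary token (100 embedding columns plus a "word_count" column)
words_df = read_csv(os.path.join(word2vec_dirpath, "word_vectors.csv"), index_col="token")

# document vectors: one row per user (the mean of that user's in-vocabulary word vectors)
docs_df = read_csv(os.path.join(word2vec_dirpath, "document_vectors.csv"), index_col="user_id")

print(words_df.shape, docs_df.shape)
```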
189 changes: 189 additions & 0 deletions app/embeddings/word2vec.py
@@ -0,0 +1,189 @@

import os
import shutil
from functools import cached_property
from pprint import pprint

#from datetime import datetime
from itertools import chain
from collections import Counter

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess as tokenizer
from pandas import DataFrame, Series
import numpy as np

from app import RESULTS_DIRPATH
from app.classification import Y_COLS

WORD2VEC_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "embeddings", "word2vec")
#WORD2VEC_DESTRUCTIVE = bool(os.getenv("WORD2VEC_DESTRUCTIVE", default="false") == 'true')

#VECTOR_LENGTH = 100


class WordPipe:
def __init__(self, corpus, tokenizer=tokenizer, results_dirpath=WORD2VEC_RESULTS_DIRPATH): # destructive=WORD2VEC_DESTRUCTIVE
"""Param corpus a pandas series of arrays (tokens for each document)"""

self.corpus = corpus
self.tokenizer = tokenizer

#self.destructive = bool(destructive)
self.results_dirpath = results_dirpath
        self.model_filepath = os.path.join(self.results_dirpath, "w2v.model")
#self.kv_filepath = os.path.join(self.results_dirpath, f"w2v.kv")
self.word_vectors_csv_filepath = os.path.join(self.results_dirpath, "word_vectors.csv")
self.document_vectors_csv_filepath = os.path.join(self.results_dirpath, "document_vectors.csv")


@cached_property
def corpus_tokens(self):
        return self.corpus.apply(self.tokenizer)

@cached_property
def word_counts(self):
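        # flatten the per-document token lists into one list and count occurrences of each token corpus-wide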
all_words = list(chain.from_iterable(self.corpus_tokens)) # h/t chat gpt for this one
word_counter = Counter(all_words)
return Series(word_counter.values(), index=word_counter.keys(), name="word_count")


def perform(self):
        # TOKEN ANALYSIS (SIDE QUEST)
print(self.word_counts.sort_values(ascending=False).head())

self.load_or_train_model()
print("WORDS:", len(self.words))

print("WORD VECTORS:", self.word_vectors_df.shape) # 100 columns, default vector_size=100
self.save_word_vectors()

print("DOCUMENT VECTORS:", self.document_vectors.shape)
self.save_document_vectors()


def load_or_train_model(self, vector_size=100, window=10, min_count=2, workers=4):
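        # gensim Word2Vec hyperparameters:
        #   vector_size: dimensionality of the word vectors
        #   window: max distance between the current and the predicted word within a document
        #   min_count: ignore tokens with total corpus frequency lower than this
        #   workers: number of worker threads used during training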
#if self.destructive:
# print("----------------")
# print("DESTRUCTIVE MODE...")
# #shutil.rmtree(self.results_dirpath)
# os.removedirs()

os.makedirs(self.results_dirpath, exist_ok=True)

if os.path.exists(self.model_filepath):
print("----------------")
print("LOADING MODEL FROM FILE...")
print(self.model_filepath)
            self.model = Word2Vec.load(self.model_filepath)
            print(self.model)
            #print(type(self.model))
            return self.model # model already trained; skip vocab building and re-training
else:
print("----------------")
print("INITIALIZING NEW MODEL...")
self.model = Word2Vec(window=window, min_count=min_count, workers=workers, vector_size=vector_size)
print(self.model)

print("----------------")
print("VOCAB...")
self.model.build_vocab(self.corpus_tokens) # progress_per=1000
#print("N SAMPLES:", model.corpus_count)
#print("EPOCHS:", model.epochs)

print("----------------")
print("TRAINING...")
self.model.train(self.corpus_tokens, total_examples=self.model.corpus_count, epochs=self.model.epochs)
print(round(self.model.total_train_time, 0), "seconds")

print("----------------")
print("SAVING...")
self.model.save(self.model_filepath)
#self.model.wv.save(self.vectors_filepath)

return self.model

# AVAILABLE AFTER TRAINING:

    # WORD ANALYSIS

@property
def words(self):
return self.model.wv.index_to_key

@property
def word_vectors(self):
return self.model.wv.vectors

@property
def word_vectors_df(self):
return DataFrame(self.word_vectors, index=self.words)

@cached_property
def words_df(self):
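        # join each word's embedding with its corpus frequency; the inner merge keeps only
        # tokens that made it into the model's vocabulary (i.e. passed the min_count filter)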
words_df = self.word_vectors_df.merge(self.word_counts, how="inner", left_index=True, right_index=True)
words_df.index.name = "token"
return words_df

def save_word_vectors(self):
self.words_df.to_csv(self.word_vectors_csv_filepath, index=True)

# DOCUMENT ANALYSIS

def infer_document_vector(self, tokens):
""""Gets average vector for each set of tokens."""
# Filter tokens that are in the model's vocabulary
tokens = [token for token in tokens if token in self.model.wv.key_to_index]
if any(tokens):
# Calculate the average vector for the tokens in the document
doc_vector = np.mean([self.model.wv[token] for token in tokens], axis=0)
else:
# If none of the tokens are in the model's vocabulary, return a zero vector
doc_vector = np.zeros(self.model.vector_size)
return doc_vector

@cached_property
def document_vectors(self):
return self.corpus_tokens.apply(self.infer_document_vector)

@cached_property
def document_vectors_df(self, index_name="user_id"):
        # UNPACK EMBEDDINGS TO THEIR OWN COLUMNS
docs_df = DataFrame(self.document_vectors.values.tolist())
docs_df.columns = [str(i) for i in range(0, len(docs_df.columns))]
docs_df.index = self.corpus_tokens.index
docs_df.index.name = index_name
return docs_df

def save_document_vectors(self):
self.document_vectors_df.to_csv(self.document_vectors_csv_filepath, index=True)


if __name__ == "__main__":


from app.dataset import Dataset

ds = Dataset()
df = ds.df

#df["tokens"] = df["tweet_texts"].apply(tokenizer)
#print(df["tokens"].head())

wp = WordPipe(corpus=df["tweet_texts"])
wp.perform()

    # INVESTIGATION
# https://radimrehurek.com/gensim/models/keyedvectors.html
wv = wp.model.wv #> gensim.models.keyedvectors.KeyedVectors
    print(len(wv)) #> 34,729 ORIGINAL ( ______ STOPWORD-REMOVED)

#breakpoint()
trumplike = wv.most_similar("realdonaldtrump", topn=10)
pprint(trumplike)
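    #> most_similar returns a list of (token, cosine_similarity) tuples for the 10 nearest neighbors, e.g. [("trump", 0.87), ...] (illustrative values)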

#wv.similarity(w1="impeachment", w2="sham")
    #wv.similarity(w1="impeachment", w2="just")
    #wv.similarity(w1="impeachment", w2="fair")
    #wv.similarity(w1="impeachment", w2="unfair")
    #wv.similarity(w1="realdonaldtrump", w2="guilty")
    #wv.similarity(w1="realdonaldtrump", w2="innocent")
125 changes: 125 additions & 0 deletions app/embeddings/word2vec_reduction.py
@@ -0,0 +1,125 @@
import os
from pandas import DataFrame

from app.reduction.pipeline import ReductionPipeline, REDUCER_TYPE, N_COMPONENTS
from app.embeddings.word2vec import WORD2VEC_RESULTS_DIRPATH, WordPipe


class WordVectorReductionPipeline(ReductionPipeline):

def __init__(self, x, results_dirpath,
reducer_type=REDUCER_TYPE, n_components=N_COMPONENTS,
labels_df=None, #x_scale=X_SCALE,
):

self.x = x
self.labels_df = labels_df

self.reducer_type = reducer_type
self.n_components = n_components
self.results_dirpath = results_dirpath
os.makedirs(self.results_dirpath, exist_ok=True)

self.reducer_name = {"PCA": "pca", "T-SNE": "tsne", "UMAP": "umap"}[self.reducer_type]

self.reducer = None
self.embeddings = None
self.embeddings_df = None
self.loadings = None
self.loadings_df = None


def save_embeddings(self):
"""
Save a slim copy of the embeddings to CSV (just user_id and component values).
With the goal of merging all the results into a single file later.
"""
csv_filepath = os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}_embeddings.csv")
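        # e.g. "pca_2_embeddings.csv" when reducer_type="PCA" and n_components=2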

results_df = self.embeddings_df.copy()
#results_df.index = self.x.index
results_df.index.name = "token"
#results_df["token"] = self.x.index

for colname in self.component_names:
# rename column to include info about which method produced it:
results_df.rename(columns={colname: f"{self.reducer_name}_{self.n_components}_{colname}"}, inplace=True)
results_df.to_csv(csv_filepath, index=True)



if __name__ == "__main__":

from app.dataset import Dataset
from app.colors import COLORS_MAP, CATEGORY_ORDERS

ds = Dataset()
df = ds.df

wp = WordPipe(corpus=df["tweet_texts"])
wp.load_or_train_model()

print("------------")
print("WORD EMBEDDINGS...")

    word_results_dirpath = os.path.join(wp.results_dirpath, "word_reduction")
    word_labels_df = wp.words_df[["word_count"]]
    for reducer_type in ["PCA", "T-SNE", "UMAP"]:
        print(reducer_type)

        drp = WordVectorReductionPipeline(x=wp.word_vectors_df, labels_df=word_labels_df, reducer_type=reducer_type, n_components=2, results_dirpath=word_results_dirpath)
drp.perform()
drp.save_embeddings()

drp.embeddings_df["token"] = drp.embeddings_df.index
drp.plot_embeddings(hover_data=["token", "word_count"], subtitle="Word2Vec Word Embeddings")
# oh this is not that interesting unless we perform stopword removal
#TOP_N = 250
#drp.embeddings_df.sort_values(by=["word_count"], ascending=False, inplace=True) # it is already sorted, but just to be sure
#drp.embeddings_df = drp.embeddings_df.head(TOP_N)
#drp.plot_embeddings(size="word_count", hover_data=["token", "word_count"]) # subtitle=f"Top {TOP_N} Words"

print("------------")
print("DOCUMENT EMBEDDINGS...")

doc_results_dirpath = os.path.join(wp.results_dirpath, "doc_reduction")
doc_labels_df = ds.labels.copy()
for reducer_type in ["PCA", "T-SNE", "UMAP"]:
print(reducer_type)

drp = WordVectorReductionPipeline(x=wp.document_vectors_df, labels_df=doc_labels_df, reducer_type=reducer_type, n_components=2, results_dirpath=doc_results_dirpath)
drp.perform()
drp.save_embeddings()

subtitle = "Word2Vec Document Embeddings (User Tweet Timelines)"
drp.plot_embeddings(subtitle=subtitle)
#drp.plot_embeddings(subtitle=subtitle, color="bot_label")
#drp.plot_embeddings(subtitle=subtitle, color="opinion_community")
#drp.plot_embeddings(subtitle=subtitle, color="toxic_label")
#drp.plot_embeddings(subtitle=subtitle, color="fact_label")
#drp.plot_embeddings(subtitle=subtitle, color="fourway_label")

for groupby_col in [
"bot_label", "opinion_label", "bom_overall_label", "bom_astroturf_label",
"toxic_label", "factual_label",
"fourway_label", #"sixway_label",
]:
color_map = COLORS_MAP[groupby_col]
category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}

results_dirpath = os.path.join(doc_results_dirpath, groupby_col)
os.makedirs(results_dirpath, exist_ok=True)

drp.plot_embeddings(color=groupby_col, color_map=color_map,
category_orders=category_orders,
#hover_data=["user_id", "bot_label"],
#fig_show=True, fig_save=True,
results_dirpath=results_dirpath, subtitle=subtitle
)

drp.plot_centroids(groupby_col=groupby_col, color_map=color_map,
category_orders=category_orders,
#hover_data=["user_id", "bot_label"],
#fig_show=True, fig_save=True
results_dirpath=results_dirpath, subtitle=subtitle
)