Commit
* Word2Vec Word Embeddings
* Word2Vec User Embeddings
* Word2Vec Results Page
Showing 110 changed files with 128,249 additions and 9 deletions.
@@ -0,0 +1,24 @@
## Text Embeddings Comparison

### OpenAI

See notebooks.

### Word2Vec

```sh
python -m app.embeddings.word2vec

# WORD2VEC_DESTRUCTIVE=true python -m app.embeddings.word2vec

# FIG_SAVE=true FIG_SHOW=true python -m app.embeddings.word2vec
```

Perform dimensionality reduction on the resulting word and document embeddings, respectively:

```sh
python -m app.embeddings.word2vec_reduction

# FIG_SAVE=true FIG_SHOW=true python -m app.embeddings.word2vec_reduction
```
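The word2vec step saves `word_vectors.csv` and `document_vectors.csv` to its results directory (see the word2vec module below). A minimal inspection sketch, assuming the results land under a local `results/embeddings/word2vec/` folder (the actual `WORD2VEC_RESULTS_DIRPATH` depends on `RESULTS_DIRPATH`):

```python
# Hypothetical inspection sketch: load the CSVs written by the word2vec step.
from pandas import read_csv

RESULTS = "results/embeddings/word2vec"  # assumed location of WORD2VEC_RESULTS_DIRPATH

words_df = read_csv(f"{RESULTS}/word_vectors.csv", index_col="token")
docs_df = read_csv(f"{RESULTS}/document_vectors.csv", index_col="user_id")

print(words_df.shape)  # one row per vocabulary token: vector columns plus a word_count column
print(docs_df.shape)   # one row per user: one column per embedding dimension (default 100)
```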
@@ -0,0 +1,189 @@
```python
import os
import shutil
from functools import cached_property
from pprint import pprint

#from datetime import datetime
from itertools import chain
from collections import Counter

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess as tokenizer
from pandas import DataFrame, Series
import numpy as np

from app import RESULTS_DIRPATH
from app.classification import Y_COLS

WORD2VEC_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "embeddings", "word2vec")
#WORD2VEC_DESTRUCTIVE = bool(os.getenv("WORD2VEC_DESTRUCTIVE", default="false") == 'true')

#VECTOR_LENGTH = 100


class WordPipe:
    def __init__(self, corpus, tokenizer=tokenizer, results_dirpath=WORD2VEC_RESULTS_DIRPATH): # destructive=WORD2VEC_DESTRUCTIVE
        """Param corpus: a pandas Series of document texts (tokens are derived via the tokenizer)."""

        self.corpus = corpus
        self.tokenizer = tokenizer

        #self.destructive = bool(destructive)
        self.results_dirpath = results_dirpath
        self.model_filepath = os.path.join(self.results_dirpath, "w2v.model")
        #self.kv_filepath = os.path.join(self.results_dirpath, "w2v.kv")
        self.word_vectors_csv_filepath = os.path.join(self.results_dirpath, "word_vectors.csv")
        self.document_vectors_csv_filepath = os.path.join(self.results_dirpath, "document_vectors.csv")

    @cached_property
    def corpus_tokens(self):
        return self.corpus.apply(self.tokenizer)

    @cached_property
    def word_counts(self):
        all_words = list(chain.from_iterable(self.corpus_tokens)) # h/t chat gpt for this one
        word_counter = Counter(all_words)
        return Series(word_counter.values(), index=word_counter.keys(), name="word_count")

    def perform(self):
        # TOKEN ANALYSIS (SIDE QUEST)
        print(self.word_counts.sort_values(ascending=False).head())

        self.load_or_train_model()
        print("WORDS:", len(self.words))

        print("WORD VECTORS:", self.word_vectors_df.shape) # 100 columns, default vector_size=100
        self.save_word_vectors()

        print("DOCUMENT VECTORS:", self.document_vectors.shape)
        self.save_document_vectors()

    def load_or_train_model(self, vector_size=100, window=10, min_count=2, workers=4):
        #if self.destructive:
        #    print("----------------")
        #    print("DESTRUCTIVE MODE...")
        #    #shutil.rmtree(self.results_dirpath)
        #    os.removedirs()

        os.makedirs(self.results_dirpath, exist_ok=True)

        if os.path.exists(self.model_filepath):
            print("----------------")
            print("LOADING MODEL FROM FILE...")
            print(self.model_filepath)
            self.model = Word2Vec.load(self.model_filepath)
            print(self.model)
            #print(type(self.model))
        else:
            print("----------------")
            print("INITIALIZING NEW MODEL...")
            self.model = Word2Vec(window=window, min_count=min_count, workers=workers, vector_size=vector_size)
            print(self.model)

            print("----------------")
            print("VOCAB...")
            self.model.build_vocab(self.corpus_tokens) # progress_per=1000
            #print("N SAMPLES:", model.corpus_count)
            #print("EPOCHS:", model.epochs)

            print("----------------")
            print("TRAINING...")
            self.model.train(self.corpus_tokens, total_examples=self.model.corpus_count, epochs=self.model.epochs)
            print(round(self.model.total_train_time, 0), "seconds")

            print("----------------")
            print("SAVING...")
            self.model.save(self.model_filepath)
            #self.model.wv.save(self.vectors_filepath)

        return self.model

    # AVAILABLE AFTER TRAINING:

    # WORD ANALYSIS

    @property
    def words(self):
        return self.model.wv.index_to_key

    @property
    def word_vectors(self):
        return self.model.wv.vectors

    @property
    def word_vectors_df(self):
        return DataFrame(self.word_vectors, index=self.words)

    @cached_property
    def words_df(self):
        words_df = self.word_vectors_df.merge(self.word_counts, how="inner", left_index=True, right_index=True)
        words_df.index.name = "token"
        return words_df

    def save_word_vectors(self):
        self.words_df.to_csv(self.word_vectors_csv_filepath, index=True)

    # DOCUMENT ANALYSIS

    def infer_document_vector(self, tokens):
        """Gets the average word vector for a given set of tokens."""
        # filter for tokens that are in the model's vocabulary:
        tokens = [token for token in tokens if token in self.model.wv.key_to_index]
        if any(tokens):
            # calculate the average vector for the tokens in the document:
            doc_vector = np.mean([self.model.wv[token] for token in tokens], axis=0)
        else:
            # if none of the tokens are in the model's vocabulary, return a zero vector:
            doc_vector = np.zeros(self.model.vector_size)
        return doc_vector

    @cached_property
    def document_vectors(self):
        return self.corpus_tokens.apply(self.infer_document_vector)

    @cached_property
    def document_vectors_df(self, index_name="user_id"):
        # UNPACK EMBEDDINGS TO THEIR OWN COLUMNS
        docs_df = DataFrame(self.document_vectors.values.tolist())
        docs_df.columns = [str(i) for i in range(0, len(docs_df.columns))]
        docs_df.index = self.corpus_tokens.index
        docs_df.index.name = index_name
        return docs_df

    def save_document_vectors(self):
        self.document_vectors_df.to_csv(self.document_vectors_csv_filepath, index=True)


if __name__ == "__main__":

    from app.dataset import Dataset

    ds = Dataset()
    df = ds.df

    #df["tokens"] = df["tweet_texts"].apply(tokenizer)
    #print(df["tokens"].head())

    wp = WordPipe(corpus=df["tweet_texts"])
    wp.perform()

    # INVESTIGATION
    # https://radimrehurek.com/gensim/models/keyedvectors.html
    wv = wp.model.wv #> gensim.models.keyedvectors.KeyedVectors
    print(len(wv)) #> 34,729 ORIGINAL ( ______ STOPWORD-REMOVED)

    #breakpoint()
    trumplike = wv.most_similar("realdonaldtrump", topn=10)
    pprint(trumplike)

    #wv.similarity(w1="impeachment", w2="sham")
    #wv.similarity(w1="impeachment", w2="just")
    #wv.similarity(w1="impeachment", w2="fair")
    #wv.similarity(w1="impeachment", w2="unfair")
    #wv.similarity(w1="realdonaldtrump", w2="guilty")
    #wv.similarity(w1="realdonaldtrump", w2="innocent")
```
@@ -0,0 +1,125 @@
```python
import os
from pandas import DataFrame

from app.reduction.pipeline import ReductionPipeline, REDUCER_TYPE, N_COMPONENTS
from app.embeddings.word2vec import WORD2VEC_RESULTS_DIRPATH, WordPipe


class WordVectorReductionPipeline(ReductionPipeline):

    def __init__(self, x, results_dirpath,
                 reducer_type=REDUCER_TYPE, n_components=N_COMPONENTS,
                 labels_df=None, #x_scale=X_SCALE,
                 ):

        self.x = x
        self.labels_df = labels_df

        self.reducer_type = reducer_type
        self.n_components = n_components
        self.results_dirpath = results_dirpath
        os.makedirs(self.results_dirpath, exist_ok=True)

        self.reducer_name = {"PCA": "pca", "T-SNE": "tsne", "UMAP": "umap"}[self.reducer_type]

        self.reducer = None
        self.embeddings = None
        self.embeddings_df = None
        self.loadings = None
        self.loadings_df = None

    def save_embeddings(self):
        """
        Save a slim copy of the embeddings to CSV (just the index and component values),
        with the goal of merging all the results into a single file later.
        """
        csv_filepath = os.path.join(self.results_dirpath, f"{self.reducer_name}_{self.n_components}_embeddings.csv")

        results_df = self.embeddings_df.copy()
        #results_df.index = self.x.index
        results_df.index.name = "token"
        #results_df["token"] = self.x.index

        for colname in self.component_names:
            # rename column to include info about which method produced it:
            results_df.rename(columns={colname: f"{self.reducer_name}_{self.n_components}_{colname}"}, inplace=True)
        results_df.to_csv(csv_filepath, index=True)


if __name__ == "__main__":

    from app.dataset import Dataset
    from app.colors import COLORS_MAP, CATEGORY_ORDERS

    ds = Dataset()
    df = ds.df

    wp = WordPipe(corpus=df["tweet_texts"])
    wp.load_or_train_model()

    print("------------")
    print("WORD EMBEDDINGS...")

    word_results_filepath = os.path.join(wp.results_dirpath, "word_reduction")
    word_labels_df = wp.words_df[["word_count"]]
    for reducer_type in ["PCA", "T-SNE", "UMAP"]:
        print(reducer_type)

        drp = WordVectorReductionPipeline(x=wp.word_vectors_df, labels_df=word_labels_df, reducer_type=reducer_type, n_components=2, results_dirpath=word_results_filepath)
        drp.perform()
        drp.save_embeddings()

        drp.embeddings_df["token"] = drp.embeddings_df.index
        drp.plot_embeddings(hover_data=["token", "word_count"], subtitle="Word2Vec Word Embeddings")
        # oh this is not that interesting unless we perform stopword removal
        #TOP_N = 250
        #drp.embeddings_df.sort_values(by=["word_count"], ascending=False, inplace=True) # it is already sorted, but just to be sure
        #drp.embeddings_df = drp.embeddings_df.head(TOP_N)
        #drp.plot_embeddings(size="word_count", hover_data=["token", "word_count"]) # subtitle=f"Top {TOP_N} Words"

    print("------------")
    print("DOCUMENT EMBEDDINGS...")

    doc_results_dirpath = os.path.join(wp.results_dirpath, "doc_reduction")
    doc_labels_df = ds.labels.copy()
    for reducer_type in ["PCA", "T-SNE", "UMAP"]:
        print(reducer_type)

        drp = WordVectorReductionPipeline(x=wp.document_vectors_df, labels_df=doc_labels_df, reducer_type=reducer_type, n_components=2, results_dirpath=doc_results_dirpath)
        drp.perform()
        drp.save_embeddings()

        subtitle = "Word2Vec Document Embeddings (User Tweet Timelines)"
        drp.plot_embeddings(subtitle=subtitle)
        #drp.plot_embeddings(subtitle=subtitle, color="bot_label")
        #drp.plot_embeddings(subtitle=subtitle, color="opinion_community")
        #drp.plot_embeddings(subtitle=subtitle, color="toxic_label")
        #drp.plot_embeddings(subtitle=subtitle, color="fact_label")
        #drp.plot_embeddings(subtitle=subtitle, color="fourway_label")

        for groupby_col in [
            "bot_label", "opinion_label", "bom_overall_label", "bom_astroturf_label",
            "toxic_label", "factual_label",
            "fourway_label", #"sixway_label",
        ]:
            color_map = COLORS_MAP[groupby_col]
            category_orders = {groupby_col: CATEGORY_ORDERS[groupby_col]}

            results_dirpath = os.path.join(doc_results_dirpath, groupby_col)
            os.makedirs(results_dirpath, exist_ok=True)

            drp.plot_embeddings(color=groupby_col, color_map=color_map,
                                category_orders=category_orders,
                                #hover_data=["user_id", "bot_label"],
                                #fig_show=True, fig_save=True,
                                results_dirpath=results_dirpath, subtitle=subtitle
                                )

            drp.plot_centroids(groupby_col=groupby_col, color_map=color_map,
                               category_orders=category_orders,
                               #hover_data=["user_id", "bot_label"],
                               #fig_show=True, fig_save=True,
                               results_dirpath=results_dirpath, subtitle=subtitle
                               )
```
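The `save_embeddings` docstring above mentions merging all the results into a single file later; a minimal sketch of that merge, assuming the per-reducer CSVs follow the `{reducer_name}_{n_components}_embeddings.csv` naming used above and sit in an assumed `doc_reduction` results folder, could look like:

```python
# Hypothetical merge sketch (not part of this commit): combine the per-reducer
# embedding CSVs written by save_embeddings() into one wide DataFrame.
import os
from functools import reduce
from pandas import read_csv, merge

results_dirpath = "results/embeddings/word2vec/doc_reduction"  # assumed location
filenames = ["pca_2_embeddings.csv", "tsne_2_embeddings.csv", "umap_2_embeddings.csv"]
frames = [read_csv(os.path.join(results_dirpath, fname)) for fname in filenames]

# each CSV shares the same index column written by save_embeddings(), so merge on it:
index_col = frames[0].columns[0]
merged = reduce(lambda left, right: merge(left, right, on=index_col, how="inner"), frames)
print(merged.shape)
```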