diff --git a/.gitignore b/.gitignore index 0d6142f..a1f0790 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ google-credentials.json google-credentials-shared.json +data/*.csv data/*/*.csv data/*/*.csv.gz diff --git a/README.md b/README.md index f64396a..eebb3da 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,8 @@ OPENAI_API_KEY="sk__________" GOOGLE_APPLICATION_CREDENTIALS="/path/to/openai-embeddings-2023/google-credentials.json" BUCKET_NAME="my-bucket" + +DATASET_ADDRESS="my_project.my_dataset" ``` ## Usage @@ -62,7 +64,7 @@ python -m app.openai_service ``` -### Dataset Loading +### Embeddings per User (v1) Demonstrate ability to load the dataset: @@ -70,8 +72,6 @@ Demonstrate ability to load the dataset: python -m app.dataset ``` -### Data Analysis - Perform machine learning and other analyses on the data: OpenAI Embeddings: @@ -87,6 +87,13 @@ Word2Vec Embeddings: + [Classification](app/word2vec_classification/README.md) +### Embeddings per Tweet (v1) + +OpenAI Embeddings: + + + [Fetching Embeddings](app/openai_embeddings/per_tweet/README.md) + + ## Testing ```sh diff --git a/app/bq_service.py b/app/bq_service.py new file mode 100644 index 0000000..2574008 --- /dev/null +++ b/app/bq_service.py @@ -0,0 +1,82 @@ +# https://raw.githubusercontent.com/s2t2/tweet-analysis-2023/main/app/bq_service.py + +import os +from datetime import datetime + +from dotenv import load_dotenv +from google.cloud import bigquery +#from google.cloud.bigquery import QueryJobConfig, ScalarQueryParameter +from pandas import DataFrame + +from app.google_apis import GOOGLE_APPLICATION_CREDENTIALS # implicit check by google.cloud + +load_dotenv() + +#GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud + +# used by child classes only, defined here for convenience +DATASET_ADDRESS = os.getenv("DATASET_ADDRESS", default="tweet-collector-py.impeachment_development") # "MY_PROJECT.MY_DATASET" + + +class BigQueryService(): + + def __init__(self, client=None, dataset_address=DATASET_ADDRESS): + self.client = client or bigquery.Client() + self.dataset_address = dataset_address + + def execute_query(self, sql, verbose=True): + if verbose == True: + print(sql) + job = self.client.query(sql) + return job.result() + + def query_to_df(self, sql, verbose=True): + """high-level wrapper to return a DataFrame""" + results = self.execute_query(sql, verbose=verbose) + records = [dict(row) for row in list(results)] + df = DataFrame(records) + return df + + @staticmethod + def split_into_batches(my_list, batch_size=10_000): + """Splits a list into evenly sized batches""" + # h/t: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks + for i in range(0, len(my_list), batch_size): + yield my_list[i : i + batch_size] + + @staticmethod + def generate_timestamp(dt=None): + """Formats datetime object for storing in BigQuery. Uses current time by default. 
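+
+        Example (illustrative):
+            BigQueryService.generate_timestamp(datetime(2023, 7, 24, 12, 0, 5)) #=> "2023-07-24 12:00:05"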
""" + dt = dt or datetime.now() + return dt.strftime("%Y-%m-%d %H:%M:%S") + + def insert_records_in_batches(self, table, records, batch_size=5_000): + """ + Inserts records in batches because attempting to insert too many rows at once + may result in google.api_core.exceptions.BadRequest: 400 + + Params: + table (table ID string, Table, or TableReference) + records (list of dictionaries) + """ + rows_to_insert = [list(d.values()) for d in records] + #errors = self.client.insert_rows(table, rows_to_insert) + errors = [] + batches = list(BigQueryService.split_into_batches(rows_to_insert, batch_size=batch_size)) + for batch in batches: + errors += self.client.insert_rows(table, batch) + return errors + + + +if __name__ == "__main__": + + service = BigQueryService() + client = service.client + print("PROJECT:", client.project) + + print("DATASETS:") + datasets = list(client.list_datasets()) + for ds in datasets: + #print("...", ds.project, ds.dataset_id) + print("...", ds.reference) diff --git a/app/google_apis.py b/app/google_apis.py new file mode 100644 index 0000000..02ddc60 --- /dev/null +++ b/app/google_apis.py @@ -0,0 +1,11 @@ + + + + +import os + +from dotenv import load_dotenv + +load_dotenv() + +GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud diff --git a/app/model_storage.py b/app/model_storage.py index a18803f..ce153f4 100644 --- a/app/model_storage.py +++ b/app/model_storage.py @@ -5,9 +5,11 @@ from google.cloud import storage as gcs from dotenv import load_dotenv +from app.google_apis import GOOGLE_APPLICATION_CREDENTIALS # implicit check by google.cloud + load_dotenv() -GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud for env var +#GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud for env var #PROJECT_ID = os.getenv("GOOGLE_PROJECT_NAME") # "my-project" BUCKET_NAME = os.getenv("BUCKET_NAME") # "my-bucket" needs to be globally unique! diff --git a/app/openai_embeddings_v2/README.md b/app/openai_embeddings_v2/README.md new file mode 100644 index 0000000..a25d5a3 --- /dev/null +++ b/app/openai_embeddings_v2/README.md @@ -0,0 +1,226 @@ +# OpenAI Embeddings (v2) + +Get embeddings, not only per user, but also per tweet, so we can compare the two approaches. Pull a new sample of tweets for the users we have been analyzing, but this time make sure to keep track of which tweets are being used, which will aid comparisons. + +## Setup + +Migrations, as necessary. 
Here we create a table of all tweets from each user in the sample: + +```sql +--CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample_max_10` as ( +--CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample_max_50` as ( +CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample` as ( + WITH ranked_tweets AS ( + SELECT + u.user_id, t.status_id, t.status_text, t.created_at, + ROW_NUMBER() OVER (PARTITION BY u.user_id ORDER BY RAND()) AS row_num + FROM ( + SELECT DISTINCT user_id + FROM `tweet-collector-py.impeachment_production.botometer_sample_openai_tweet_embeddings_20230724` + ) u + JOIN `tweet-collector-py.impeachment_production.tweets_v2` t on t.user_id = u.user_id + ORDER BY u.user_id, t.created_at + --LIMIT 10 + ) + + SELECT user_id, row_num, + status_id, status_text, created_at, + FROM ranked_tweets + -- WHERE row_num <= 10 -- MAX_TWEETS_PER_USER + -- WHERE row_num <= 50 -- MAX_TWEETS_PER_USER + +); +``` + + +How to sample from this table (choose a `MAX_TWEETS_PER_USER`, which we set as 50 by default): + +```sql +SELECT + count(distinct user_id) as user_count -- 7566 + ,count(distinct status_id) as status_count -- 183727 +FROM `tweet-collector-py.impeachment_production.botometer_sample` +WHERE row_num <= 50 -- MAX_TWEETS_PER_USER +``` + +```sql +CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample_max_50` as ( + SELECT * + FROM `tweet-collector-py.impeachment_production.botometer_sample` + WHERE row_num <= 50 + ORDER BY user_id, row_num +) +``` + +The 7,566 users in this sample have 183,727 tweets. + +Unique table of texts with identifiers: + +```sql +DROP TABLE IF EXISTS `tweet-collector-py.impeachment_production.botometer_sample_max_50_texts_map`; +CREATE TABLE IF NOT EXISTS `tweet-collector-py.impeachment_production.botometer_sample_max_50_texts_map` as ( + --WITH texts_map as ( + SELECT --s.user_id, s.row_num, s.status_id, s.status_text, s.created_at + ROW_NUMBER() OVER () AS status_text_id + ,s.status_text + ,count(DISTINCT s.status_id) as status_count + ,array_agg(DISTINCT s.status_id) as status_ids + ,count(DISTINCT s.user_id) as user_count + ,array_agg(DISTINCT s.user_id) as user_ids + FROM `tweet-collector-py.impeachment_production.botometer_sample` s + WHERE s.row_num <= 50 -- MAX_TWEETS_PER_USER + GROUP BY 2 + --ORDER BY status_count desc + --) + --SELECT status_text, status_count, status_id + --FROM texts_map, + --UNNEST(status_ids) AS status_id +) +``` + +Of the 183,727 tweets in this sample, there are 80,205 unique texts. 
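+
+Before fetching embeddings, it can help to sanity-check this dedup ratio from Python. The sketch below is illustrative (it is not one of the scripts in this diff); it uses the `BigQueryService` added in `app/bq_service.py` and assumes the two tables created above exist under your `DATASET_ADDRESS`:
+
+```python
+from app.bq_service import BigQueryService
+
+bq = BigQueryService()
+
+sql = f"""
+    SELECT
+        count(distinct s.status_id) as status_count
+        ,count(distinct txt.status_text_id) as text_count
+    FROM `{bq.dataset_address}.botometer_sample` s
+    JOIN `{bq.dataset_address}.botometer_sample_max_50_texts_map` txt
+        ON s.status_text = txt.status_text
+    WHERE s.row_num <= 50 -- MAX_TWEETS_PER_USER
+"""
+
+df = bq.query_to_df(sql)
+print(df)  # expect the 183,727 statuses to collapse to 80,205 unique texts
+```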
+ +Migrate table to receive text embeddings: + +```sql +CREATE TABLE IF NOT EXISTS `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_text_embeddings` ( + status_text_id INT64, + embeddings ARRAY +) +``` + +Migrate table to receive user embeddings: + +```sql +DROP TABLE IF EXISTS `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_user_embeddings`; +CREATE TABLE IF NOT EXISTS `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_user_embeddings` ( + user_id INT64, + embeddings ARRAY +) +``` + +## Embeddings + +### User level Embeddings + +Fetch user-level embeddings, and store in BQ: + +```sh +python -m app.openai_embeddings.per_user + +USERS_LIMIT=10 python -m app.openai_embeddings.per_user +USERS_LIMIT=100 python -m app.openai_embeddings.per_user +USERS_LIMIT=1000 python -m app.openai_embeddings.per_user +``` + +Monitoring the results: + +```sql +SELECT + count(distinct s.user_id) as user_count + ,count(distinct case when emb.user_id is not null then s.user_id end) as users_collected + ,count(distinct case when emb.user_id is not null then s.user_id end) / count(distinct s.user_id) as pct_collected +FROM `tweet-collector-py.impeachment_production.botometer_sample` s +LEFT JOIN `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_user_embeddings` emb + ON s.user_id = emb.user_id + +``` + + +### Tweet level Embeddings + +Fetch tweet-level embeddings, and store in BQ: + +```sh +python -m app.openai_embeddings.per_tweet.embeddings_job + +TEXTS_LIMIT=10 python -m app.openai_embeddings.per_tweet +TEXTS_LIMIT=1500 python -m app.openai_embeddings.per_tweet +TEXTS_LIMIT=10000 python -m app.openai_embeddings.per_tweet +TEXTS_LIMIT=250000 python -m app.openai_embeddings.per_tweet +``` + +Monitoring the results: + +```sql +SELECT count(distinct status_text_id) as text_count +FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_text_embeddings` emb +``` + + +Reconstruct table of embedding per status (as they were originally fetched for each distinct text): + + +```sql +CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings` as ( + WITH lookup_table as ( + SELECT txt.status_text_id ,status_id + FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_texts_map` txt, + UNNEST(txt.status_ids) as status_id + ) + + SELECT txt.status_id, txt.status_text_id, emb.embeddings + FROM lookup_table txt + JOIN `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_text_embeddings` emb + ON txt.status_text_id = emb.status_text_id + ORDER BY 2 + --LIMIT 10 + +) +``` + +Looks like we may have some duplicates, so update the table to remove dups: + +```sql +-- SELECT status_id, count(*) as row_count +-- FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings` +-- GROUP BY 1 +-- HAVING row_count > 1 +-- ORDER BY 2 DESC +-- -- 14652 example status ids: 1212493877673779200, 1212848708171321344, 1217970948529364992 + +-- SELECT status_id, status_text_id, count(*) as row_count +-- FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings` +-- GROUP BY 1,2 +-- HAVING row_count > 1 +-- ORDER BY 2 DESC +-- -- 14652 dups, example status ids: 1212493877673779200, 1212848708171321344, 1217970948529364992 + +CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings_v2` as ( + -- DE-DUPLICATED :-) + SELECT 
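+      -- any_value() arbitrarily keeps one embeddings array per (status_id, status_text_id)
+      -- pair, collapsing the duplicate rows identified above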
status_id, status_text_id, any_value(embeddings) as embeddings + FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings` + GROUP BY 1,2 +) + +--SELECT count(distinct status_id) as status_count +--FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings_v2` +---- 183727 +``` + +Add the user-level info back to the table for convenience of future queries. Can always not select it later. + +```sql +CREATE TABLE `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings_v3` as ( + SELECT s.user_id, s.status_id, s.status_text, s.created_at + , array_length(emb.embeddings) as embeds_length + ,emb.embeddings + FROM `tweet-collector-py.impeachment_production.botometer_sample` s + JOIN `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings_v2` emb + ON s.status_id = emb.status_id + -- LIMIT 10000 +) + +``` + +The contents of the embeddings alone are greater than the BQ export limit of 1GB, so we have to [export to GCS](https://cloud.google.com/bigquery/docs/exporting-data), or stream via notebook. + + + +## Exporting CSV files to Drive + +See [notebooks](/notebooks/openai_embeddings_v2/README.md). + +## Analysis + +See [notebooks](/notebooks/openai_embeddings_v2/README.md). diff --git a/app/openai_embeddings_v2/per_tweet.py b/app/openai_embeddings_v2/per_tweet.py new file mode 100644 index 0000000..55d0b52 --- /dev/null +++ b/app/openai_embeddings_v2/per_tweet.py @@ -0,0 +1,66 @@ + + +import os +from dotenv import load_dotenv + +from app.bq_service import BigQueryService +from app.openai_service import OpenAIService + + +load_dotenv() + +TEXTS_LIMIT = os.getenv("TEXTS_LIMIT") + + +if __name__ == "__main__": + + bq = BigQueryService() + print(bq) + print("DATASET ADDRESS:", bq.dataset_address) + + print("---------------") + print("TEXTS...") + #print("LIMIT: ", TEXTS_LIMIT) + + sql = f""" + -- FETCH STATUSES WE HAVEN'T ALREADY RETRIEVED EMBEDDINGS FOR + SELECT DISTINCT txt.status_text_id, txt.status_text --, emb.status_text_id + FROM `{bq.dataset_address}.botometer_sample_max_50_texts_map` txt + LEFT JOIN `{bq.dataset_address}.botometer_sample_max_50_openai_text_embeddings` emb + ON emb.status_text_id = txt.status_text_id + WHERE emb.status_text_id IS NULL + ORDER BY txt.status_text_id + """ + + if TEXTS_LIMIT: + texts_limit = int(TEXTS_LIMIT) + sql += f" LIMIT {texts_limit} " + + df = bq.query_to_df(sql) + print(len(df)) + if df.empty: + print("NO MORE TEXTS TO PROCESS... GOODBYE!") + exit() + + print("---------------") + print("EMBEDDINGS...") + texts = df["status_text"].tolist() + + ai = OpenAIService() + embeddings = ai.get_embeddings_in_dynamic_batches(texts, batch_char_limit=15_000) + #print(len(embeddings)) + + df["embeddings"] = embeddings + records = df[["status_text_id", "embeddings"]].to_dict("records") + + print("---------------") + print("SAVING...") + + embeddings_table = bq.client.get_table(f"{bq.dataset_address}.botometer_sample_max_50_openai_text_embeddings") # API call! 
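+    # NOTE: the embeddings table must already exist (run the migrations in the README);
+    # otherwise get_table() raises google.api_core.exceptions.NotFound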
+ errors = bq.insert_records_in_batches(embeddings_table, records, batch_size=50) # running into google api issues with larger batches - there are so many embeddings for each row, so we lower the batch count substantially + if any(errors): + print("ERRORS:") + print(errors) + + print("---------------") + print("JOB COMPLETE!") diff --git a/app/openai_embeddings_v2/per_user.py b/app/openai_embeddings_v2/per_user.py new file mode 100644 index 0000000..b350c29 --- /dev/null +++ b/app/openai_embeddings_v2/per_user.py @@ -0,0 +1,82 @@ + + +import os +from dotenv import load_dotenv + +from app.bq_service import BigQueryService +from app.openai_service import OpenAIService + + +load_dotenv() + +USERS_LIMIT = os.getenv("USERS_LIMIT") + +MAX_TWEETS_PER_USER = 50 + + +if __name__ == "__main__": + + bq = BigQueryService() + print(bq) + print("DATASET ADDRESS:", bq.dataset_address) + + print("---------------") + print("USERS...") + #print("LIMIT: ", USERS_LIMIT) + + sql = f""" + -- FETCH USERS WE HAVEN'T ALREADY RETRIEVED EMBEDDINGS FOR + + WITH users_sample as ( + SELECT + s.user_id --,min(s.row_num) as row_min, max(s.row_num) as row_max + ,count(distinct s.status_id) as status_count_max_50 + ,array_agg(distinct s.status_id) as status_ids + ,string_agg(distinct s.status_text, " ") as status_texts + FROM `{bq.dataset_address}.botometer_sample` s + WHERE s.row_num <= {int(MAX_TWEETS_PER_USER)} + GROUP BY 1 + -- ORDER BY user_id + ) + + SELECT u.user_id, u.status_count_max_50, u.status_ids, u.status_texts + FROM users_sample u + LEFT JOIN `{bq.dataset_address}.botometer_sample_max_50_openai_user_embeddings` emb + ON u.user_id = emb.user_id + WHERE emb.user_id IS NULL + ORDER BY u.user_id + """ + + if USERS_LIMIT: + users_limit = int(USERS_LIMIT) + sql += f" LIMIT {users_limit} " + + df = bq.query_to_df(sql) + print(len(df)) + if df.empty: + print("NO MORE USERS TO PROCESS... GOODBYE!") + exit() + + print("---------------") + print("EMBEDDINGS...") + texts = df["status_texts"].tolist() + + ai = OpenAIService() + embeddings = ai.get_embeddings_in_dynamic_batches(texts, batch_char_limit=15_000) + #print(len(embeddings)) + + df["embeddings"] = embeddings + records = df[["user_id", "embeddings"]].to_dict("records") + + print("---------------") + print("SAVING...") + + embeddings_table_name = f"{bq.dataset_address}.botometer_sample_max_50_openai_user_embeddings" + embeddings_table = bq.client.get_table(embeddings_table_name) # API call! + errors = bq.insert_records_in_batches(embeddings_table, records, batch_size=50) # running into google api issues with larger batches - there are so many embeddings for each row, so we lower the batch count substantially + if any(errors): + print("ERRORS:") + print(errors) + + print("---------------") + print("JOB COMPLETE!") diff --git a/app/openai_service.py b/app/openai_service.py index 7f4a94c..79446fb 100644 --- a/app/openai_service.py +++ b/app/openai_service.py @@ -186,7 +186,7 @@ def get_embeddings_in_dynamic_batches(self, texts, batch_char_limit=30_000, slee embeddings = [] counter = 1 for texts_batch in dynamic_batches(texts, batch_char_limit=batch_char_limit): - print(counter, len(texts_batch)) + print("BATCH:", counter, "SIZE:", len(texts_batch)) # retry loop while True: try: @@ -197,6 +197,11 @@ def get_embeddings_in_dynamic_batches(self, texts, batch_char_limit=30_000, slee print(f"... Rate limit reached. Sleeping for {sleep_seconds} seconds.") sleep(sleep_seconds) # retry the same batch + except openai.error.ServiceUnavailableError as err: + print(f"... 
Service Unavailz. Sleeping for {sleep_seconds} seconds.") + print(err) + sleep(sleep_seconds) + # retry the same batch counter += 1 return embeddings diff --git a/notebooks/1_Botometer_Users_Sample_and_OpenAI_Embeddings_20230704.ipynb b/notebooks/openai_embeddings_v1/1_Botometer_Users_Sample_and_OpenAI_Embeddings_20230704.ipynb similarity index 100% rename from notebooks/1_Botometer_Users_Sample_and_OpenAI_Embeddings_20230704.ipynb rename to notebooks/openai_embeddings_v1/1_Botometer_Users_Sample_and_OpenAI_Embeddings_20230704.ipynb diff --git a/notebooks/1_botometer_users_sample_and_openai_embeddings_20230704.py b/notebooks/openai_embeddings_v1/1_botometer_users_sample_and_openai_embeddings_20230704.py similarity index 100% rename from notebooks/1_botometer_users_sample_and_openai_embeddings_20230704.py rename to notebooks/openai_embeddings_v1/1_botometer_users_sample_and_openai_embeddings_20230704.py diff --git a/notebooks/2_Embeddings_Data_Export.ipynb b/notebooks/openai_embeddings_v1/2_Embeddings_Data_Export.ipynb similarity index 100% rename from notebooks/2_Embeddings_Data_Export.ipynb rename to notebooks/openai_embeddings_v1/2_Embeddings_Data_Export.ipynb diff --git a/notebooks/2_embeddings_data_export.py b/notebooks/openai_embeddings_v1/2_embeddings_data_export.py similarity index 100% rename from notebooks/2_embeddings_data_export.py rename to notebooks/openai_embeddings_v1/2_embeddings_data_export.py diff --git a/notebooks/3_Merging_Remaining_BOM_Scores.ipynb b/notebooks/openai_embeddings_v1/3_Merging_Remaining_BOM_Scores.ipynb similarity index 100% rename from notebooks/3_Merging_Remaining_BOM_Scores.ipynb rename to notebooks/openai_embeddings_v1/3_Merging_Remaining_BOM_Scores.ipynb diff --git a/notebooks/3_merging_remaining_bom_scores.py b/notebooks/openai_embeddings_v1/3_merging_remaining_bom_scores.py similarity index 100% rename from notebooks/3_merging_remaining_bom_scores.py rename to notebooks/openai_embeddings_v1/3_merging_remaining_bom_scores.py diff --git a/notebooks/Analysis_Single_Results_File_v4.ipynb b/notebooks/openai_embeddings_v1/Analysis_Single_Results_File_v4.ipynb similarity index 100% rename from notebooks/Analysis_Single_Results_File_v4.ipynb rename to notebooks/openai_embeddings_v1/Analysis_Single_Results_File_v4.ipynb diff --git a/notebooks/README.md b/notebooks/openai_embeddings_v1/README.md similarity index 99% rename from notebooks/README.md rename to notebooks/openai_embeddings_v1/README.md index 3db7bf6..af78fdf 100644 --- a/notebooks/README.md +++ b/notebooks/openai_embeddings_v1/README.md @@ -4,7 +4,7 @@ -# Notebooks and Code +# Notebooks and Code (v1) This section provides a walk-through of the methods, with working code for reference. The process starts with three Python notebooks and follows up with Python scripts in a larger code repository. 
diff --git a/notebooks/analysis_single_results_file/bars_fourway_label.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/bars_fourway_label.png similarity index 100% rename from notebooks/analysis_single_results_file/bars_fourway_label.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/bars_fourway_label.png diff --git a/notebooks/analysis_single_results_file/bars_is_bom_astroturf.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_bom_astroturf.png similarity index 100% rename from notebooks/analysis_single_results_file/bars_is_bom_astroturf.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_bom_astroturf.png diff --git a/notebooks/analysis_single_results_file/bars_is_bot.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_bot.png similarity index 100% rename from notebooks/analysis_single_results_file/bars_is_bot.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_bot.png diff --git a/notebooks/analysis_single_results_file/bars_is_factual.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_factual.png similarity index 100% rename from notebooks/analysis_single_results_file/bars_is_factual.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_factual.png diff --git a/notebooks/analysis_single_results_file/bars_is_toxic.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_toxic.png similarity index 100% rename from notebooks/analysis_single_results_file/bars_is_toxic.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/bars_is_toxic.png diff --git a/notebooks/analysis_single_results_file/bars_opinion_community.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/bars_opinion_community.png similarity index 100% rename from notebooks/analysis_single_results_file/bars_opinion_community.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/bars_opinion_community.png diff --git a/notebooks/analysis_single_results_file/dumbbells_all.png b/notebooks/openai_embeddings_v1/analysis_single_results_file/dumbbells_all.png similarity index 100% rename from notebooks/analysis_single_results_file/dumbbells_all.png rename to notebooks/openai_embeddings_v1/analysis_single_results_file/dumbbells_all.png diff --git a/notebooks/analysis_single_results_file_v4.py b/notebooks/openai_embeddings_v1/analysis_single_results_file_v4.py similarity index 100% rename from notebooks/analysis_single_results_file_v4.py rename to notebooks/openai_embeddings_v1/analysis_single_results_file_v4.py diff --git a/notebooks/openai_embeddings_v2/Exporting_Embeddings_to_Drive_20240201_v3.ipynb b/notebooks/openai_embeddings_v2/Exporting_Embeddings_to_Drive_20240201_v3.ipynb new file mode 100644 index 0000000..4d3e7ce --- /dev/null +++ b/notebooks/openai_embeddings_v2/Exporting_Embeddings_to_Drive_20240201_v3.ipynb @@ -0,0 +1,3596 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "FF154lGK_1N6", + "EuDR5mjnq3fV" + ], + "toc_visible": true, + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "a9534258cd5d4015abe53b5cc42bea56": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ac0e74c8f58e49c6bb3fd4c1c1896f88", + "IPY_MODEL_adb6537bd87a414bb31ea9b94a5cea32", + "IPY_MODEL_81f254cc026e4d00a32635eb396c9ddf" + ], + "layout": "IPY_MODEL_039ed011d14044338c28dce8b1d5e4c4" + } + }, + "ac0e74c8f58e49c6bb3fd4c1c1896f88": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d0dc7020e8d14f468b3492c069634a45", + "placeholder": "​", + "style": "IPY_MODEL_322b984ee24e4027a9c2e08d862434b3", + "value": "Downloading: 100%" + } + }, + "adb6537bd87a414bb31ea9b94a5cea32": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8992d57e6b884e38bdcbe229b021cf1d", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_218cf7dbd973421e9bdcbfade62433b5", + "value": 1 + } + }, + "81f254cc026e4d00a32635eb396c9ddf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d33ad5423db544268200ce28b0736a0b", + "placeholder": "​", + "style": "IPY_MODEL_aa4ab83e43404f3181111e4fafb2d586", + "value": "" + } + }, + "039ed011d14044338c28dce8b1d5e4c4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0dc7020e8d14f468b3492c069634a45": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "322b984ee24e4027a9c2e08d862434b3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8992d57e6b884e38bdcbe229b021cf1d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "218cf7dbd973421e9bdcbfade62433b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d33ad5423db544268200ce28b0736a0b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa4ab83e43404f3181111e4fafb2d586": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "39f9a290d00b42a2b880aa81eec76fa6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_25dfedf9a54843bb80caaeed5ab6db34", + "IPY_MODEL_a7ff0760fb244fb7a4903b85c7bc7ad6", + "IPY_MODEL_56a8b786d34e4742a2450df0be1dbae8" + ], + "layout": "IPY_MODEL_d4b9ca844460431fb72287a7356486f6" + } + }, + "25dfedf9a54843bb80caaeed5ab6db34": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b7a4772df3c46cc9be56e1fe11a5e88", + "placeholder": "​", + "style": "IPY_MODEL_efdd95d3dd5b4cedba40aa01b6cb94b9", + "value": "Downloading: 100%" + } + }, + "a7ff0760fb244fb7a4903b85c7bc7ad6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cfd1c20bbcbd4c62bf430d7c7f4624e5", + "max": 183815, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e265dcb3e51c49d3ad32c8dbf73cdf30", + "value": 183815 + } + }, + "56a8b786d34e4742a2450df0be1dbae8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f238e606a590433b8db7da9c314b3f00", + "placeholder": "​", + "style": "IPY_MODEL_f65e9b709410414eae7a36a53527de65", + "value": "" + } + }, + "d4b9ca844460431fb72287a7356486f6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b7a4772df3c46cc9be56e1fe11a5e88": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "efdd95d3dd5b4cedba40aa01b6cb94b9": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cfd1c20bbcbd4c62bf430d7c7f4624e5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e265dcb3e51c49d3ad32c8dbf73cdf30": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f238e606a590433b8db7da9c314b3f00": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f65e9b709410414eae7a36a53527de65": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": 
"1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_-39w0IS18f-" + }, + "source": [ + "We fetched OpenAI embeddings and stored on BQ. Let's download a CSV file to drive for further analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ymoi-E5OjZD5" + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Google Drive" + ], + "metadata": { + "id": "FF154lGK_1N6" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from google.colab import drive\n", + "\n", + "drive.mount('/content/drive')\n", + "print(os.getcwd(), os.listdir(os.getcwd()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i_eMkJ5fpKDp", + "outputId": "8b5deedd-9b30-499d-a9d4-43cb37d83864" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "/content ['.config', 'drive', 'sample_data']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "LNuZpKWOGmFZ" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5OKjyFQ0owen", + "outputId": "82bef74a-d6e3-410a-d066-d4dd791c5e25" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/drive/MyDrive/Research/DS Research Shared 2024\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "# you might need to create a google drive SHORTCUT that has this same path\n", + "# ... 
or update the path to use your own google drive organization\n", + "#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'\n", + "#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'\n", + "DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'\n", + "\n", + "print(DIRPATH)\n", + "os.path.isdir(DIRPATH)" + ] + }, + { + "cell_type": "code", + "source": [ + "DATA_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"data\")\n", + "os.path.isdir(DATA_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jjkYs5KJ99LX", + "outputId": "667d345f-a72e-4631-d555-5deaa2b89277" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D7AHRh645FX3" + }, + "source": [ + "### BigQuery Service" + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import auth\n", + "\n", + "# asks you to login\n", + "auth.authenticate_user()" + ], + "metadata": { + "id": "rfJKRImngZAw" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "yOz8eD9JkA7-" + }, + "source": [ + "from google.cloud import bigquery\n", + "from pandas import DataFrame, read_gbq\n", + "\n", + "\n", + "PROJECT_ID = \"tweet-collector-py\"\n", + "\n", + "class BigQueryService():\n", + " def __init__(self, project_id=PROJECT_ID):\n", + " self.project_id = project_id\n", + " self.client = bigquery.Client(project=self.project_id)\n", + "\n", + " def execute_query(self, sql, verbose=True):\n", + " if verbose == True:\n", + " print(sql)\n", + " job = self.client.query(sql)\n", + " return job.result()\n", + "\n", + " #def query_to_df(self, sql, verbose=True):\n", + " # \"\"\"high-level wrapper to return a DataFrame\"\"\"\n", + " # results = self.execute_query(sql, verbose=verbose)\n", + " # return DataFrame([dict(row) for row in results])\n", + "\n", + " def query_to_df(self, sql, verbose=True):\n", + " \"\"\"high-level wrapper to return a DataFrame\"\"\"\n", + " if verbose == True:\n", + " print(sql)\n", + " # https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html#pandas-read-gbq\n", + " #return read_gbq(sql, project_id=self.project_id) # progress_bar_type=\"tqdm_notebook\"\n", + " #progress_bar_type=\"tqdm_notebook\"\n", + " return read_gbq(sql, project_id=self.project_id, progress_bar_type=\"tqdm_notebook\")\n", + "\n", + "\n" + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-qBZo9ezksZz", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "9129e2b5-6b1b-4866-c1b3-7c65f4e338e3" + }, + "source": [ + "bq = BigQueryService()\n", + "print(bq)" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "<__main__.BigQueryService object at 0x785284479510>\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"DATASETS:\")\n", + "datasets = list(bq.client.list_datasets())\n", + "for ds in datasets:\n", + " #print(\"...\", ds.project, ds.dataset_id)\n", + " print(\"...\", ds.reference)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RZU3pXwscjGG", + "outputId": "56343dd5-b460-4245-ea47-dae3d7e32422" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"DATASETS:\n", + "... tweet-collector-py.analysis_2021\n", + "... tweet-collector-py.analysis_2021_development\n", + "... tweet-collector-py.collection_2021\n", + "... tweet-collector-py.disinfo_2021_development\n", + "... tweet-collector-py.disinfo_2021_production\n", + "... tweet-collector-py.election_2020_analysis\n", + "... tweet-collector-py.election_2020_development\n", + "... tweet-collector-py.election_2020_production\n", + "... tweet-collector-py.f1_racing_2023_development\n", + "... tweet-collector-py.f1_racing_2023_production\n", + "... tweet-collector-py.impeachment_2021_development\n", + "... tweet-collector-py.impeachment_2021_production\n", + "... tweet-collector-py.impeachment_backup\n", + "... tweet-collector-py.impeachment_development\n", + "... tweet-collector-py.impeachment_production\n", + "... tweet-collector-py.impeachment_test\n", + "... tweet-collector-py.jan6_committee_development\n", + "... tweet-collector-py.jan6_committee_production\n", + "... tweet-collector-py.transition_2021_development\n", + "... tweet-collector-py.transition_2021_production\n", + "... tweet-collector-py.truth_2023_development\n", + "... tweet-collector-py.truth_2023_production\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Helper Functions" + ], + "metadata": { + "id": "P29oXQGyVQys" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Unpacking Embeddings" + ], + "metadata": { + "id": "ynQMv14-qE9O" + } + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "from pandas import DataFrame\n", + "\n", + "\n", + "def unpack(embeddings_str):\n", + " \"\"\"Takes a string value containing an array of OpenAI embeddings,\n", + " and returns a list of floats.\n", + " \"\"\"\n", + " if isinstance(embeddings_str, str):\n", + " return json.loads(embeddings_str)\n", + " else:\n", + " return embeddings_str\n", + "\n", + "\n", + "def unpacked(df, col_prefix=\"openai\"):\n", + " \"\"\"Takes a dataframe witha single column of OpenAI embeddings,\n", + " and unpacks them into their own separate columns,\n", + " and returns a modified version of the original dataframe,\n", + " with the original embeddings column replaced by the new unpacked columns\n", + " \"\"\"\n", + "\n", + " print(\"UNPACKING...\")\n", + " embeds = df[\"embeddings\"].apply(unpack)\n", + " print(type(embeds))\n", + "\n", + " print(\"RECONSTRUCTING...\")\n", + " embeds = DataFrame(embeds.values.tolist())\n", + " embeds.columns = [f\"{col_prefix}_{col}\" for col in embeds.columns]\n", + " embeds.index = df.index\n", + " print(embeds.shape)\n", + " #embeds.head()\n", + "\n", + " print(\"MERGING...\")\n", + " df_unpacked = df.merge(embeds, left_index=True, right_index=True)\n", + " df_unpacked.drop(columns=[\"embeddings\"], inplace=True)\n", + " print(df_unpacked.shape)\n", + " return df_unpacked\n", + "\n" + ], + "metadata": { + "id": "wgjELMJBVTaa" + }, + "execution_count": 30, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Embeddings" + ], + "metadata": { + "id": "OQulQbvbFpHo" + } + }, + { + "cell_type": "code", + "source": [ + "DATASET_ADDRESS = \"tweet-collector-py.impeachment_production\"" + ], + "metadata": { + "id": "tkyUzzQsCFRN" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sql = f\"\"\"\n", + " SELECT\n", + " count(distinct s.user_id) as user_count\n", + " ,count(distinct s.status_id) as status_count\n", + " FROM `{DATASET_ADDRESS}.botometer_sample` s\n", + " JOIN 
`{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings_v2` emb\n", + " ON s.status_id = emb.status_id\n", + "\"\"\"\n", + "bq.query_to_df(sql, verbose=False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 113, + "referenced_widgets": [ + "a9534258cd5d4015abe53b5cc42bea56", + "ac0e74c8f58e49c6bb3fd4c1c1896f88", + "adb6537bd87a414bb31ea9b94a5cea32", + "81f254cc026e4d00a32635eb396c9ddf", + "039ed011d14044338c28dce8b1d5e4c4", + "d0dc7020e8d14f468b3492c069634a45", + "322b984ee24e4027a9c2e08d862434b3", + "8992d57e6b884e38bdcbe229b021cf1d", + "218cf7dbd973421e9bdcbfade62433b5", + "d33ad5423db544268200ce28b0736a0b", + "aa4ab83e43404f3181111e4fafb2d586" + ] + }, + "id": "lVEKM1LiB4i2", + "outputId": "a5fd9b78-0b3c-431d-c02e-ed95da064244" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| |" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "a9534258cd5d4015abe53b5cc42bea56" + } + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_count status_count\n", + "0 7566 183727" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_countstatus_count
07566183727
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## User Embeddings" + ], + "metadata": { + "id": "TJUWWC48HcGk" + } + }, + { + "cell_type": "markdown", + "source": [ + "7566 users" + ], + "metadata": { + "id": "CGpJ-kDaHfi5" + } + }, + { + "cell_type": "code", + "source": [ + "sql = f\"\"\"\n", + " SELECT\n", + " u.user_id, u.created_on\n", + " --, u.screen_name_count, u.screen_names, split(u.screen_names, \",\")[0] as screen_name\n", + " ,u.status_count, u.rt_count\n", + " ,u.is_bot --, u.bot_rt_network\n", + " ,u.opinion_community --, u.avg_score_lr, avg_score_nb, avg_score_bert\n", + " , u.is_q --, u.q_status_count\n", + " --, u.follower_count, u.follower_count_b, u.follower_count_h\n", + " --, u.friend_count, u.friend_count_b, u.friend_count_h\n", + "\n", + " ,u.avg_toxicity --, u.avg_severe_toxicity, u.avg_insult, u.avg_obscene, u.avg_threat, u.avg_identity_hate\n", + " , u.avg_fact_score -- ,u.fact_scored_count\n", + "\n", + " ,u.bom_astroturf, u.bom_overall --, u.bom_cap --,u.bom_lookup_count\n", + " --,u.bom_fake_follower, u.bom_financial, u.bom_other, u.bom_self_declared, u.bom_spammer\n", + "\n", + " ,emb.embeddings\n", + "\n", + " FROM `{DATASET_ADDRESS}.user_details_v20240128_slim` u\n", + " JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_user_embeddings` emb\n", + " ON emb.user_id = u.user_id\n", + " -- LIMIT 10\n", + "\"\"\"\n", + "\n", + "users_df = bq.query_to_df(sql, verbose=False)\n", + "print(users_df.shape)" + ], + "metadata": { + "id": "KoCuPC6FHdoZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "users_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 313 + }, + "id": "yzm8dWf_Xm3N", + "outputId": "36d24c1b-0aae-48e8-f4cc-4a4e8c216fbf" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf \\\n", + "0 0 False 0.056113 1.983193 0.295 \n", + "1 0 False 0.456710 NaN 0.580 \n", + "2 0 False 0.069860 3.401786 0.970 \n", + "3 1 False 0.044264 2.304511 0.580 \n", + "4 0 False 0.049325 4.714286 0.355 \n", + "\n", + " bom_overall embeddings \n", + "0 0.190 [-0.018801862373948097, -0.007904230616986752,... \n", + "1 0.110 [-0.030551623553037643, -0.0053298575803637505... \n", + "2 0.970 [-0.007297390140593052, 0.0010276929242536426,... \n", + "3 0.750 [-0.01834747940301895, -0.007322159130126238, ... \n", + "4 0.225 [-0.024803657084703445, 0.007516898214817047, ... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturfbom_overallembeddings
034204362162015-08-13555540True0False0.0561131.9831930.2950.190[-0.018801862373948097, -0.007904230616986752,...
11081219582010-01-2422False0False0.456710NaN0.5800.110[-0.030551623553037643, -0.0053298575803637505...
230383086382015-02-23755665True0False0.0698603.4017860.9700.970[-0.007297390140593052, 0.0010276929242536426,...
33323965362011-07-09951951True1False0.0442642.3045110.5800.750[-0.01834747940301895, -0.007322159130126238, ...
49550825224798085122018-01-21570533True0False0.0493254.7142860.3550.225[-0.024803657084703445, 0.007516898214817047, ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Saving CSV to drive:" + ], + "metadata": { + "id": "1TYFGOn7Ow-P" + } + }, + { + "cell_type": "code", + "source": [ + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings.csv.gz\")\n", + "users_df.to_csv(csv_filepath, index=False, compression=\"gzip\")" + ], + "metadata": { + "id": "V5m_ZmDFHeLx" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### ... Unpacked" + ], + "metadata": { + "id": "D0A_V2nXWoIm" + } + }, + { + "cell_type": "code", + "source": [ + "users_df_unpacked = unpacked(users_df)\n", + "print(users_df.shape)\n", + "users_df_unpacked.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359 + }, + "id": "sucTnQwdW6vH", + "outputId": "cc8eac4c-8a73-4f6c-da63-5ab167763510" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "UNPACKING...\n", + "\n", + "RECONSTRUCTING...\n", + "(7566, 1536)\n", + "MERGING...\n", + "(7566, 1547)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n", + "0 0 False 0.056113 1.983193 0.295 ... \n", + "1 0 False 0.456710 NaN 0.580 ... \n", + "2 0 False 0.069860 3.401786 0.970 ... \n", + "3 1 False 0.044264 2.304511 0.580 ... \n", + "4 0 False 0.049325 4.714286 0.355 ... \n", + "\n", + " openai_1526 openai_1527 openai_1528 openai_1529 openai_1530 \\\n", + "0 -0.001867 -0.013167 0.020885 -0.022568 -0.033631 \n", + "1 0.017651 -0.009439 0.024375 -0.032553 -0.042185 \n", + "2 -0.026273 -0.008139 0.030285 -0.029902 -0.030887 \n", + "3 -0.005520 -0.005288 0.017071 -0.033637 -0.040202 \n", + "4 0.009959 0.004695 0.005555 -0.012851 -0.032229 \n", + "\n", + " openai_1531 openai_1532 openai_1533 openai_1534 openai_1535 \n", + "0 0.016153 0.024127 -0.017519 0.002636 -0.039838 \n", + "1 0.013782 0.011320 -0.014862 -0.010413 -0.020359 \n", + "2 0.022481 -0.005476 -0.016279 -0.010138 -0.021454 \n", + "3 0.041773 -0.009370 0.003352 0.009391 -0.042671 \n", + "4 0.031443 0.008163 -0.018501 -0.008724 -0.042027 \n", + "\n", + "[5 rows x 1547 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturf...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
034204362162015-08-13555540True0False0.0561131.9831930.295...-0.001867-0.0131670.020885-0.022568-0.0336310.0161530.024127-0.0175190.002636-0.039838
11081219582010-01-2422False0False0.456710NaN0.580...0.017651-0.0094390.024375-0.032553-0.0421850.0137820.011320-0.014862-0.010413-0.020359
230383086382015-02-23755665True0False0.0698603.4017860.970...-0.026273-0.0081390.030285-0.029902-0.0308870.022481-0.005476-0.016279-0.010138-0.021454
33323965362011-07-09951951True1False0.0442642.3045110.580...-0.005520-0.0052880.017071-0.033637-0.0402020.041773-0.0093700.0033520.009391-0.042671
49550825224798085122018-01-21570533True0False0.0493254.7142860.355...0.0099590.0046950.005555-0.012851-0.0322290.0314430.008163-0.018501-0.008724-0.042027
\n", + "

5 rows × 1547 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "source": [ + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\")\n", + "users_df_unpacked.to_csv(csv_filepath, index=False, compression=\"gzip\")" + ], + "metadata": { + "id": "6Ll2G8XpXa2O" + }, + "execution_count": 32, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4t48ewACjXQy" + }, + "source": [ + "## Tweet Embeddings" + ] + }, + { + "cell_type": "markdown", + "source": [ + "183K statuses" + ], + "metadata": { + "id": "5sJsvSTWCVVX" + } + }, + { + "cell_type": "markdown", + "source": [ + "Wow wow wow this is taking a long time (1hr +...) to stream the data down over the network..." + ], + "metadata": { + "id": "4gIJd0h_-rXO" + } + }, + { + "cell_type": "markdown", + "source": [ + "Re-doing with the statuses table v2, that has duplicate lookups removed (row per unique status)...\n", + "\n", + "Re-doing with statuses table v3, which has status texts as well..." + ], + "metadata": { + "id": "Ho6uZl7csvkf" + } + }, + { + "cell_type": "code", + "source": [ + "sql = f\"\"\"\n", + " SELECT user_id, status_id, status_text, created_at, embeds_length, embeddings\n", + " FROM `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings_v3`\n", + " -- LIMIT 10000\n", + "\"\"\"\n", + "\n", + "tweets_df = bq.query_to_df(sql, verbose=True)\n", + "print(tweets_df.shape)\n", + "tweets_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 692, + "referenced_widgets": [ + "39f9a290d00b42a2b880aa81eec76fa6", + "25dfedf9a54843bb80caaeed5ab6db34", + "a7ff0760fb244fb7a4903b85c7bc7ad6", + "56a8b786d34e4742a2450df0be1dbae8", + "d4b9ca844460431fb72287a7356486f6", + "6b7a4772df3c46cc9be56e1fe11a5e88", + "efdd95d3dd5b4cedba40aa01b6cb94b9", + "cfd1c20bbcbd4c62bf430d7c7f4624e5", + "e265dcb3e51c49d3ad32c8dbf73cdf30", + "f238e606a590433b8db7da9c314b3f00", + "f65e9b709410414eae7a36a53527de65" + ] + }, + "id": "VYBVlVBN9tIf", + "outputId": "9a24052f-7a26-4ba8-ceb1-7d74deabf42d" + }, + "execution_count": 33, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " SELECT user_id, status_id, status_text, created_at, embeds_length, embeddings\n", + " FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings_v3` \n", + " -- LIMIT 10000\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "39f9a290d00b42a2b880aa81eec76fa6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| |" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(183815, 6)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id status_id \\\n", + "0 897845802701377536 1221540755451392001 \n", + "1 935739601301458947 1223458629837295619 \n", + "2 571774622 1217445781663363072 \n", + "3 384679808 1223705594818748416 \n", + "4 701264221653217281 1218459840277729281 \n", + "\n", + " status_text \\\n", + "0 Doubt it..It appears they all have gone the wa... \n", + "1 RT @Wyn1745: Democrats are ‘setting the stage’... \n", + "2 RT @sarahdwire: I’m loathe to insert myself in... \n", + "3 RT @RepRatcliffe: We warned them...As Schiff a... \n", + "4 RT @chipfranklin: Because \"impeachment\" in the... 
\n", + "\n", + " created_at embeds_length \\\n", + "0 2020-01-26 21:09:45+00:00 1536 \n", + "1 2020-02-01 04:10:42+00:00 1536 \n", + "2 2020-01-15 13:57:48+00:00 1536 \n", + "3 2020-02-01 20:32:03+00:00 1536 \n", + "4 2020-01-18 09:07:18+00:00 1536 \n", + "\n", + " embeddings \n", + "0 [-0.020428381860256195, -0.006719687487930059,... \n", + "1 [-0.03668860346078873, -0.0074811591766774654,... \n", + "2 [-0.033381544053554535, -0.006886449176818132,... \n", + "3 [-0.008476617746055126, -0.007363526616245508,... \n", + "4 [-0.009453612379729748, 0.017376383766531944, ... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idstatus_idstatus_textcreated_atembeds_lengthembeddings
08978458027013775361221540755451392001Doubt it..It appears they all have gone the wa...2020-01-26 21:09:45+00:001536[-0.020428381860256195, -0.006719687487930059,...
19357396013014589471223458629837295619RT @Wyn1745: Democrats are ‘setting the stage’...2020-02-01 04:10:42+00:001536[-0.03668860346078873, -0.0074811591766774654,...
25717746221217445781663363072RT @sarahdwire: I’m loathe to insert myself in...2020-01-15 13:57:48+00:001536[-0.033381544053554535, -0.006886449176818132,...
33846798081223705594818748416RT @RepRatcliffe: We warned them...As Schiff a...2020-02-01 20:32:03+00:001536[-0.008476617746055126, -0.007363526616245508,...
47012642216532172811218459840277729281RT @chipfranklin: Because \"impeachment\" in the...2020-01-18 09:07:18+00:001536[-0.009453612379729748, 0.017376383766531944, ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 33 + } + ] + }, + { + "cell_type": "code", + "source": [ + "tweets_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 556 + }, + "id": "IuWH0hQ_rNVd", + "outputId": "662aaecc-17a2-4e96-e477-72d4b4cc3489" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id status_id \\\n", + "0 897845802701377536 1221540755451392001 \n", + "1 935739601301458947 1223458629837295619 \n", + "2 571774622 1217445781663363072 \n", + "3 384679808 1223705594818748416 \n", + "4 701264221653217281 1218459840277729281 \n", + "\n", + " status_text \\\n", + "0 Doubt it..It appears they all have gone the wa... \n", + "1 RT @Wyn1745: Democrats are ‘setting the stage’... \n", + "2 RT @sarahdwire: I’m loathe to insert myself in... \n", + "3 RT @RepRatcliffe: We warned them...As Schiff a... \n", + "4 RT @chipfranklin: Because \"impeachment\" in the... \n", + "\n", + " created_at embeds_length \\\n", + "0 2020-01-26 21:09:45+00:00 1536 \n", + "1 2020-02-01 04:10:42+00:00 1536 \n", + "2 2020-01-15 13:57:48+00:00 1536 \n", + "3 2020-02-01 20:32:03+00:00 1536 \n", + "4 2020-01-18 09:07:18+00:00 1536 \n", + "\n", + " embeddings \n", + "0 [-0.020428381860256195, -0.006719687487930059,... \n", + "1 [-0.03668860346078873, -0.0074811591766774654,... \n", + "2 [-0.033381544053554535, -0.006886449176818132,... \n", + "3 [-0.008476617746055126, -0.007363526616245508,... \n", + "4 [-0.009453612379729748, 0.017376383766531944, ... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idstatus_idstatus_textcreated_atembeds_lengthembeddings
08978458027013775361221540755451392001Doubt it..It appears they all have gone the wa...2020-01-26 21:09:45+00:001536[-0.020428381860256195, -0.006719687487930059,...
19357396013014589471223458629837295619RT @Wyn1745: Democrats are ‘setting the stage’...2020-02-01 04:10:42+00:001536[-0.03668860346078873, -0.0074811591766774654,...
25717746221217445781663363072RT @sarahdwire: I’m loathe to insert myself in...2020-01-15 13:57:48+00:001536[-0.033381544053554535, -0.006886449176818132,...
33846798081223705594818748416RT @RepRatcliffe: We warned them...As Schiff a...2020-02-01 20:32:03+00:001536[-0.008476617746055126, -0.007363526616245508,...
47012642216532172811218459840277729281RT @chipfranklin: Because \"impeachment\" in the...2020-01-18 09:07:18+00:001536[-0.009453612379729748, 0.017376383766531944, ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Saving CSV to drive:" + ], + "metadata": { + "id": "hkOZyyS2SbEt" + } + }, + { + "cell_type": "code", + "source": [ + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3.csv.gz\")\n", + "tweets_df.to_csv(csv_filepath, index=False, compression=\"gzip\")" + ], + "metadata": { + "id": "_QvWANEiSbE3" + }, + "execution_count": 36, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### ... Unpacked" + ], + "metadata": { + "id": "y-0275gvxwO4" + } + }, + { + "cell_type": "code", + "source": [ + "unpacked_tweets_df = unpacked(tweets_df)\n", + "unpacked_tweets_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 689 + }, + "id": "IY02mObwyH8-", + "outputId": "fa0d10c2-fb12-40ca-a6c2-f7f546ae8e38" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "UNPACKING...\n", + "\n", + "RECONSTRUCTING...\n", + "(183815, 1536)\n", + "MERGING...\n", + "(183815, 1541)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id status_id \\\n", + "0 897845802701377536 1221540755451392001 \n", + "1 935739601301458947 1223458629837295619 \n", + "2 571774622 1217445781663363072 \n", + "3 384679808 1223705594818748416 \n", + "4 701264221653217281 1218459840277729281 \n", + "\n", + " status_text \\\n", + "0 Doubt it..It appears they all have gone the wa... \n", + "1 RT @Wyn1745: Democrats are ‘setting the stage’... \n", + "2 RT @sarahdwire: I’m loathe to insert myself in... \n", + "3 RT @RepRatcliffe: We warned them...As Schiff a... \n", + "4 RT @chipfranklin: Because \"impeachment\" in the... \n", + "\n", + " created_at embeds_length openai_0 openai_1 openai_2 \\\n", + "0 2020-01-26 21:09:45+00:00 1536 -0.020428 -0.006720 0.007308 \n", + "1 2020-02-01 04:10:42+00:00 1536 -0.036689 -0.007481 0.007968 \n", + "2 2020-01-15 13:57:48+00:00 1536 -0.033382 -0.006886 -0.003244 \n", + "3 2020-02-01 20:32:03+00:00 1536 -0.008477 -0.007364 0.000919 \n", + "4 2020-01-18 09:07:18+00:00 1536 -0.009454 0.017376 0.007016 \n", + "\n", + " openai_3 openai_4 ... openai_1526 openai_1527 openai_1528 \\\n", + "0 -0.022157 -0.041841 ... 0.014616 0.004705 0.012661 \n", + "1 -0.006632 -0.022805 ... -0.001696 0.002522 0.020397 \n", + "2 -0.015834 0.000172 ... 0.001027 0.002464 0.002013 \n", + "3 -0.006435 0.008101 ... -0.028269 0.003193 0.015056 \n", + "4 -0.020075 -0.023674 ... -0.013590 0.015564 0.005130 \n", + "\n", + " openai_1529 openai_1530 openai_1531 openai_1532 openai_1533 \\\n", + "0 -0.020974 -0.003458 0.045166 0.029871 -0.021186 \n", + "1 -0.046374 -0.046611 0.021068 -0.000085 -0.003701 \n", + "2 -0.032766 -0.034265 0.006545 0.014804 0.003027 \n", + "3 -0.015333 -0.028137 0.032510 0.010327 -0.013621 \n", + "4 0.003077 -0.029167 0.015523 0.017914 -0.008789 \n", + "\n", + " openai_1534 openai_1535 \n", + "0 -0.003376 -0.024937 \n", + "1 -0.015370 -0.019213 \n", + "2 -0.001518 -0.030946 \n", + "3 -0.007686 -0.016216 \n", + "4 -0.019767 -0.042353 \n", + "\n", + "[5 rows x 1541 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idstatus_idstatus_textcreated_atembeds_lengthopenai_0openai_1openai_2openai_3openai_4...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
08978458027013775361221540755451392001Doubt it..It appears they all have gone the wa...2020-01-26 21:09:45+00:001536-0.020428-0.0067200.007308-0.022157-0.041841...0.0146160.0047050.012661-0.020974-0.0034580.0451660.029871-0.021186-0.003376-0.024937
19357396013014589471223458629837295619RT @Wyn1745: Democrats are ‘setting the stage’...2020-02-01 04:10:42+00:001536-0.036689-0.0074810.007968-0.006632-0.022805...-0.0016960.0025220.020397-0.046374-0.0466110.021068-0.000085-0.003701-0.015370-0.019213
25717746221217445781663363072RT @sarahdwire: I’m loathe to insert myself in...2020-01-15 13:57:48+00:001536-0.033382-0.006886-0.003244-0.0158340.000172...0.0010270.0024640.002013-0.032766-0.0342650.0065450.0148040.003027-0.001518-0.030946
33846798081223705594818748416RT @RepRatcliffe: We warned them...As Schiff a...2020-02-01 20:32:03+00:001536-0.008477-0.0073640.000919-0.0064350.008101...-0.0282690.0031930.015056-0.015333-0.0281370.0325100.010327-0.013621-0.007686-0.016216
47012642216532172811218459840277729281RT @chipfranklin: Because \"impeachment\" in the...2020-01-18 09:07:18+00:001536-0.0094540.0173760.007016-0.020075-0.023674...-0.0135900.0155640.0051300.003077-0.0291670.0155230.017914-0.008789-0.019767-0.042353
\n", + "

5 rows × 1541 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 37 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html\n", + "\n", + "pq_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip\")\n", + "unpacked_tweets_df.to_parquet(pq_filepath, compression=\"gzip\")" + ], + "metadata": { + "id": "2QaBlr5cYWIa" + }, + "execution_count": 39, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.csv.gz\")\n", + "unpacked_tweets_df.to_csv(csv_filepath, index=False, compression=\"gzip\")" + ], + "metadata": { + "id": "GWc253mgrSx2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#arrow_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.arrow\")\n", + "#df.to_feather(arrow_filepath)" + ], + "metadata": { + "id": "SjU2t7PJXyEC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Scratch Work" + ], + "metadata": { + "id": "EuDR5mjnq3fV" + } + }, + { + "cell_type": "code", + "source": [ + "##from pandas import concat\n", + "##\n", + "##limit = 1_000\n", + "##offset = 0\n", + "##\n", + "##all = DataFrame()\n", + "##\n", + "##while offset < 5_500:\n", + "## sql = f\"\"\"\n", + "## SELECT s.user_id, s.status_id, s.status_text, s.created_at, emb.embeddings\n", + "## FROM `{DATASET_ADDRESS}.botometer_sample` s\n", + "## JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings` emb\n", + "## ON s.status_id = emb.status_id\n", + "## LIMIT {int(limit)}\n", + "## OFFSET {int(offset)}\n", + "## \"\"\"\n", + "##\n", + "## batch = bq.query_to_df(sql, verbose=True)\n", + "## print(tweets_df.shape)\n", + "## if batch.empty:\n", + "## print(\"ALL DONE!\")\n", + "## break\n", + "##\n", + "## concat(all, batch)\n", + "## offset += limit\n", + "\n", + "" + ], + "metadata": { + "id": "B9bIY-wb-fHb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Compressed Table" + ], + "metadata": { + "id": "zDo0Yqm2ujxN" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://cloud.google.com/bigquery/docs/exporting-data#bigquery_extract_table_compressed-python" + ], + "metadata": { + "id": "P6ACmvi1ulKu" + } + }, + { + "cell_type": "code", + "source": [ + "# from google.cloud import bigquery\n", + "# client = bigquery.Client()\n", + "# bucket_name = 'my-bucket'\n", + "\n", + "#destination_uri = \"gs://{}/{}\".format(bucket_name, \"shakespeare.csv.gz\")\n", + "#dataset_ref = bigquery.DatasetReference(project, dataset_id)\n", + "#table_ref = dataset_ref.table(\"shakespeare\")\n", + "#job_config = bigquery.job.ExtractJobConfig()\n", + "#job_config.compression = bigquery.Compression.GZIP\n", + "#\n", + "#extract_job = client.extract_table(\n", + "# table_ref,\n", + "# destination_uri,\n", + "# # Location must match that of the source table.\n", + "# location=\"US\",\n", + "# job_config=job_config,\n", + "#) # API request\n", + "#extract_job.result() # Waits for job to complete." 
+ ], + "metadata": { + "id": "g7p7KRN7ulhz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# from google.cloud import bigquery\n", + "# client = bigquery.Client()\n", + "# bucket_name = 'my-bucket'\n", + "\n", + "\n", + "#from google.cloud import bigquery\n", + "#\n", + "#\n", + "##ds_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ADDRESS)\n", + "#DATASET_ID = \"impeachment_production\"\n", + "#ds_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ID)\n", + "#table_ref = ds_ref.table(\"botometer_sample_max_50_openai_status_embeddings_v3\")\n", + "#\n", + "#job_config = bigquery.job.ExtractJobConfig()\n", + "#job_config.compression = bigquery.Compression.GZIP\n", + "#\n", + "#BUCKET_NAME = \"impeachment-analysis-2020\"\n", + "##destination_uri = f\"gs://{BUCKET_NAME}/impeachment_production/botometer_sample_max_50_openai_status_embeddings_v4.csv.gz\"\n", + "##> too large to be exported to a single file. Specify a uri including a * to shard export. See 'Exporting data into one or more files' in https://cloud.google.com/bigquery/docs/exporting-data.\n", + "#destination_uri = f\"gs://{BUCKET_NAME}/impeachment_production/botometer_sample_max_50_openai_status_embeddings_v4_*.csv.gz\"\n", + "#\n", + "#client = bq.client\n", + "#extract_job = client.extract_table(\n", + "# table_ref,\n", + "# destination_uri,\n", + "# # Location must match that of the source table.\n", + "# location=\"US\",\n", + "# job_config=job_config,\n", + "#) # API request\n", + "#extract_job.result() # Waits for job to complete." + ], + "metadata": { + "id": "Iz1mp4C1ujG0" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/openai_embeddings_v2/Impeachment_2020_Embeddings_Analysis_Template_(20240129).ipynb b/notebooks/openai_embeddings_v2/Impeachment_2020_Embeddings_Analysis_Template_(20240129).ipynb new file mode 100644 index 0000000..1b4265b --- /dev/null +++ b/notebooks/openai_embeddings_v2/Impeachment_2020_Embeddings_Analysis_Template_(20240129).ipynb @@ -0,0 +1,1955 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "Ymoi-E5OjZD5" + ], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_-39w0IS18f-" + }, + "source": [ + "We fetched user-level and tweet-level OpenAI embeddings and stored on BQ, and copied the data to CSV files on Drive.\n", + "\n", + "This notebook provides an example of how to load those CSV files. Feel free to make a copy of this notebook and perform your own analyses." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ymoi-E5OjZD5" + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Google Drive" + ], + "metadata": { + "id": "FF154lGK_1N6" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from google.colab import drive\n", + "\n", + "drive.mount('/content/drive')\n", + "print(os.getcwd(), os.listdir(os.getcwd()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i_eMkJ5fpKDp", + "outputId": "b82c2891-d6b0-45ce-ff4b-dddf370e6716" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n", + "/content ['.config', 'drive', 'sample_data']\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5OKjyFQ0owen", + "outputId": "d148f498-8af6-4de7-90c1-1072a0309607" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/drive/MyDrive/Research/DS Research Shared 2024\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# you might need to create a google drive SHORTCUT that has this same path\n", + "# ... or update the path to use your own google drive organization\n", + "#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'\n", + "#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'\n", + "DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'\n", + "\n", + "print(DIRPATH)\n", + "os.path.isdir(DIRPATH)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "New project-based directory structure for 2024:\n", + "\n", + "https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link" + ], + "metadata": { + "id": "dNCNBPJkg9St" + } + }, + { + "cell_type": "code", + "source": [ + "DATA_DIRPATH = os.path.join(DIRPATH, \"projects\", \"Impeachment 2020 Embeddings\", \"data\")\n", + "os.path.isdir(DATA_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jjkYs5KJ99LX", + "outputId": "3ca78c5f-4fa0-4519-b126-e02403785ec9" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "os.listdir(DATA_DIRPATH)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "x9QGLQH_dUGV", + "outputId": "abc2ba42-9476-453f-fc95-70eea47f31e6" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['botometer_sample_max_50_openai_user_embeddings.csv.gz',\n", + " 'botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz',\n", + " 'botometer_sample_max_50_openai_status_embeddings_v3.csv.gz',\n", + " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip',\n", + " 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked.csv.gz']" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The \"unpacked\" versions have a column per embedding, and are generally easier to work with.\n", + "\n", + "The files we will be working with are:\n", + " + 
\"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\" and\n", + " + \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip\"." + ], + "metadata": { + "id": "JCNrEG7vhOKo" + } + }, + { + "cell_type": "markdown", + "source": [ + "## User Embeddings" + ], + "metadata": { + "id": "TJUWWC48HcGk" + } + }, + { + "cell_type": "markdown", + "source": [ + "7566 users" + ], + "metadata": { + "id": "CGpJ-kDaHfi5" + } + }, + { + "cell_type": "markdown", + "source": [ + "Loading CSV from drive:" + ], + "metadata": { + "id": "1TYFGOn7Ow-P" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "csv_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz\")\n", + "users_df = read_csv(csv_filepath, compression=\"gzip\")\n", + "print(users_df.shape)\n", + "print(users_df.columns)\n", + "users_df.head()" + ], + "metadata": { + "id": "V5m_ZmDFHeLx", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 416 + }, + "outputId": "ad620cd6-6ecb-408a-ec34-c75cf0718e8d" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1547)\n", + "Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',\n", + " 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',\n", + " 'bom_astroturf',\n", + " ...\n", + " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n", + " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n", + " 'openai_1534', 'openai_1535'],\n", + " dtype='object', length=1547)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n", + "0 0 False 0.056113 1.983193 0.295 ... \n", + "1 0 False 0.456710 NaN 0.580 ... \n", + "2 0 False 0.069860 3.401786 0.970 ... \n", + "3 1 False 0.044264 2.304511 0.580 ... \n", + "4 0 False 0.049325 4.714286 0.355 ... \n", + "\n", + " openai_1526 openai_1527 openai_1528 openai_1529 openai_1530 \\\n", + "0 -0.001867 -0.013167 0.020885 -0.022568 -0.033631 \n", + "1 0.017651 -0.009439 0.024375 -0.032553 -0.042185 \n", + "2 -0.026273 -0.008139 0.030285 -0.029902 -0.030887 \n", + "3 -0.005520 -0.005288 0.017071 -0.033637 -0.040202 \n", + "4 0.009959 0.004695 0.005555 -0.012851 -0.032229 \n", + "\n", + " openai_1531 openai_1532 openai_1533 openai_1534 openai_1535 \n", + "0 0.016153 0.024127 -0.017519 0.002636 -0.039838 \n", + "1 0.013782 0.011320 -0.014862 -0.010413 -0.020359 \n", + "2 0.022481 -0.005476 -0.016279 -0.010138 -0.021454 \n", + "3 0.041773 -0.009370 0.003352 0.009391 -0.042671 \n", + "4 0.031443 0.008163 -0.018501 -0.008724 -0.042027 \n", + "\n", + "[5 rows x 1547 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturf...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
034204362162015-08-13555540True0False0.0561131.9831930.295...-0.001867-0.0131670.020885-0.022568-0.0336310.0161530.024127-0.0175190.002636-0.039838
11081219582010-01-2422False0False0.456710NaN0.580...0.017651-0.0094390.024375-0.032553-0.0421850.0137820.011320-0.014862-0.010413-0.020359
230383086382015-02-23755665True0False0.0698603.4017860.970...-0.026273-0.0081390.030285-0.029902-0.0308870.022481-0.005476-0.016279-0.010138-0.021454
33323965362011-07-09951951True1False0.0442642.3045110.580...-0.005520-0.0052880.017071-0.033637-0.0402020.041773-0.0093700.0033520.009391-0.042671
49550825224798085122018-01-21570533True0False0.0493254.7142860.355...0.0099590.0046950.005555-0.012851-0.0322290.0314430.008163-0.018501-0.008724-0.042027
\n", + "

5 rows × 1547 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"user_id\"].nunique()" + ], + "metadata": { + "id": "nQGfxCyBHeIi", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e3d7456a-6c0d-4424-8d8d-64bca24c552f" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "7566" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"is_bot\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JIwbbnB71suN", + "outputId": "dad6b8ba-2ab5-49b2-a957-934272d76e84" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False 4466\n", + "True 3100\n", + "Name: is_bot, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"opinion_community\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Yi8Qlxi_1spO", + "outputId": "0589fbde-5029-41a4-a234-0fd47e3823a9" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 4891\n", + "1 2675\n", + "Name: opinion_community, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"avg_fact_score\"].info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dG4-L7nDeQC-", + "outputId": "cba8adc6-210e-419d-f766-677a66174714" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 7566 entries, 0 to 7565\n", + "Series name: avg_fact_score\n", + "Non-Null Count Dtype \n", + "-------------- ----- \n", + "3292 non-null float64\n", + "dtypes: float64(1)\n", + "memory usage: 59.2 KB\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "n-pHzzQyi-dT" + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "HRKFH1UTi-Yu" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "\n", + "from pandas import isnull\n", + "\n", + "def add_labels(users_df):\n", + " # APPLY SAME LABELS AS THE ORIGINAL SOURCE CODE\n", + " # https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51\n", + " # https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64\n", + "\n", + " # labels:\n", + " users_df[\"opinion_label\"] = users_df[\"opinion_community\"].map({0:\"Anti-Trump\", 1:\"Pro-Trump\"})\n", + " users_df[\"bot_label\"] = users_df[\"is_bot\"].map({True:\"Bot\", False:\"Human\"})\n", + " users_df[\"fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bot_label\"]\n", + "\n", + " # language toxicity scores (0 low - 1 high)\n", + " toxic_threshold = 0.1\n", + " users_df[\"is_toxic\"] = users_df[\"avg_toxicity\"] >= toxic_threshold\n", + " users_df[\"is_toxic\"] = users_df[\"is_toxic\"].map({True: 1, False :0 })\n", + " users_df[\"toxic_label\"] = users_df[\"is_toxic\"].map({1: \"Toxic\", 0 :\"Normal\" })\n", + "\n", + " # fact check / media quality scores (1 low - 5 high)\n", + " fact_threshold = 3.0\n", + " users_df[\"is_factual\"] = users_df[\"avg_fact_score\"].apply(lambda score: score if isnull(score) else score >= 
fact_threshold)\n", + "\n", + " # botometer binary and labels:\n", + " users_df[\"is_bom_overall\"] = users_df[\"bom_overall\"].round()\n", + " users_df[\"is_bom_astroturf\"] = users_df[\"bom_astroturf\"].round()\n", + " users_df[\"bom_overall_label\"] = users_df[\"is_bom_overall\"].map({1:\"Bot\", 0:\"Human\"})\n", + " users_df[\"bom_astroturf_label\"] = users_df[\"is_bom_astroturf\"].map({1:\"Bot\", 0:\"Human\"})\n", + " users_df[\"bom_overall_fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bom_overall_label\"]\n", + " users_df[\"bom_astroturf_fourway_label\"] = users_df[\"opinion_label\"] + \" \" + users_df[\"bom_astroturf_label\"]\n", + "\n", + " return users_df\n", + "\n", + "\n", + "users_df = add_labels(users_df)\n", + "print(users_df.shape)\n", + "print(users_df.columns.tolist())\n", + "users_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 309 + }, + "id": "jK9I2mpri_ER", + "outputId": "724101e9-f34c-4363-f680-57f71ba15bb7" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7566, 1559)\n", + "['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf', 'bom_overall', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58', 'openai_59', 'openai_60', 'openai_61', 'openai_62', 'openai_63', 'openai_64', 'openai_65', 'openai_66', 'openai_67', 'openai_68', 'openai_69', 'openai_70', 'openai_71', 'openai_72', 'openai_73', 'openai_74', 'openai_75', 'openai_76', 'openai_77', 'openai_78', 'openai_79', 'openai_80', 'openai_81', 'openai_82', 'openai_83', 'openai_84', 'openai_85', 'openai_86', 'openai_87', 'openai_88', 'openai_89', 'openai_90', 'openai_91', 'openai_92', 'openai_93', 'openai_94', 'openai_95', 'openai_96', 'openai_97', 'openai_98', 'openai_99', 'openai_100', 'openai_101', 'openai_102', 'openai_103', 'openai_104', 'openai_105', 'openai_106', 'openai_107', 'openai_108', 'openai_109', 'openai_110', 'openai_111', 'openai_112', 'openai_113', 'openai_114', 'openai_115', 'openai_116', 'openai_117', 'openai_118', 'openai_119', 'openai_120', 'openai_121', 'openai_122', 'openai_123', 'openai_124', 'openai_125', 'openai_126', 'openai_127', 'openai_128', 'openai_129', 'openai_130', 'openai_131', 'openai_132', 'openai_133', 'openai_134', 'openai_135', 'openai_136', 'openai_137', 'openai_138', 'openai_139', 'openai_140', 'openai_141', 'openai_142', 'openai_143', 'openai_144', 'openai_145', 'openai_146', 'openai_147', 'openai_148', 'openai_149', 'openai_150', 'openai_151', 'openai_152', 'openai_153', 'openai_154', 'openai_155', 'openai_156', 'openai_157', 'openai_158', 'openai_159', 'openai_160', 'openai_161', 'openai_162', 'openai_163', 'openai_164', 'openai_165', 'openai_166', 'openai_167', 
'openai_168', 'openai_169', 'openai_170', 'openai_171', 'openai_172', 'openai_173', 'openai_174', 'openai_175', 'openai_176', 'openai_177', 'openai_178', 'openai_179', 'openai_180', 'openai_181', 'openai_182', 'openai_183', 'openai_184', 'openai_185', 'openai_186', 'openai_187', 'openai_188', 'openai_189', 'openai_190', 'openai_191', 'openai_192', 'openai_193', 'openai_194', 'openai_195', 'openai_196', 'openai_197', 'openai_198', 'openai_199', 'openai_200', 'openai_201', 'openai_202', 'openai_203', 'openai_204', 'openai_205', 'openai_206', 'openai_207', 'openai_208', 'openai_209', 'openai_210', 'openai_211', 'openai_212', 'openai_213', 'openai_214', 'openai_215', 'openai_216', 'openai_217', 'openai_218', 'openai_219', 'openai_220', 'openai_221', 'openai_222', 'openai_223', 'openai_224', 'openai_225', 'openai_226', 'openai_227', 'openai_228', 'openai_229', 'openai_230', 'openai_231', 'openai_232', 'openai_233', 'openai_234', 'openai_235', 'openai_236', 'openai_237', 'openai_238', 'openai_239', 'openai_240', 'openai_241', 'openai_242', 'openai_243', 'openai_244', 'openai_245', 'openai_246', 'openai_247', 'openai_248', 'openai_249', 'openai_250', 'openai_251', 'openai_252', 'openai_253', 'openai_254', 'openai_255', 'openai_256', 'openai_257', 'openai_258', 'openai_259', 'openai_260', 'openai_261', 'openai_262', 'openai_263', 'openai_264', 'openai_265', 'openai_266', 'openai_267', 'openai_268', 'openai_269', 'openai_270', 'openai_271', 'openai_272', 'openai_273', 'openai_274', 'openai_275', 'openai_276', 'openai_277', 'openai_278', 'openai_279', 'openai_280', 'openai_281', 'openai_282', 'openai_283', 'openai_284', 'openai_285', 'openai_286', 'openai_287', 'openai_288', 'openai_289', 'openai_290', 'openai_291', 'openai_292', 'openai_293', 'openai_294', 'openai_295', 'openai_296', 'openai_297', 'openai_298', 'openai_299', 'openai_300', 'openai_301', 'openai_302', 'openai_303', 'openai_304', 'openai_305', 'openai_306', 'openai_307', 'openai_308', 'openai_309', 'openai_310', 'openai_311', 'openai_312', 'openai_313', 'openai_314', 'openai_315', 'openai_316', 'openai_317', 'openai_318', 'openai_319', 'openai_320', 'openai_321', 'openai_322', 'openai_323', 'openai_324', 'openai_325', 'openai_326', 'openai_327', 'openai_328', 'openai_329', 'openai_330', 'openai_331', 'openai_332', 'openai_333', 'openai_334', 'openai_335', 'openai_336', 'openai_337', 'openai_338', 'openai_339', 'openai_340', 'openai_341', 'openai_342', 'openai_343', 'openai_344', 'openai_345', 'openai_346', 'openai_347', 'openai_348', 'openai_349', 'openai_350', 'openai_351', 'openai_352', 'openai_353', 'openai_354', 'openai_355', 'openai_356', 'openai_357', 'openai_358', 'openai_359', 'openai_360', 'openai_361', 'openai_362', 'openai_363', 'openai_364', 'openai_365', 'openai_366', 'openai_367', 'openai_368', 'openai_369', 'openai_370', 'openai_371', 'openai_372', 'openai_373', 'openai_374', 'openai_375', 'openai_376', 'openai_377', 'openai_378', 'openai_379', 'openai_380', 'openai_381', 'openai_382', 'openai_383', 'openai_384', 'openai_385', 'openai_386', 'openai_387', 'openai_388', 'openai_389', 'openai_390', 'openai_391', 'openai_392', 'openai_393', 'openai_394', 'openai_395', 'openai_396', 'openai_397', 'openai_398', 'openai_399', 'openai_400', 'openai_401', 'openai_402', 'openai_403', 'openai_404', 'openai_405', 'openai_406', 'openai_407', 'openai_408', 'openai_409', 'openai_410', 'openai_411', 'openai_412', 'openai_413', 'openai_414', 'openai_415', 'openai_416', 'openai_417', 'openai_418', 'openai_419', 'openai_420', 
'openai_421', 'openai_422', 'openai_423', 'openai_424', 'openai_425', 'openai_426', 'openai_427', 'openai_428', 'openai_429', 'openai_430', 'openai_431', 'openai_432', 'openai_433', 'openai_434', 'openai_435', 'openai_436', 'openai_437', 'openai_438', 'openai_439', 'openai_440', 'openai_441', 'openai_442', 'openai_443', 'openai_444', 'openai_445', 'openai_446', 'openai_447', 'openai_448', 'openai_449', 'openai_450', 'openai_451', 'openai_452', 'openai_453', 'openai_454', 'openai_455', 'openai_456', 'openai_457', 'openai_458', 'openai_459', 'openai_460', 'openai_461', 'openai_462', 'openai_463', 'openai_464', 'openai_465', 'openai_466', 'openai_467', 'openai_468', 'openai_469', 'openai_470', 'openai_471', 'openai_472', 'openai_473', 'openai_474', 'openai_475', 'openai_476', 'openai_477', 'openai_478', 'openai_479', 'openai_480', 'openai_481', 'openai_482', 'openai_483', 'openai_484', 'openai_485', 'openai_486', 'openai_487', 'openai_488', 'openai_489', 'openai_490', 'openai_491', 'openai_492', 'openai_493', 'openai_494', 'openai_495', 'openai_496', 'openai_497', 'openai_498', 'openai_499', 'openai_500', 'openai_501', 'openai_502', 'openai_503', 'openai_504', 'openai_505', 'openai_506', 'openai_507', 'openai_508', 'openai_509', 'openai_510', 'openai_511', 'openai_512', 'openai_513', 'openai_514', 'openai_515', 'openai_516', 'openai_517', 'openai_518', 'openai_519', 'openai_520', 'openai_521', 'openai_522', 'openai_523', 'openai_524', 'openai_525', 'openai_526', 'openai_527', 'openai_528', 'openai_529', 'openai_530', 'openai_531', 'openai_532', 'openai_533', 'openai_534', 'openai_535', 'openai_536', 'openai_537', 'openai_538', 'openai_539', 'openai_540', 'openai_541', 'openai_542', 'openai_543', 'openai_544', 'openai_545', 'openai_546', 'openai_547', 'openai_548', 'openai_549', 'openai_550', 'openai_551', 'openai_552', 'openai_553', 'openai_554', 'openai_555', 'openai_556', 'openai_557', 'openai_558', 'openai_559', 'openai_560', 'openai_561', 'openai_562', 'openai_563', 'openai_564', 'openai_565', 'openai_566', 'openai_567', 'openai_568', 'openai_569', 'openai_570', 'openai_571', 'openai_572', 'openai_573', 'openai_574', 'openai_575', 'openai_576', 'openai_577', 'openai_578', 'openai_579', 'openai_580', 'openai_581', 'openai_582', 'openai_583', 'openai_584', 'openai_585', 'openai_586', 'openai_587', 'openai_588', 'openai_589', 'openai_590', 'openai_591', 'openai_592', 'openai_593', 'openai_594', 'openai_595', 'openai_596', 'openai_597', 'openai_598', 'openai_599', 'openai_600', 'openai_601', 'openai_602', 'openai_603', 'openai_604', 'openai_605', 'openai_606', 'openai_607', 'openai_608', 'openai_609', 'openai_610', 'openai_611', 'openai_612', 'openai_613', 'openai_614', 'openai_615', 'openai_616', 'openai_617', 'openai_618', 'openai_619', 'openai_620', 'openai_621', 'openai_622', 'openai_623', 'openai_624', 'openai_625', 'openai_626', 'openai_627', 'openai_628', 'openai_629', 'openai_630', 'openai_631', 'openai_632', 'openai_633', 'openai_634', 'openai_635', 'openai_636', 'openai_637', 'openai_638', 'openai_639', 'openai_640', 'openai_641', 'openai_642', 'openai_643', 'openai_644', 'openai_645', 'openai_646', 'openai_647', 'openai_648', 'openai_649', 'openai_650', 'openai_651', 'openai_652', 'openai_653', 'openai_654', 'openai_655', 'openai_656', 'openai_657', 'openai_658', 'openai_659', 'openai_660', 'openai_661', 'openai_662', 'openai_663', 'openai_664', 'openai_665', 'openai_666', 'openai_667', 'openai_668', 'openai_669', 'openai_670', 'openai_671', 'openai_672', 'openai_673', 
'openai_674', 'openai_675', 'openai_676', 'openai_677', 'openai_678', 'openai_679', 'openai_680', 'openai_681', 'openai_682', 'openai_683', 'openai_684', 'openai_685', 'openai_686', 'openai_687', 'openai_688', 'openai_689', 'openai_690', 'openai_691', 'openai_692', 'openai_693', 'openai_694', 'openai_695', 'openai_696', 'openai_697', 'openai_698', 'openai_699', 'openai_700', 'openai_701', 'openai_702', 'openai_703', 'openai_704', 'openai_705', 'openai_706', 'openai_707', 'openai_708', 'openai_709', 'openai_710', 'openai_711', 'openai_712', 'openai_713', 'openai_714', 'openai_715', 'openai_716', 'openai_717', 'openai_718', 'openai_719', 'openai_720', 'openai_721', 'openai_722', 'openai_723', 'openai_724', 'openai_725', 'openai_726', 'openai_727', 'openai_728', 'openai_729', 'openai_730', 'openai_731', 'openai_732', 'openai_733', 'openai_734', 'openai_735', 'openai_736', 'openai_737', 'openai_738', 'openai_739', 'openai_740', 'openai_741', 'openai_742', 'openai_743', 'openai_744', 'openai_745', 'openai_746', 'openai_747', 'openai_748', 'openai_749', 'openai_750', 'openai_751', 'openai_752', 'openai_753', 'openai_754', 'openai_755', 'openai_756', 'openai_757', 'openai_758', 'openai_759', 'openai_760', 'openai_761', 'openai_762', 'openai_763', 'openai_764', 'openai_765', 'openai_766', 'openai_767', 'openai_768', 'openai_769', 'openai_770', 'openai_771', 'openai_772', 'openai_773', 'openai_774', 'openai_775', 'openai_776', 'openai_777', 'openai_778', 'openai_779', 'openai_780', 'openai_781', 'openai_782', 'openai_783', 'openai_784', 'openai_785', 'openai_786', 'openai_787', 'openai_788', 'openai_789', 'openai_790', 'openai_791', 'openai_792', 'openai_793', 'openai_794', 'openai_795', 'openai_796', 'openai_797', 'openai_798', 'openai_799', 'openai_800', 'openai_801', 'openai_802', 'openai_803', 'openai_804', 'openai_805', 'openai_806', 'openai_807', 'openai_808', 'openai_809', 'openai_810', 'openai_811', 'openai_812', 'openai_813', 'openai_814', 'openai_815', 'openai_816', 'openai_817', 'openai_818', 'openai_819', 'openai_820', 'openai_821', 'openai_822', 'openai_823', 'openai_824', 'openai_825', 'openai_826', 'openai_827', 'openai_828', 'openai_829', 'openai_830', 'openai_831', 'openai_832', 'openai_833', 'openai_834', 'openai_835', 'openai_836', 'openai_837', 'openai_838', 'openai_839', 'openai_840', 'openai_841', 'openai_842', 'openai_843', 'openai_844', 'openai_845', 'openai_846', 'openai_847', 'openai_848', 'openai_849', 'openai_850', 'openai_851', 'openai_852', 'openai_853', 'openai_854', 'openai_855', 'openai_856', 'openai_857', 'openai_858', 'openai_859', 'openai_860', 'openai_861', 'openai_862', 'openai_863', 'openai_864', 'openai_865', 'openai_866', 'openai_867', 'openai_868', 'openai_869', 'openai_870', 'openai_871', 'openai_872', 'openai_873', 'openai_874', 'openai_875', 'openai_876', 'openai_877', 'openai_878', 'openai_879', 'openai_880', 'openai_881', 'openai_882', 'openai_883', 'openai_884', 'openai_885', 'openai_886', 'openai_887', 'openai_888', 'openai_889', 'openai_890', 'openai_891', 'openai_892', 'openai_893', 'openai_894', 'openai_895', 'openai_896', 'openai_897', 'openai_898', 'openai_899', 'openai_900', 'openai_901', 'openai_902', 'openai_903', 'openai_904', 'openai_905', 'openai_906', 'openai_907', 'openai_908', 'openai_909', 'openai_910', 'openai_911', 'openai_912', 'openai_913', 'openai_914', 'openai_915', 'openai_916', 'openai_917', 'openai_918', 'openai_919', 'openai_920', 'openai_921', 'openai_922', 'openai_923', 'openai_924', 'openai_925', 'openai_926', 
'openai_927', 'openai_928', 'openai_929', 'openai_930', 'openai_931', 'openai_932', 'openai_933', 'openai_934', 'openai_935', 'openai_936', 'openai_937', 'openai_938', 'openai_939', 'openai_940', 'openai_941', 'openai_942', 'openai_943', 'openai_944', 'openai_945', 'openai_946', 'openai_947', 'openai_948', 'openai_949', 'openai_950', 'openai_951', 'openai_952', 'openai_953', 'openai_954', 'openai_955', 'openai_956', 'openai_957', 'openai_958', 'openai_959', 'openai_960', 'openai_961', 'openai_962', 'openai_963', 'openai_964', 'openai_965', 'openai_966', 'openai_967', 'openai_968', 'openai_969', 'openai_970', 'openai_971', 'openai_972', 'openai_973', 'openai_974', 'openai_975', 'openai_976', 'openai_977', 'openai_978', 'openai_979', 'openai_980', 'openai_981', 'openai_982', 'openai_983', 'openai_984', 'openai_985', 'openai_986', 'openai_987', 'openai_988', 'openai_989', 'openai_990', 'openai_991', 'openai_992', 'openai_993', 'openai_994', 'openai_995', 'openai_996', 'openai_997', 'openai_998', 'openai_999', 'openai_1000', 'openai_1001', 'openai_1002', 'openai_1003', 'openai_1004', 'openai_1005', 'openai_1006', 'openai_1007', 'openai_1008', 'openai_1009', 'openai_1010', 'openai_1011', 'openai_1012', 'openai_1013', 'openai_1014', 'openai_1015', 'openai_1016', 'openai_1017', 'openai_1018', 'openai_1019', 'openai_1020', 'openai_1021', 'openai_1022', 'openai_1023', 'openai_1024', 'openai_1025', 'openai_1026', 'openai_1027', 'openai_1028', 'openai_1029', 'openai_1030', 'openai_1031', 'openai_1032', 'openai_1033', 'openai_1034', 'openai_1035', 'openai_1036', 'openai_1037', 'openai_1038', 'openai_1039', 'openai_1040', 'openai_1041', 'openai_1042', 'openai_1043', 'openai_1044', 'openai_1045', 'openai_1046', 'openai_1047', 'openai_1048', 'openai_1049', 'openai_1050', 'openai_1051', 'openai_1052', 'openai_1053', 'openai_1054', 'openai_1055', 'openai_1056', 'openai_1057', 'openai_1058', 'openai_1059', 'openai_1060', 'openai_1061', 'openai_1062', 'openai_1063', 'openai_1064', 'openai_1065', 'openai_1066', 'openai_1067', 'openai_1068', 'openai_1069', 'openai_1070', 'openai_1071', 'openai_1072', 'openai_1073', 'openai_1074', 'openai_1075', 'openai_1076', 'openai_1077', 'openai_1078', 'openai_1079', 'openai_1080', 'openai_1081', 'openai_1082', 'openai_1083', 'openai_1084', 'openai_1085', 'openai_1086', 'openai_1087', 'openai_1088', 'openai_1089', 'openai_1090', 'openai_1091', 'openai_1092', 'openai_1093', 'openai_1094', 'openai_1095', 'openai_1096', 'openai_1097', 'openai_1098', 'openai_1099', 'openai_1100', 'openai_1101', 'openai_1102', 'openai_1103', 'openai_1104', 'openai_1105', 'openai_1106', 'openai_1107', 'openai_1108', 'openai_1109', 'openai_1110', 'openai_1111', 'openai_1112', 'openai_1113', 'openai_1114', 'openai_1115', 'openai_1116', 'openai_1117', 'openai_1118', 'openai_1119', 'openai_1120', 'openai_1121', 'openai_1122', 'openai_1123', 'openai_1124', 'openai_1125', 'openai_1126', 'openai_1127', 'openai_1128', 'openai_1129', 'openai_1130', 'openai_1131', 'openai_1132', 'openai_1133', 'openai_1134', 'openai_1135', 'openai_1136', 'openai_1137', 'openai_1138', 'openai_1139', 'openai_1140', 'openai_1141', 'openai_1142', 'openai_1143', 'openai_1144', 'openai_1145', 'openai_1146', 'openai_1147', 'openai_1148', 'openai_1149', 'openai_1150', 'openai_1151', 'openai_1152', 'openai_1153', 'openai_1154', 'openai_1155', 'openai_1156', 'openai_1157', 'openai_1158', 'openai_1159', 'openai_1160', 'openai_1161', 'openai_1162', 'openai_1163', 'openai_1164', 'openai_1165', 'openai_1166', 'openai_1167', 
'openai_1168', 'openai_1169', 'openai_1170', 'openai_1171', 'openai_1172', 'openai_1173', 'openai_1174', 'openai_1175', 'openai_1176', 'openai_1177', 'openai_1178', 'openai_1179', 'openai_1180', 'openai_1181', 'openai_1182', 'openai_1183', 'openai_1184', 'openai_1185', 'openai_1186', 'openai_1187', 'openai_1188', 'openai_1189', 'openai_1190', 'openai_1191', 'openai_1192', 'openai_1193', 'openai_1194', 'openai_1195', 'openai_1196', 'openai_1197', 'openai_1198', 'openai_1199', 'openai_1200', 'openai_1201', 'openai_1202', 'openai_1203', 'openai_1204', 'openai_1205', 'openai_1206', 'openai_1207', 'openai_1208', 'openai_1209', 'openai_1210', 'openai_1211', 'openai_1212', 'openai_1213', 'openai_1214', 'openai_1215', 'openai_1216', 'openai_1217', 'openai_1218', 'openai_1219', 'openai_1220', 'openai_1221', 'openai_1222', 'openai_1223', 'openai_1224', 'openai_1225', 'openai_1226', 'openai_1227', 'openai_1228', 'openai_1229', 'openai_1230', 'openai_1231', 'openai_1232', 'openai_1233', 'openai_1234', 'openai_1235', 'openai_1236', 'openai_1237', 'openai_1238', 'openai_1239', 'openai_1240', 'openai_1241', 'openai_1242', 'openai_1243', 'openai_1244', 'openai_1245', 'openai_1246', 'openai_1247', 'openai_1248', 'openai_1249', 'openai_1250', 'openai_1251', 'openai_1252', 'openai_1253', 'openai_1254', 'openai_1255', 'openai_1256', 'openai_1257', 'openai_1258', 'openai_1259', 'openai_1260', 'openai_1261', 'openai_1262', 'openai_1263', 'openai_1264', 'openai_1265', 'openai_1266', 'openai_1267', 'openai_1268', 'openai_1269', 'openai_1270', 'openai_1271', 'openai_1272', 'openai_1273', 'openai_1274', 'openai_1275', 'openai_1276', 'openai_1277', 'openai_1278', 'openai_1279', 'openai_1280', 'openai_1281', 'openai_1282', 'openai_1283', 'openai_1284', 'openai_1285', 'openai_1286', 'openai_1287', 'openai_1288', 'openai_1289', 'openai_1290', 'openai_1291', 'openai_1292', 'openai_1293', 'openai_1294', 'openai_1295', 'openai_1296', 'openai_1297', 'openai_1298', 'openai_1299', 'openai_1300', 'openai_1301', 'openai_1302', 'openai_1303', 'openai_1304', 'openai_1305', 'openai_1306', 'openai_1307', 'openai_1308', 'openai_1309', 'openai_1310', 'openai_1311', 'openai_1312', 'openai_1313', 'openai_1314', 'openai_1315', 'openai_1316', 'openai_1317', 'openai_1318', 'openai_1319', 'openai_1320', 'openai_1321', 'openai_1322', 'openai_1323', 'openai_1324', 'openai_1325', 'openai_1326', 'openai_1327', 'openai_1328', 'openai_1329', 'openai_1330', 'openai_1331', 'openai_1332', 'openai_1333', 'openai_1334', 'openai_1335', 'openai_1336', 'openai_1337', 'openai_1338', 'openai_1339', 'openai_1340', 'openai_1341', 'openai_1342', 'openai_1343', 'openai_1344', 'openai_1345', 'openai_1346', 'openai_1347', 'openai_1348', 'openai_1349', 'openai_1350', 'openai_1351', 'openai_1352', 'openai_1353', 'openai_1354', 'openai_1355', 'openai_1356', 'openai_1357', 'openai_1358', 'openai_1359', 'openai_1360', 'openai_1361', 'openai_1362', 'openai_1363', 'openai_1364', 'openai_1365', 'openai_1366', 'openai_1367', 'openai_1368', 'openai_1369', 'openai_1370', 'openai_1371', 'openai_1372', 'openai_1373', 'openai_1374', 'openai_1375', 'openai_1376', 'openai_1377', 'openai_1378', 'openai_1379', 'openai_1380', 'openai_1381', 'openai_1382', 'openai_1383', 'openai_1384', 'openai_1385', 'openai_1386', 'openai_1387', 'openai_1388', 'openai_1389', 'openai_1390', 'openai_1391', 'openai_1392', 'openai_1393', 'openai_1394', 'openai_1395', 'openai_1396', 'openai_1397', 'openai_1398', 'openai_1399', 'openai_1400', 'openai_1401', 'openai_1402', 'openai_1403', 'openai_1404', 
'openai_1405', 'openai_1406', 'openai_1407', 'openai_1408', 'openai_1409', 'openai_1410', 'openai_1411', 'openai_1412', 'openai_1413', 'openai_1414', 'openai_1415', 'openai_1416', 'openai_1417', 'openai_1418', 'openai_1419', 'openai_1420', 'openai_1421', 'openai_1422', 'openai_1423', 'openai_1424', 'openai_1425', 'openai_1426', 'openai_1427', 'openai_1428', 'openai_1429', 'openai_1430', 'openai_1431', 'openai_1432', 'openai_1433', 'openai_1434', 'openai_1435', 'openai_1436', 'openai_1437', 'openai_1438', 'openai_1439', 'openai_1440', 'openai_1441', 'openai_1442', 'openai_1443', 'openai_1444', 'openai_1445', 'openai_1446', 'openai_1447', 'openai_1448', 'openai_1449', 'openai_1450', 'openai_1451', 'openai_1452', 'openai_1453', 'openai_1454', 'openai_1455', 'openai_1456', 'openai_1457', 'openai_1458', 'openai_1459', 'openai_1460', 'openai_1461', 'openai_1462', 'openai_1463', 'openai_1464', 'openai_1465', 'openai_1466', 'openai_1467', 'openai_1468', 'openai_1469', 'openai_1470', 'openai_1471', 'openai_1472', 'openai_1473', 'openai_1474', 'openai_1475', 'openai_1476', 'openai_1477', 'openai_1478', 'openai_1479', 'openai_1480', 'openai_1481', 'openai_1482', 'openai_1483', 'openai_1484', 'openai_1485', 'openai_1486', 'openai_1487', 'openai_1488', 'openai_1489', 'openai_1490', 'openai_1491', 'openai_1492', 'openai_1493', 'openai_1494', 'openai_1495', 'openai_1496', 'openai_1497', 'openai_1498', 'openai_1499', 'openai_1500', 'openai_1501', 'openai_1502', 'openai_1503', 'openai_1504', 'openai_1505', 'openai_1506', 'openai_1507', 'openai_1508', 'openai_1509', 'openai_1510', 'openai_1511', 'openai_1512', 'openai_1513', 'openai_1514', 'openai_1515', 'openai_1516', 'openai_1517', 'openai_1518', 'openai_1519', 'openai_1520', 'openai_1521', 'openai_1522', 'openai_1523', 'openai_1524', 'openai_1525', 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529', 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533', 'openai_1534', 'openai_1535', 'is_factual', 'opinion_label', 'bot_label', 'fourway_label', 'is_toxic', 'toxic_label', 'is_bom_overall', 'is_bom_astroturf', 'bom_overall_label', 'bom_astroturf_label', 'bom_overall_fourway_label', 'bom_astroturf_fourway_label']\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id created_on status_count rt_count is_bot \\\n", + "0 3420436216 2015-08-13 555 540 True \n", + "1 108121958 2010-01-24 2 2 False \n", + "2 3038308638 2015-02-23 755 665 True \n", + "3 332396536 2011-07-09 951 951 True \n", + "4 955082522479808512 2018-01-21 570 533 True \n", + "\n", + " opinion_community is_q avg_toxicity avg_fact_score bom_astroturf ... \\\n", + "0 0 False 0.056113 1.983193 0.295 ... \n", + "1 0 False 0.456710 NaN 0.580 ... \n", + "2 0 False 0.069860 3.401786 0.970 ... \n", + "3 1 False 0.044264 2.304511 0.580 ... \n", + "4 0 False 0.049325 4.714286 0.355 ... 
\n", + "\n", + " bot_label fourway_label is_toxic toxic_label is_bom_overall \\\n", + "0 Bot Anti-Trump Bot 0 Normal 0.0 \n", + "1 Human Anti-Trump Human 1 Toxic 0.0 \n", + "2 Bot Anti-Trump Bot 0 Normal 1.0 \n", + "3 Bot Pro-Trump Bot 0 Normal 1.0 \n", + "4 Bot Anti-Trump Bot 0 Normal 0.0 \n", + "\n", + " is_bom_astroturf bom_overall_label bom_astroturf_label \\\n", + "0 0.0 Human Human \n", + "1 1.0 Human Bot \n", + "2 1.0 Bot Bot \n", + "3 1.0 Bot Bot \n", + "4 0.0 Human Human \n", + "\n", + " bom_overall_fourway_label bom_astroturf_fourway_label \n", + "0 Anti-Trump Human Anti-Trump Human \n", + "1 Anti-Trump Human Anti-Trump Bot \n", + "2 Anti-Trump Bot Anti-Trump Bot \n", + "3 Pro-Trump Bot Pro-Trump Bot \n", + "4 Anti-Trump Human Anti-Trump Human \n", + "\n", + "[5 rows x 1559 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcreated_onstatus_countrt_countis_botopinion_communityis_qavg_toxicityavg_fact_scorebom_astroturf...bot_labelfourway_labelis_toxictoxic_labelis_bom_overallis_bom_astroturfbom_overall_labelbom_astroturf_labelbom_overall_fourway_labelbom_astroturf_fourway_label
034204362162015-08-13555540True0False0.0561131.9831930.295...BotAnti-Trump Bot0Normal0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
11081219582010-01-2422False0False0.456710NaN0.580...HumanAnti-Trump Human1Toxic0.01.0HumanBotAnti-Trump HumanAnti-Trump Bot
230383086382015-02-23755665True0False0.0698603.4017860.970...BotAnti-Trump Bot0Normal1.01.0BotBotAnti-Trump BotAnti-Trump Bot
33323965362011-07-09951951True1False0.0442642.3045110.580...BotPro-Trump Bot0Normal1.01.0BotBotPro-Trump BotPro-Trump Bot
49550825224798085122018-01-21570533True0False0.0493254.7142860.355...BotAnti-Trump Bot0Normal0.00.0HumanHumanAnti-Trump HumanAnti-Trump Human
\n", + "

5 rows × 1559 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 29 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"is_factual\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CU_qpBVcjFD4", + "outputId": "b7b3e9d9-73e3-4c5c-a775-2bf87ee4bd09" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "False 1696\n", + "True 1596\n", + "Name: is_factual, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"is_toxic\"].value_counts()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pbaa5rTJh5NY", + "outputId": "6d98e7a3-a734-44ef-8d48-eca658d42c95" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 6132\n", + "1 1434\n", + "Name: is_toxic, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"bot_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "T31nFSuniKdY", + "outputId": "0513619c-f8ac-402c-bef8-7880506b33dc" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Human 4466\n", + "Bot 3100\n", + "Name: bot_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"opinion_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4QX5FgjMk3E0", + "outputId": "c68929d4-2e82-4037-9b14-813efda2b105" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Anti-Trump 4891\n", + "Pro-Trump 2675\n", + "Name: opinion_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + }, + { + "cell_type": "code", + "source": [ + "users_df[\"fourway_label\"].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wKHKOfGplAv8", + "outputId": "7394ad50-5a86-4e98-cf4f-fb6cf42839e8" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Anti-Trump Human 3010\n", + "Anti-Trump Bot 1881\n", + "Pro-Trump Human 1456\n", + "Pro-Trump Bot 1219\n", + "Name: fourway_label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4t48ewACjXQy" + }, + "source": [ + "## Tweet Embeddings" + ] + }, + { + "cell_type": "markdown", + "source": [ + "183K statuses:" + ], + "metadata": { + "id": "5sJsvSTWCVVX" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_parquet\n", + "\n", + "pq_filepath = os.path.join(DATA_DIRPATH, \"botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip\")\n", + "statuses_df = read_parquet(pq_filepath)\n", + "print(statuses_df.shape)\n", + "print(statuses_df.columns)\n", + "statuses_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 728 + }, + "id": "wy-OIPg_eYX-", + "outputId": "dd50b10f-a81b-4217-d794-bb0bd9f14c53" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(183815, 1541)\n", + "Index(['user_id', 'status_id', 'status_text', 'created_at', 
'embeds_length',\n", + " 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',\n", + " ...\n", + " 'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',\n", + " 'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',\n", + " 'openai_1534', 'openai_1535'],\n", + " dtype='object', length=1541)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " user_id status_id \\\n", + "0 897845802701377536 1221540755451392001 \n", + "1 935739601301458947 1223458629837295619 \n", + "2 571774622 1217445781663363072 \n", + "3 384679808 1223705594818748416 \n", + "4 701264221653217281 1218459840277729281 \n", + "\n", + " status_text \\\n", + "0 Doubt it..It appears they all have gone the wa... \n", + "1 RT @Wyn1745: Democrats are ‘setting the stage’... \n", + "2 RT @sarahdwire: I’m loathe to insert myself in... \n", + "3 RT @RepRatcliffe: We warned them...As Schiff a... \n", + "4 RT @chipfranklin: Because \"impeachment\" in the... \n", + "\n", + " created_at embeds_length openai_0 openai_1 openai_2 \\\n", + "0 2020-01-26 21:09:45+00:00 1536 -0.020428 -0.006720 0.007308 \n", + "1 2020-02-01 04:10:42+00:00 1536 -0.036689 -0.007481 0.007968 \n", + "2 2020-01-15 13:57:48+00:00 1536 -0.033382 -0.006886 -0.003244 \n", + "3 2020-02-01 20:32:03+00:00 1536 -0.008477 -0.007364 0.000919 \n", + "4 2020-01-18 09:07:18+00:00 1536 -0.009454 0.017376 0.007016 \n", + "\n", + " openai_3 openai_4 ... openai_1526 openai_1527 openai_1528 \\\n", + "0 -0.022157 -0.041841 ... 0.014616 0.004705 0.012661 \n", + "1 -0.006632 -0.022805 ... -0.001696 0.002522 0.020397 \n", + "2 -0.015834 0.000172 ... 0.001027 0.002464 0.002013 \n", + "3 -0.006435 0.008101 ... -0.028269 0.003193 0.015056 \n", + "4 -0.020075 -0.023674 ... -0.013590 0.015564 0.005130 \n", + "\n", + " openai_1529 openai_1530 openai_1531 openai_1532 openai_1533 \\\n", + "0 -0.020974 -0.003458 0.045166 0.029871 -0.021186 \n", + "1 -0.046374 -0.046611 0.021068 -0.000085 -0.003701 \n", + "2 -0.032766 -0.034265 0.006545 0.014804 0.003027 \n", + "3 -0.015333 -0.028137 0.032510 0.010327 -0.013621 \n", + "4 0.003077 -0.029167 0.015523 0.017914 -0.008789 \n", + "\n", + " openai_1534 openai_1535 \n", + "0 -0.003376 -0.024937 \n", + "1 -0.015370 -0.019213 \n", + "2 -0.001518 -0.030946 \n", + "3 -0.007686 -0.016216 \n", + "4 -0.019767 -0.042353 \n", + "\n", + "[5 rows x 1541 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idstatus_idstatus_textcreated_atembeds_lengthopenai_0openai_1openai_2openai_3openai_4...openai_1526openai_1527openai_1528openai_1529openai_1530openai_1531openai_1532openai_1533openai_1534openai_1535
08978458027013775361221540755451392001Doubt it..It appears they all have gone the wa...2020-01-26 21:09:45+00:001536-0.020428-0.0067200.007308-0.022157-0.041841...0.0146160.0047050.012661-0.020974-0.0034580.0451660.029871-0.021186-0.003376-0.024937
19357396013014589471223458629837295619RT @Wyn1745: Democrats are ‘setting the stage’...2020-02-01 04:10:42+00:001536-0.036689-0.0074810.007968-0.006632-0.022805...-0.0016960.0025220.020397-0.046374-0.0466110.021068-0.000085-0.003701-0.015370-0.019213
25717746221217445781663363072RT @sarahdwire: I’m loathe to insert myself in...2020-01-15 13:57:48+00:001536-0.033382-0.006886-0.003244-0.0158340.000172...0.0010270.0024640.002013-0.032766-0.0342650.0065450.0148040.003027-0.001518-0.030946
33846798081223705594818748416RT @RepRatcliffe: We warned them...As Schiff a...2020-02-01 20:32:03+00:001536-0.008477-0.0073640.000919-0.0064350.008101...-0.0282690.0031930.015056-0.015333-0.0281370.0325100.010327-0.013621-0.007686-0.016216
47012642216532172811218459840277729281RT @chipfranklin: Because \"impeachment\" in the...2020-01-18 09:07:18+00:001536-0.0094540.0173760.007016-0.020075-0.023674...-0.0135900.0155640.0051300.003077-0.0291670.0155230.017914-0.008789-0.019767-0.042353
\n", + "

5 rows × 1541 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "statuses_df[\"user_id\"].nunique()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NGVktpyCkgJM", + "outputId": "8ec77e42-6b02-4c89-adb7-dfed2a6ded67" + }, + "execution_count": 70, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "7566" + ] + }, + "metadata": {}, + "execution_count": 70 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "TlotZx1R-fMZ" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/openai_embeddings_v2/README.md b/notebooks/openai_embeddings_v2/README.md new file mode 100644 index 0000000..f3877c9 --- /dev/null +++ b/notebooks/openai_embeddings_v2/README.md @@ -0,0 +1,13 @@ + + + + + + +# OpenAI Embeddings (v2) + +This supercedes earlier approach to fetch embeddings. In this second attempt we are grabbing user-level as well as tweet-level embeddings, to compare the difference in these approaches. + +The "Exporting Embeddings" notebook takes embeddings stored in BigQuery (see app/openai_embeddings_v2/README.md), and exports them to CSV / parquet files on Google Drive for easier and cheaper access + +The "Analysis Template" notebook provides an example of how to load the files from drive for further analysis. diff --git a/notebooks/openai_embeddings_v2/exporting_embeddings_to_drive_20240201_v3.py b/notebooks/openai_embeddings_v2/exporting_embeddings_to_drive_20240201_v3.py new file mode 100644 index 0000000..2f3adcd --- /dev/null +++ b/notebooks/openai_embeddings_v2/exporting_embeddings_to_drive_20240201_v3.py @@ -0,0 +1,314 @@ +# -*- coding: utf-8 -*- +"""Exporting Embeddings to Drive - 20240201 - v3 + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1tFWFj1yUgGxS-8WveeSnpEdrsgiG4-jh + +We fetched OpenAI embeddings and stored on BQ. Let's download a CSV file to drive for further analysis. + +## Setup + +### Google Drive +""" + +import os +from google.colab import drive + +drive.mount('/content/drive') +print(os.getcwd(), os.listdir(os.getcwd())) + + + +# you might need to create a google drive SHORTCUT that has this same path +# ... 
or update the path to use your own google drive organization +#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022' +#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023' +DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024' + +print(DIRPATH) +os.path.isdir(DIRPATH) + +DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data") +os.path.isdir(DATA_DIRPATH) + +"""### BigQuery Service""" + +from google.colab import auth + +# asks you to login +auth.authenticate_user() + +from google.cloud import bigquery +from pandas import DataFrame, read_gbq + + +PROJECT_ID = "tweet-collector-py" + +class BigQueryService(): + def __init__(self, project_id=PROJECT_ID): + self.project_id = project_id + self.client = bigquery.Client(project=self.project_id) + + def execute_query(self, sql, verbose=True): + if verbose == True: + print(sql) + job = self.client.query(sql) + return job.result() + + #def query_to_df(self, sql, verbose=True): + # """high-level wrapper to return a DataFrame""" + # results = self.execute_query(sql, verbose=verbose) + # return DataFrame([dict(row) for row in results]) + + def query_to_df(self, sql, verbose=True): + """high-level wrapper to return a DataFrame""" + if verbose == True: + print(sql) + # https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html#pandas-read-gbq + #return read_gbq(sql, project_id=self.project_id) # progress_bar_type="tqdm_notebook" + #progress_bar_type="tqdm_notebook" + return read_gbq(sql, project_id=self.project_id, progress_bar_type="tqdm_notebook") + +bq = BigQueryService() +print(bq) + +print("DATASETS:") +datasets = list(bq.client.list_datasets()) +for ds in datasets: + #print("...", ds.project, ds.dataset_id) + print("...", ds.reference) + +"""## Helper Functions + +### Unpacking Embeddings +""" + +import json +from pandas import DataFrame + + +def unpack(embeddings_str): + """Takes a string value containing an array of OpenAI embeddings, + and returns a list of floats. 
+ """ + if isinstance(embeddings_str, str): + return json.loads(embeddings_str) + else: + return embeddings_str + + +def unpacked(df, col_prefix="openai"): + """Takes a dataframe with a single column of OpenAI embeddings, + and unpacks them into their own separate columns, + and returns a modified version of the original dataframe, + with the original embeddings column replaced by the new unpacked columns. + """ + + print("UNPACKING...") + embeds = df["embeddings"].apply(unpack) + print(type(embeds)) + + print("RECONSTRUCTING...") + embeds = DataFrame(embeds.values.tolist()) + embeds.columns = [f"{col_prefix}_{col}" for col in embeds.columns] + embeds.index = df.index + print(embeds.shape) + #embeds.head() + + print("MERGING...") + df_unpacked = df.merge(embeds, left_index=True, right_index=True) + df_unpacked.drop(columns=["embeddings"], inplace=True) + print(df_unpacked.shape) + return df_unpacked + +"""# Embeddings""" + +DATASET_ADDRESS = "tweet-collector-py.impeachment_production" + +sql = f""" + SELECT + count(distinct s.user_id) as user_count + ,count(distinct s.status_id) as status_count + FROM `{DATASET_ADDRESS}.botometer_sample` s + JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings_v2` emb + ON s.status_id = emb.status_id +""" +bq.query_to_df(sql, verbose=False) + +"""## User Embeddings + +7566 users +""" + +sql = f""" + SELECT + u.user_id, u.created_on + --, u.screen_name_count, u.screen_names, split(u.screen_names, ",")[0] as screen_name + ,u.status_count, u.rt_count + ,u.is_bot --, u.bot_rt_network + ,u.opinion_community --, u.avg_score_lr, avg_score_nb, avg_score_bert + , u.is_q --, u.q_status_count + --, u.follower_count, u.follower_count_b, u.follower_count_h + --, u.friend_count, u.friend_count_b, u.friend_count_h + + ,u.avg_toxicity --, u.avg_severe_toxicity, u.avg_insult, u.avg_obscene, u.avg_threat, u.avg_identity_hate + , u.avg_fact_score -- ,u.fact_scored_count + + ,u.bom_astroturf, u.bom_overall --, u.bom_cap --,u.bom_lookup_count + --,u.bom_fake_follower, u.bom_financial, u.bom_other, u.bom_self_declared, u.bom_spammer + + ,emb.embeddings + + FROM `{DATASET_ADDRESS}.user_details_v20240128_slim` u + JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_user_embeddings` emb + ON emb.user_id = u.user_id + -- LIMIT 10 +""" + +users_df = bq.query_to_df(sql, verbose=False) +print(users_df.shape) + +users_df.head() + +"""Saving CSV to drive:""" + +csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings.csv.gz") +users_df.to_csv(csv_filepath, index=False, compression="gzip") + +"""### ... Unpacked""" + +users_df_unpacked = unpacked(users_df) +print(users_df_unpacked.shape) +users_df_unpacked.head() + +csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz") +users_df_unpacked.to_csv(csv_filepath, index=False, compression="gzip") + +"""## Tweet Embeddings + +183K statuses + +Wow wow wow this is taking a long time (1hr +...) to stream the data down over the network... + +Re-doing with the statuses table v2, which has duplicate lookups removed (row per unique status)... + +Re-doing with statuses table v3, which has status texts as well...
+""" + +sql = f""" + SELECT user_id, status_id, status_text, created_at, embeds_length, embeddings + FROM `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings_v3` + -- LIMIT 10000 +""" + +tweets_df = bq.query_to_df(sql, verbose=True) +print(tweets_df.shape) +tweets_df.head() + +tweets_df.head() + +"""Saving CSV to drive:""" + +csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3.csv.gz") +tweets_df.to_csv(csv_filepath, index=False, compression="gzip") + +"""### ... Unpacked""" + +unpacked_tweets_df = unpacked(tweets_df) +unpacked_tweets_df.head() + +# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html + +pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip") +unpacked_tweets_df.to_parquet(pq_filepath, compression="gzip") + +csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.csv.gz") +unpacked_tweets_df.to_csv(csv_filepath, index=False, compression="gzip") + +#arrow_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.arrow") +#df.to_feather(arrow_filepath) + +"""## Scratch Work""" + +##from pandas import concat +## +##limit = 1_000 +##offset = 0 +## +##all = DataFrame() +## +##while offset < 5_500: +## sql = f""" +## SELECT s.user_id, s.status_id, s.status_text, s.created_at, emb.embeddings +## FROM `{DATASET_ADDRESS}.botometer_sample` s +## JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings` emb +## ON s.status_id = emb.status_id +## LIMIT {int(limit)} +## OFFSET {int(offset)} +## """ +## +## batch = bq.query_to_df(sql, verbose=True) +## print(tweets_df.shape) +## if batch.empty: +## print("ALL DONE!") +## break +## +## concat(all, batch) +## offset += limit + +"""### Compressed Table + +https://cloud.google.com/bigquery/docs/exporting-data#bigquery_extract_table_compressed-python +""" + +# from google.cloud import bigquery +# client = bigquery.Client() +# bucket_name = 'my-bucket' + +#destination_uri = "gs://{}/{}".format(bucket_name, "shakespeare.csv.gz") +#dataset_ref = bigquery.DatasetReference(project, dataset_id) +#table_ref = dataset_ref.table("shakespeare") +#job_config = bigquery.job.ExtractJobConfig() +#job_config.compression = bigquery.Compression.GZIP +# +#extract_job = client.extract_table( +# table_ref, +# destination_uri, +# # Location must match that of the source table. +# location="US", +# job_config=job_config, +#) # API request +#extract_job.result() # Waits for job to complete. + +# from google.cloud import bigquery +# client = bigquery.Client() +# bucket_name = 'my-bucket' + + +#from google.cloud import bigquery +# +# +##ds_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ADDRESS) +#DATASET_ID = "impeachment_production" +#ds_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ID) +#table_ref = ds_ref.table("botometer_sample_max_50_openai_status_embeddings_v3") +# +#job_config = bigquery.job.ExtractJobConfig() +#job_config.compression = bigquery.Compression.GZIP +# +#BUCKET_NAME = "impeachment-analysis-2020" +##destination_uri = f"gs://{BUCKET_NAME}/impeachment_production/botometer_sample_max_50_openai_status_embeddings_v4.csv.gz" +##> too large to be exported to a single file. Specify a uri including a * to shard export. See 'Exporting data into one or more files' in https://cloud.google.com/bigquery/docs/exporting-data. 
+#destination_uri = f"gs://{BUCKET_NAME}/impeachment_production/botometer_sample_max_50_openai_status_embeddings_v4_*.csv.gz" +# +#client = bq.client +#extract_job = client.extract_table( +# table_ref, +# destination_uri, +# # Location must match that of the source table. +# location="US", +# job_config=job_config, +#) # API request +#extract_job.result() # Waits for job to complete. \ No newline at end of file diff --git a/notebooks/openai_embeddings_v2/impeachment_2020_embeddings_analysis_template_(20240129).py b/notebooks/openai_embeddings_v2/impeachment_2020_embeddings_analysis_template_(20240129).py new file mode 100644 index 0000000..726c12d --- /dev/null +++ b/notebooks/openai_embeddings_v2/impeachment_2020_embeddings_analysis_template_(20240129).py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +"""Impeachment 2020 Embeddings Analysis Template (20240129) + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1dAlLxG-SbQNzBVLyD84a9x_6xlBUPQjQ + +We fetched user-level and tweet-level OpenAI embeddings and stored on BQ, and copied the data to CSV files on Drive. + +This notebook provides an example of how to load those CSV files. Feel free to make a copy of this notebook and perform your own analyses. + +## Setup + +### Google Drive +""" + +import os +from google.colab import drive + +drive.mount('/content/drive') +print(os.getcwd(), os.listdir(os.getcwd())) + +# you might need to create a google drive SHORTCUT that has this same path +# ... or update the path to use your own google drive organization +#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022' +#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023' +DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024' + +print(DIRPATH) +os.path.isdir(DIRPATH) + +"""New project-based directory structure for 2024: + +https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link +""" + +DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data") +os.path.isdir(DATA_DIRPATH) + +os.listdir(DATA_DIRPATH) + +"""The "unpacked" versions have a column per embedding, and are generally easier to work with. + +The files we will be working with are: + + "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz" and + + "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip". 
+ +## User Embeddings + +7566 users + +Loading CSV from drive: +""" + +from pandas import read_csv + +csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz") +users_df = read_csv(csv_filepath, compression="gzip") +print(users_df.shape) +print(users_df.columns) +users_df.head() + +users_df["user_id"].nunique() + +users_df["is_bot"].value_counts() + +users_df["opinion_community"].value_counts() + +users_df["avg_fact_score"].info() + +from pandas import isnull + +def add_labels(users_df): + # APPLY SAME LABELS AS THE ORIGINAL SOURCE CODE + # https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51 + # https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64 + + # labels: + users_df["opinion_label"] = users_df["opinion_community"].map({0:"Anti-Trump", 1:"Pro-Trump"}) + users_df["bot_label"] = users_df["is_bot"].map({True:"Bot", False:"Human"}) + users_df["fourway_label"] = users_df["opinion_label"] + " " + users_df["bot_label"] + + # language toxicity scores (0 low - 1 high) + toxic_threshold = 0.1 + users_df["is_toxic"] = users_df["avg_toxicity"] >= toxic_threshold + users_df["is_toxic"] = users_df["is_toxic"].map({True: 1, False :0 }) + users_df["toxic_label"] = users_df["is_toxic"].map({1: "Toxic", 0 :"Normal" }) + + # fact check / media quality scores (1 low - 5 high) + fact_threshold = 3.0 + users_df["is_factual"] = users_df["avg_fact_score"].apply(lambda score: score if isnull(score) else score >= fact_threshold) + + # botometer binary and labels: + users_df["is_bom_overall"] = users_df["bom_overall"].round() + users_df["is_bom_astroturf"] = users_df["bom_astroturf"].round() + users_df["bom_overall_label"] = users_df["is_bom_overall"].map({1:"Bot", 0:"Human"}) + users_df["bom_astroturf_label"] = users_df["is_bom_astroturf"].map({1:"Bot", 0:"Human"}) + users_df["bom_overall_fourway_label"] = users_df["opinion_label"] + " " + users_df["bom_overall_label"] + users_df["bom_astroturf_fourway_label"] = users_df["opinion_label"] + " " + users_df["bom_astroturf_label"] + + return users_df + + +users_df = add_labels(users_df) +print(users_df.shape) +print(users_df.columns.tolist()) +users_df.head() + +users_df["is_factual"].value_counts() + +users_df["is_toxic"].value_counts() + +users_df["bot_label"].value_counts() + +users_df["opinion_label"].value_counts() + +users_df["fourway_label"].value_counts() + +"""## Tweet Embeddings + +183K statuses: +""" + +from pandas import read_parquet + +pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip") +statuses_df = read_parquet(pq_filepath) +print(statuses_df.shape) +print(statuses_df.columns) +statuses_df.head() + +statuses_df["user_id"].nunique() + diff --git a/requirements.txt b/requirements.txt index f027615..93a1a81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,6 +34,8 @@ gensim # model storage: google-cloud-storage +# data storage: +google-cloud-bigquery #==3.2.0 # automated tests: pytest
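
Looking ahead, here is a minimal, illustrative sketch of the kind of comparison these exports enable: fitting the same simple classifier on the user-level embeddings and on the tweet-level embeddings. It assumes `users_df` and `statuses_df` have been loaded from the unpacked files as in the Analysis Template above and that scikit-learn is installed; the `is_bot` target, the 20,000-tweet subsample, the logistic regression model, and the variable names are arbitrary choices for illustration, not something prescribed by the notebooks themselves.

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score

EMBED_COLS = [f"openai_{i}" for i in range(1536)]  # openai_0 ... openai_1535

# 1) User-level embeddings: one row (and one embedding) per user.
x_train, x_test, y_train, y_test = train_test_split(
    users_df[EMBED_COLS], users_df["is_bot"], test_size=0.2, random_state=99
)
user_model = LogisticRegression(max_iter=1_000)
user_model.fit(x_train, y_train)
print("USER-LEVEL ACCURACY:", accuracy_score(y_test, user_model.predict(x_test)))

# 2) Tweet-level embeddings: label each tweet with its author's user-level label.
#    (If the merge comes back empty, align the user_id dtypes first, e.g. astype(str).)
tweets_labeled = statuses_df.merge(users_df[["user_id", "is_bot"]], on="user_id", how="inner")
tweets_labeled = tweets_labeled.sample(20_000, random_state=99)  # subsample for a quick pass

# Split by user_id so the same account never lands in both train and test.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=99)
train_idx, test_idx = next(splitter.split(tweets_labeled, groups=tweets_labeled["user_id"]))
train, test = tweets_labeled.iloc[train_idx], tweets_labeled.iloc[test_idx]

tweet_model = LogisticRegression(max_iter=1_000)
tweet_model.fit(train[EMBED_COLS], train["is_bot"])
print("TWEET-LEVEL ACCURACY:", accuracy_score(test["is_bot"], tweet_model.predict(test[EMBED_COLS])))
```

Grouping the tweet-level split by `user_id` keeps an account's tweets from landing on both sides of the train/test boundary, which would otherwise inflate the tweet-level score.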