Commit

Merge pull request #121 from jacobp24/scripts_changes
Making scripts pylint compliant
jacobp24 authored Mar 13, 2024
2 parents f9c4244 + 7fc6683 commit fac2957
Showing 3 changed files with 46 additions and 44 deletions.
58 changes: 28 additions & 30 deletions scripts/Embeddings.py
@@ -1,79 +1,78 @@
# A general exception is needed here to deal with broad API errors.
# pylint: disable=W0718,W0621
"""
This script processes a dataset of book summaries to generate and utilize embeddings
for semantic analysis, leveraging the voyageai API for embedding generation.
"""

# Import standard libraries
import os
import time

# Import third-party libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import voyageai
from voyageai import Client as VoyageClient

# Download the 'punkt' tokenizer model
nltk.download('punkt')

# Load the dataset
DATA_PATH = "C:/Users/stlp/Desktop/Geeky/Software/bookworm_local/attempt_1/complete_w_ratings.csv"
DATA_PATH = "complete_w_ratings.csv"
df = pd.read_csv(DATA_PATH)

# Load the instance of the model
client = voyageai.Client(api_key="")
# Initialize the voyageai Client
voyage_client = VoyageClient(api_key="")

def token_count(summary):
def token_count(summary, client):
"""
Counts the number of tokens in a summary using the voyageai Client.
Parameters:
- summary: The text summary to count tokens in.
- client: The voyageai client instance.
Returns:
- The token count.
"""
return client.count_tokens([summary])

# Data cleaning and preprocessing
df['token_count'] = df['summary'].apply(token_count)
# Apply token counting
df['token_count'] = df['summary'].apply(lambda x: token_count(x, voyage_client))
filtered_df = df[df['token_count'] <= 4000]
filtered_df.drop(columns=['token_count'], inplace=True)

# Prepare texts for embedding generation
texts = filtered_df['summary'].tolist()
text_summaries = filtered_df['summary'].tolist()

def generate_embeddings(texts, batch_size=24):
def generate_embeddings(texts, client, batch_size=24):
"""
Generates embeddings for a list of texts in batches.
Parameters:
- texts: A list of text summaries.
- client: The voyageai client instance.
- batch_size: The size of each batch for processing.
Returns:
- A list of embeddings.
"""
client = voyageai.Client(api_key="")
embeddings = []
all_embeddings = []
progress_count = 0

for i in range(0, len(texts), batch_size):
batch_texts = texts[i:i + batch_size]
try:
batch_embeddings = client.embed(batch_texts, model="voyage-lite-02-instruct", input_type="document").embeddings
embeddings.extend(batch_embeddings)
except Exception:
process_individual_texts(batch_texts, embeddings, client)
batch_result = client.embed(batch_texts, model="voyage-lite-02-instruct",
input_type="document")
batch_embeddings = batch_result.embeddings
all_embeddings.extend(batch_embeddings)
except Exception: # Use of a general exception to deal with broad API Errors.
process_individual_texts(batch_texts, all_embeddings, client)

# Progress update
progress = (progress_count / (len(texts) / batch_size)) * 100
print(f"\rProgress: {progress:.3f}%", end='')
print(f"\rProgress: {progress:.2f}%", end='')
progress_count += 1

print("\nDone!")
return embeddings
return all_embeddings

def process_individual_texts(batch_texts, embeddings, client):
"""
@@ -84,17 +83,16 @@ def process_individual_texts(batch_texts, embeddings, client):
- embeddings: The list to append embeddings to.
- client: The voyageai client instance.
"""
batch_embeddings = []
for text in batch_texts:
try:
embedding = client.embed([text], model="voyage-lite-02-instruct", input_type="document").embeddings
batch_embeddings.extend(embedding)
except Exception:
batch_embeddings.append(None)
embeddings.extend(batch_embeddings)
result = client.embed([text], model="voyage-lite-02-instruct",
input_type="document")
embeddings.extend(result.embeddings)
except Exception: # Use of a general exception to deal with broad API Errors
embeddings.append(None)

# Generate embeddings
embeddings = generate_embeddings(texts)
embeddings = generate_embeddings(text_summaries, voyage_client)
filtered_df["embeddings"] = embeddings

# Save the processed dataframe
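For context, here is a minimal usage sketch of how the refactored helpers are meant to be driven. It assumes a valid Voyage API key in a `VOYAGE_API_KEY` environment variable and a local `complete_w_ratings.csv` with a `summary` column (neither is supplied by this commit); the model name `voyage-lite-02-instruct` and the 4000-token cutoff are taken from the diff above.

```python
# Usage sketch only; assumes VOYAGE_API_KEY is set and complete_w_ratings.csv
# (with a "summary" column) is present in the working directory.
import os

import pandas as pd
from voyageai import Client as VoyageClient

voyage_client = VoyageClient(api_key=os.environ["VOYAGE_API_KEY"])
df = pd.read_csv("complete_w_ratings.csv")

# Count tokens per summary and drop anything over the script's 4000-token cutoff.
df["token_count"] = df["summary"].apply(lambda s: voyage_client.count_tokens([s]))
short_df = df[df["token_count"] <= 4000]

# Embed one batch of summaries with the same model the script uses.
batch = short_df["summary"].head(24).tolist()
result = voyage_client.embed(batch, model="voyage-lite-02-instruct", input_type="document")
print(f"{len(result.embeddings)} embeddings of dimension {len(result.embeddings[0])}")
```

Passing the client into `token_count` and `generate_embeddings`, rather than re-creating it inside `generate_embeddings`, is one of the changes this commit makes.
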
14 changes: 9 additions & 5 deletions scripts/README.md
@@ -30,9 +30,9 @@ Before you begin, ensure you have met the following requirements:

## Files Description

- `Embeddings.py`: This script processes a dataset of book summaries to generate embeddings using the voyageai API. It includes data cleaning, token counting, and embedding generation.
- `embeddings.py`: This script processes a dataset of book summaries to generate embeddings using the voyageai API. It includes data cleaning, token counting, and embedding generation.

- `Semantic Scores.py`: After generating embeddings, this script loads them and uses the k-Nearest Neighbors algorithm to find and analyze the closest summaries based on their semantic similarity.
- `semantic_scores.py`: After generating embeddings, this script loads them and uses the k-Nearest Neighbors algorithm to find and analyze the closest summaries based on their semantic similarity.

## Running the Scripts

@@ -44,8 +44,12 @@ Before you begin, ensure you have met the following requirements:
python embeddings.py
```

3. After generating the embeddings, run `Semantic Scores.py` to perform the nearest neighbors analysis.
3. After generating the embeddings, run `semantic_scores.py` to perform the nearest neighbors analysis.

```bash
python nearest_neighbors.py
```
python semantic_scores.py
```

## Note on Test Coverage

Please note that the scripts folder does not have test coverage because all these scripts are intended for one-time use. They were specifically designed to process a dataset for a singular analysis purpose, and as such, traditional unit or integration testing paradigms are not directly applicable.
18 changes: 9 additions & 9 deletions scripts/Semantic Scores.py → scripts/semantic_scores.py
@@ -4,10 +4,10 @@
based on their embeddings, and saves/loads the inference results.
"""

# Import necessary libraries
# Corrected import order
import ast # For safe evaluation of strings containing Python literals
import pandas as pd
import numpy as np
import ast # For safe evaluation of strings containing Python literals
from sklearn.neighbors import NearestNeighbors

# Load dataset
@@ -51,23 +51,23 @@ def convert_embeddings_to_list(embeddings_series):
distances_loaded = np.load('distances_updated.npy')
indices_loaded = np.load('indices_updated.npy')

def print_most_similar_items(distances, indices, item_index=0, num_items=1):
def print_most_similar_items(loaded_distances, loaded_indices, item_index=0, num_items=1):
"""
Prints the most similar items based on the kNN analysis.
Parameters:
- distances: A NumPy array of distances between items.
- indices: A NumPy array of indices of the nearest neighbors.
- loaded_distances: A NumPy array of distances between items, loaded from file.
- loaded_indices: A NumPy array of indices of the nearest neighbors, loaded from file.
- item_index: The index of the item for which to find similar items.
- num_items: The number of similar items to display.
"""
most_similar_index = indices_loaded[item_index][1] # Skip the item itself
similarity_score = 1 - distances_loaded[item_index][1] # Convert distance to similarity
most_similar_index = loaded_indices[item_index][1] # Skip the item itself
similarity_score = 1 - loaded_distances[item_index][1] # Convert distance to similarity
print(f"Most similar plot index: {most_similar_index}")
print(f"Similarity score: {similarity_score}")

next_closest_indices = indices[item_index][1:num_items+1]
similarity_scores = 1 - distances_loaded[item_index][1:num_items+1]
next_closest_indices = loaded_indices[item_index][1:num_items+1]
similarity_scores = 1 - loaded_distances[item_index][1:num_items+1]
print("Indices of the next closest items:", next_closest_indices)
print(f"Similarity scores: {similarity_scores}")

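Downstream, `semantic_scores.py` turns those embeddings into a k-nearest-neighbours index and reads the similarity scores back out. A minimal sketch of that step follows, assuming the embeddings were saved as stringified lists in an `embeddings` column and that cosine distance is the metric (which is what makes `1 - distance` a similarity); the input file name is hypothetical.

```python
# Sketch only; "complete_w_embeddings.csv" is a hypothetical file name, and the
# cosine metric is an assumption consistent with the 1 - distance similarity above.
import ast

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

df = pd.read_csv("complete_w_embeddings.csv")
matrix = np.array([ast.literal_eval(e) for e in df["embeddings"]])

# Fit kNN and query every row against the whole collection.
knn = NearestNeighbors(n_neighbors=6, metric="cosine").fit(matrix)
distances, indices = knn.kneighbors(matrix)

# Same file names the script loads back in.
np.save("distances_updated.npy", distances)
np.save("indices_updated.npy", indices)

# Column 0 is the item itself, so column 1 is its nearest neighbour.
print("Most similar plot index:", indices[0][1])
print("Similarity score:", 1 - distances[0][1])
```

The `print_most_similar_items` helper in the diff above consumes exactly these two saved arrays.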
