Commit

Merge pull request #121 from jacobp24/scripts_changes
Making scripts pylint compliant
jacobp24 authored Mar 13, 2024
2 parents f9c4244 + 7fc6683 commit fac2957
Showing 3 changed files with 46 additions and 44 deletions.
58 changes: 28 additions & 30 deletions scripts/Embeddings.py
@@ -1,79 +1,78 @@
# A general exception is needed here to deal with broad API errors.
# pylint: disable=W0718,W0621
"""
This script processes a dataset of book summaries to generate and utilize embeddings
for semantic analysis, leveraging the voyageai API for embedding generation.
"""

# Import standard libraries
import os
import time

# Import third-party libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import voyageai
from voyageai import Client as VoyageClient

# Download the 'punkt' tokenizer model
nltk.download('punkt')

# Load the dataset
DATA_PATH = "C:/Users/stlp/Desktop/Geeky/Software/bookworm_local/attempt_1/complete_w_ratings.csv"
DATA_PATH = "complete_w_ratings.csv"
df = pd.read_csv(DATA_PATH)

# Load the instance of the model
client = voyageai.Client(api_key="")
# Initialize the voyageai Client
voyage_client = VoyageClient(api_key="")

def token_count(summary):
def token_count(summary, client):
"""
Counts the number of tokens in a summary using the voyageai Client.
Parameters:
- summary: The text summary to count tokens in.
- client: The voyageai client instance.
Returns:
- The token count.
"""
return client.count_tokens([summary])

# Data cleaning and preprocessing
df['token_count'] = df['summary'].apply(token_count)
# Apply token counting
df['token_count'] = df['summary'].apply(lambda x: token_count(x, voyage_client))
filtered_df = df[df['token_count'] <= 4000]
filtered_df.drop(columns=['token_count'], inplace=True)

# Prepare texts for embedding generation
texts = filtered_df['summary'].tolist()
text_summaries = filtered_df['summary'].tolist()

def generate_embeddings(texts, batch_size=24):
def generate_embeddings(texts, client, batch_size=24):
"""
Generates embeddings for a list of texts in batches.
Parameters:
- texts: A list of text summaries.
- client: The voyageai client instance.
- batch_size: The size of each batch for processing.
Returns:
- A list of embeddings.
"""
client = voyageai.Client(api_key="")
embeddings = []
all_embeddings = []
progress_count = 0

for i in range(0, len(texts), batch_size):
batch_texts = texts[i:i + batch_size]
try:
batch_embeddings = client.embed(batch_texts, model="voyage-lite-02-instruct", input_type="document").embeddings
embeddings.extend(batch_embeddings)
except Exception:
process_individual_texts(batch_texts, embeddings, client)
batch_result = client.embed(batch_texts, model="voyage-lite-02-instruct",
input_type="document")
batch_embeddings = batch_result.embeddings
all_embeddings.extend(batch_embeddings)
except Exception: # Use of a general exception to deal with broad API Errors.
process_individual_texts(batch_texts, all_embeddings, client)

# Progress update
progress = (progress_count / (len(texts) / batch_size)) * 100
print(f"\rProgress: {progress:.3f}%", end='')
print(f"\rProgress: {progress:.2f}%", end='')
progress_count += 1

print("\nDone!")
return embeddings
return all_embeddings

def process_individual_texts(batch_texts, embeddings, client):
"""
@@ -84,17 +83,16 @@ def process_individual_texts(batch_texts, embeddings, client):
- embeddings: The list to append embeddings to.
- client: The voyageai client instance.
"""
batch_embeddings = []
for text in batch_texts:
try:
embedding = client.embed([text], model="voyage-lite-02-instruct", input_type="document").embeddings
batch_embeddings.extend(embedding)
except Exception:
batch_embeddings.append(None)
embeddings.extend(batch_embeddings)
result = client.embed([text], model="voyage-lite-02-instruct",
input_type="document")
embeddings.extend(result.embeddings)
except Exception: # Use of a general exception to deal with broad API Errors
embeddings.append(None)

# Generate embeddings
embeddings = generate_embeddings(texts)
embeddings = generate_embeddings(text_summaries, voyage_client)
filtered_df["embeddings"] = embeddings

# Save the processed dataframe
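For context, here is a minimal usage sketch of how the refactored helpers are meant to be driven. It assumes a valid Voyage API key in a `VOYAGE_API_KEY` environment variable and a local `complete_w_ratings.csv` with a `summary` column (neither is supplied by this commit); the model name `voyage-lite-02-instruct` and the 4000-token cutoff are taken from the diff above.

```python
# Usage sketch only; assumes VOYAGE_API_KEY is set and complete_w_ratings.csv
# (with a "summary" column) is present in the working directory.
import os

import pandas as pd
from voyageai import Client as VoyageClient

voyage_client = VoyageClient(api_key=os.environ["VOYAGE_API_KEY"])
df = pd.read_csv("complete_w_ratings.csv")

# Count tokens per summary and drop anything over the script's 4000-token cutoff.
df["token_count"] = df["summary"].apply(lambda s: voyage_client.count_tokens([s]))
short_df = df[df["token_count"] <= 4000]

# Embed one batch of summaries with the same model the script uses.
batch = short_df["summary"].head(24).tolist()
result = voyage_client.embed(batch, model="voyage-lite-02-instruct", input_type="document")
print(f"{len(result.embeddings)} embeddings of dimension {len(result.embeddings[0])}")
```

Passing the client into `token_count` and `generate_embeddings`, rather than re-creating it inside `generate_embeddings`, is one of the changes this commit makes.
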
14 changes: 9 additions & 5 deletions scripts/README.md
@@ -30,9 +30,9 @@ Before you begin, ensure you have met the following requirements:

## Files Description

- `Embeddings.py`: This script processes a dataset of book summaries to generate embeddings using the voyageai API. It includes data cleaning, token counting, and embedding generation.
- `embeddings.py`: This script processes a dataset of book summaries to generate embeddings using the voyageai API. It includes data cleaning, token counting, and embedding generation.

- `Semantic Scores.py`: After generating embeddings, this script loads them and uses the k-Nearest Neighbors algorithm to find and analyze the closest summaries based on their semantic similarity.
- `semantic_scores.py`: After generating embeddings, this script loads them and uses the k-Nearest Neighbors algorithm to find and analyze the closest summaries based on their semantic similarity.

## Running the Scripts

@@ -44,8 +44,12 @@ Before you begin, ensure you have met the following requirements:
python embeddings.py
```

3. After generating the embeddings, run `Semantic Scores.py` to perform the nearest neighbors analysis.
3. After generating the embeddings, run `semantic_scores.py` to perform the nearest neighbors analysis.

```bash
python nearest_neighbors.py
```
python semantic_scores.py
```

## Note on Test Coverage

Please note that the scripts folder does not have test coverage because all these scripts are intended for one-time use. They were specifically designed to process a dataset for a singular analysis purpose, and as such, traditional unit or integration testing paradigms are not directly applicable.
18 changes: 9 additions & 9 deletions scripts/Semantic Scores.py → scripts/semantic_scores.py
@@ -4,10 +4,10 @@
based on their embeddings, and saves/loads the inference results.
"""

# Import necessary libraries
# Corrected import order
import ast # For safe evaluation of strings containing Python literals
import pandas as pd
import numpy as np
import ast # For safe evaluation of strings containing Python literals
from sklearn.neighbors import NearestNeighbors

# Load dataset
@@ -51,23 +51,23 @@ def convert_embeddings_to_list(embeddings_series):
distances_loaded = np.load('distances_updated.npy')
indices_loaded = np.load('indices_updated.npy')

def print_most_similar_items(distances, indices, item_index=0, num_items=1):
def print_most_similar_items(loaded_distances, loaded_indices, item_index=0, num_items=1):
"""
Prints the most similar items based on the kNN analysis.
Parameters:
- distances: A NumPy array of distances between items.
- indices: A NumPy array of indices of the nearest neighbors.
- loaded_distances: A NumPy array of distances between items, loaded from file.
- loaded_indices: A NumPy array of indices of the nearest neighbors, loaded from file.
- item_index: The index of the item for which to find similar items.
- num_items: The number of similar items to display.
"""
most_similar_index = indices_loaded[item_index][1] # Skip the item itself
similarity_score = 1 - distances_loaded[item_index][1] # Convert distance to similarity
most_similar_index = loaded_indices[item_index][1] # Skip the item itself
similarity_score = 1 - loaded_distances[item_index][1] # Convert distance to similarity
print(f"Most similar plot index: {most_similar_index}")
print(f"Similarity score: {similarity_score}")

next_closest_indices = indices[item_index][1:num_items+1]
similarity_scores = 1 - distances_loaded[item_index][1:num_items+1]
next_closest_indices = loaded_indices[item_index][1:num_items+1]
similarity_scores = 1 - loaded_distances[item_index][1:num_items+1]
print("Indices of the next closest items:", next_closest_indices)
print(f"Similarity scores: {similarity_scores}")

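Downstream, `semantic_scores.py` turns those embeddings into a k-nearest-neighbours index and reads the similarity scores back out. A minimal sketch of that step follows, assuming the embeddings were saved as stringified lists in an `embeddings` column and that cosine distance is the metric (which is what makes `1 - distance` a similarity); the input file name is hypothetical.

```python
# Sketch only; "complete_w_embeddings.csv" is a hypothetical file name, and the
# cosine metric is an assumption consistent with the 1 - distance similarity above.
import ast

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

df = pd.read_csv("complete_w_embeddings.csv")
matrix = np.array([ast.literal_eval(e) for e in df["embeddings"]])

# Fit kNN and query every row against the whole collection.
knn = NearestNeighbors(n_neighbors=6, metric="cosine").fit(matrix)
distances, indices = knn.kneighbors(matrix)

# Same file names the script loads back in.
np.save("distances_updated.npy", distances)
np.save("indices_updated.npy", indices)

# Column 0 is the item itself, so column 1 is its nearest neighbour.
print("Most similar plot index:", indices[0][1])
print("Similarity score:", 1 - distances[0][1])
```

The `print_most_similar_items` helper in the diff above consumes exactly these two saved arrays.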
