Merge pull request #32 from jacobp24/priyam
modifying search
Showing 13 changed files with 129 additions and 11,280 deletions.
@@ -1,128 +1,121 @@
import pandas as pd
import numpy as np

import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_csv("data/complete_w_ratings.csv")
#distances = np.load('data/distances.npy')
#indices = np.load('data/indices.npy')
def parse_genres(genre_str):
    try:
        # Safely evaluate the string as a dictionary
        genres_dict = ast.literal_eval(genre_str)
        # Extract genre names and join them into a single string
        genres_text = ', '.join(genres_dict.values())
    except (ValueError, SyntaxError):
        # Handle cases where the genre string is malformed or empty
        genres_text = 'Unknown Genre'
    return genres_text
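For orientation, a quick sketch of what parse_genres handles; the sample strings are invented but mimic the dataset's dict-literal genre fields:

# Hypothetical genre value shaped like the dataset's id-to-name dicts
sample = "{'/m/01jfsb': 'Thriller', '/m/02l7c8': 'Romance'}"
print(parse_genres(sample))        # Thriller, Romance
print(parse_genres("not a dict"))  # Unknown Genre (literal_eval raises, caught above)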
# Function to preprocess text
def preprocess_text(text):
    """Basic text preprocessing"""
    text = str(text).lower()  # Convert NaN to 'nan' string, then lowercase text
    # Additional preprocessing here (e.g., remove punctuation, stopwords)
    return text

# Assuming df is your DataFrame containing the books dataset with modified 'genre'
def get_keyword_results(df, query, num_books=10):
    # Fill NaN values with a placeholder for other fields if necessary
    #df.fillna({'author_file1': 'Unknown', 'book_title_file1': 'Unknown', 'genre_file1': 'Unknown', 'summary_file1': 'No Summary Available'}, inplace=True)

from thefuzz import fuzz
import os
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity

import voyageai

# Load environment variables from .env file
load_dotenv()

# API key for Voyage embeddings
api_key = os.getenv("API_KEY")
vo = voyageai.Client(api_key=api_key)
# Load data
# Read the first dataframe, which will provide the column names for the combined dataframe
df1 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_1.csv")

# Read the remaining dataframes without adding their headers as column names
df2 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_2.csv", header=None)
df3 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_3.csv", header=None)
df4 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_4.csv", header=None)
df2.columns = df1.columns
df3.columns = df1.columns
df4.columns = df1.columns

df = pd.concat([df1, df2, df3, df4], ignore_index=True)

distances = np.load('data/distances_updated.npy')
indices = np.load('data/indices_updated.npy')
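The shard-reading block above could equally be written as a loop; a minimal sketch, assuming the same part-file naming scheme:

# Part 1 carries the header; parts 2-4 are headerless and reuse its columns
base = "data/complete_w_embeddings/complete_w_embeddings.csv_part_{}.csv"
parts = [pd.read_csv(base.format(1))]
for i in range(2, 5):
    shard = pd.read_csv(base.format(i), header=None)
    shard.columns = parts[0].columns
    parts.append(shard)
df = pd.concat(parts, ignore_index=True)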
class HelperFunctions:
    @staticmethod
    def parse_genres(genre_str):
        try:
            genres_dict = ast.literal_eval(genre_str)
            genres_text = ', '.join(genres_dict.values())
        except (ValueError, SyntaxError):
            genres_text = 'Unknown Genre'
        return genres_text

    @staticmethod
    def preprocess_text(text):
        text = str(text).lower()
        return text

    @staticmethod
    def get_semantic_results(book_index, num_books=10):
        similar_books_indices = indices[book_index][:num_books]
        return similar_books_indices

    @staticmethod
    def query_to_index(df, query, vectorizer=None):
        df.fillna({'author': 'Unknown', 'book_title': 'Unknown', 'genre': 'Unknown', 'summary': 'No Summary Available'}, inplace=True)
        df['combined_text'] = df.apply(lambda x: HelperFunctions.preprocess_text(f"{x['book_title']} {x['author']} {HelperFunctions.parse_genres(x['genre'])} {x['summary']}"), axis=1)

        if vectorizer is None:
            vectorizer = TfidfVectorizer(stop_words='english')
            vectorizer.fit(df['combined_text'])
        query_vec = vectorizer.transform([query])
        cosine_similarities = linear_kernel(query_vec, vectorizer.transform(df['combined_text'])).flatten()
        most_relevant_index = cosine_similarities.argsort()[-1]
        return most_relevant_index
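A minimal usage sketch for query_to_index; the query text is invented, and note that the call mutates df by adding a combined_text column:

# Returns the positional index of the best TF-IDF match for the query
best_idx = HelperFunctions.query_to_index(df, "boy wizard at a magic school")
print(df.iloc[best_idx][['book_title', 'author']])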
def keyword_search(df, query, num_books=10):
    df.fillna({'author': 'Unknown', 'book_title': 'Unknown', 'genre': 'Unknown', 'summary': 'No Summary Available'}, inplace=True)

    # Combine text from different fields into a single text column
    df['combined_text'] = df.apply(lambda x: HelperFunctions.preprocess_text(f"{x['book_title']} {x['author']} {HelperFunctions.parse_genres(x['genre'])} {x['summary']}"), axis=1)
    query = HelperFunctions.preprocess_text(query)

    # Use TF-IDF Vectorizer to transform texts into feature vectors
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

    # Vectorize the query
    query_vec = vectorizer.transform([query])

    # Compute the cosine similarity between query_vec and all book vectors
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()

    # Get the top N matching books
    top_book_indices = cosine_similarities.argsort()[-num_books:][::-1]

    keyword_results = df.iloc[top_book_indices]
    keyword_indices = keyword_results.index.tolist()
    results = df.loc[keyword_indices].head(num_books)
    return results

""" def get_semantic_results(book_index, num_books=10):
    # Fetch the indices of the most similar books for the given book index
    similar_books_indices = indices[book_index][:num_books]
    # Optionally, you might want to use distances to filter or sort the results further
    # For simplicity, this example returns the top N similar books directly
    return similar_books_indices """
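A usage sketch for keyword_search; the query is illustrative and the data files loaded above must be present:

top5 = keyword_search(df, "space opera rebellion", num_books=5)
print(top5[['book_title', 'author']])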
""" def query_to_index(df, query, vectorizer=None): | ||
def semantic_search(df, query, num_books=10): | ||
book_index = HelperFunctions.query_to_index(df, query) | ||
semantic_indices = HelperFunctions.get_semantic_results(book_index, num_books) | ||
semantic_indices = semantic_indices.tolist() if isinstance(semantic_indices, np.ndarray) else semantic_indices | ||
results = df.loc[semantic_indices].head(num_books) | ||
return results | ||
|
||
# Function for fuzzy matching for author2 search | ||
def author2_search(df, query, ratio = 80, num_books=10): | ||
|
||
def calculate_ratio(row): | ||
return fuzz.ratio(row['author'], query) | ||
# Apply the function to each row and store the result in a new column | ||
df['ratio'] = df.apply(calculate_ratio, axis=1) | ||
# filter the database to only those rows with match > ratio | ||
result = df[df["ratio"] > ratio].tolist() | ||
results = result.head(num_books) | ||
|
||
Map a search query to the most relevant book index in the dataset. | ||
return results | ||
|
||
:param df: DataFrame containing the books dataset. | ||
:param query: The search query as a string. | ||
:param vectorizer: Pre-fitted TF-IDF Vectorizer (optional). | ||
:return: Index of the most relevant book based on the query. | ||
def plot_semantic_search(df, query, num_books = 10): | ||
# computing embeddings for the query | ||
query_embedding = vo.embed(query, model="voyage-lite-02-instruct", input_type="document").embeddings | ||
|
||
# If a vectorizer is not provided, initialize and fit one based on the 'combined_text' column | ||
if vectorizer is None: | ||
from sklearn.feature_extraction.text import TfidfVectorizer | ||
vectorizer = TfidfVectorizer(stop_words='english') | ||
vectorizer.fit(df['combined_text']) | ||
# Vectorize the query using the provided or newly created vectorizer | ||
query_vec = vectorizer.transform([query]) | ||
# Compute cosine similarity between the query vector and all book vectors | ||
from sklearn.metrics.pairwise import linear_kernel | ||
cosine_similarities = linear_kernel(query_vec, vectorizer.transform(df['combined_text'])).flatten() | ||
# Find the index of the most relevant book | ||
most_relevant_index = cosine_similarities.argsort()[-1] | ||
return most_relevant_index """ | ||
|
||
""" def hybrid_search(df, query, distances, indices, num_books=10, alpha=0.5): | ||
Perform a hybrid search combining keyword and semantic search results. | ||
:param df: DataFrame containing the books dataset. | ||
:param query: Search query string. | ||
:param distances: Numpy array of precomputed semantic distances. | ||
:param indices: Numpy array of precomputed semantic indices. | ||
:param num_books: Number of books to return. | ||
:param alpha: Weight for blending the results (0 to 1). Closer to 0 favors keyword, closer to 1 favors semantic. | ||
:return: DataFrame of the top N books based on hybrid search criteria. | ||
# Convert embeddings from string representation back to lists (and then to numpy arrays) | ||
embeddings_matrix = np.array([ast.literal_eval(embedding) if isinstance(embedding, str) else embedding for embedding in df['embeddings']]) | ||
|
||
# Compute cosine similarities between the query embedding and the book embeddings | ||
similarities = cosine_similarity(query_embedding, embeddings_matrix) | ||
|
||
# Step 1: Perform Keyword-Based Search | ||
keyword_results = get_keyword_results(df, query, num_books) | ||
keyword_indices = keyword_results.index.tolist() | ||
# Step 2: Map query to an index for Semantic Search (this step is conceptual and needs a concrete implementation) | ||
# For demonstration, let's assume a function `query_to_index` that maps a query to an index for semantic search | ||
book_index = query_to_index(df, query) # This function needs to be defined based on your application's specifics | ||
semantic_indices = get_semantic_results(book_index, num_books) | ||
semantic_indices = semantic_indices.tolist() if isinstance(semantic_indices, np.ndarray) else semantic_indices | ||
# Step 3: Combine Results | ||
# This could be a simple union or an intersection with weighted ranking | ||
combined_indices = list(set(keyword_indices + semantic_indices)) | ||
# Get indices of the top N similar books | ||
top_n_indices = np.argsort(similarities[0])[::-1][:num_books] | ||
closest_books = df.iloc[top_n_indices] | ||
|
||
# Optional: Re-rank combined results based on some criteria, e.g., blending scores | ||
# For simplicity, this example does not implement re-ranking | ||
# Fetch book details for the combined indices | ||
combined_results = df.loc[combined_indices].head(num_books) | ||
return combined_results | ||
""" | ||
# Return the DataFrame containing the closest books | ||
return closest_books |
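Putting the module together, an illustrative driver; the queries are invented, the data files must exist, and plot_semantic_search additionally needs a valid API_KEY in .env for the Voyage client:

if __name__ == "__main__":
    print(keyword_search(df, "victorian detective", num_books=3))
    print(semantic_search(df, "victorian detective", num_books=3))
    print(plot_semantic_search(df, "an orphan discovers a hidden world", num_books=3))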
@@ -1,49 +1,47 @@
from search import get_keyword_results as get_keyword_results
import search
import pandas as pd
import numpy as np
from thefuzz import fuzz

df = pd.read_csv("data/complete_w_ratings.csv")
#distances = np.load('data/distances.npy')
#indices = np.load('data/indices.npy')

# Read the first dataframe, which will provide the column names for the combined dataframe
df1 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_1.csv")

# Read the remaining dataframes without adding their headers as column names
df2 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_2.csv", header=None)
df3 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_3.csv", header=None)
df4 = pd.read_csv("data/complete_w_embeddings/complete_w_embeddings.csv_part_4.csv", header=None)
df2.columns = df1.columns
df3.columns = df1.columns
df4.columns = df1.columns

df = pd.concat([df1, df2, df3, df4], ignore_index=True)
# Filter
def filter(results, min_ave_ratings, min_num_rating):
    # Filter by min ave ratings if min > 0.0
    # Otherwise keep all, including "none" values
    if min_ave_ratings != 0.0:
        subset_df = results[results['Book-Rating'] > min_ave_ratings]
    else:
        subset_df = results

    # Filter by min num ratings if min_num > 0
    # Otherwise keep all, including "none" values
    if min_num_rating != 0:
        results_filtered = subset_df[subset_df['RatingCount'] > min_num_rating]
    else:
        results_filtered = subset_df
    return results_filtered
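To make filter's behavior concrete, a toy frame with invented values; the column names follow the dataset above:

toy = pd.DataFrame({'book_title': ['A', 'B', 'C'],
                    'Book-Rating': [4.5, 3.0, None],
                    'RatingCount': [120, 5, 40]})
print(filter(toy, 3.5, 10))  # only 'A' survives both thresholds; the None rating is dropped
print(filter(toy, 0.0, 0))   # zero floors keep all rows, including the None rating

Note that this filter shadows Python's built-in filter within the module; a more distinctive name such as filter_by_ratings would avoid surprises.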
# Function for fuzzy matching for author2 search
def calculate_matching_ratio(df, query, ratio=80):

    def calculate_ratio(row):
        return fuzz.ratio(row['author'], query)

    # Apply the function to each row and store the result in a new column
    df['ratio'] = df.apply(calculate_ratio, axis=1)

    # Filter the database to only those rows with match > ratio
    result_df = df[df["ratio"] > ratio]

    return result_df
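For intuition about the default ratio=80 cutoff, it helps to print a few scores; the author strings here are invented:

for candidate in ["J. K. Rowling", "JK Rowling", "Stephen King"]:
    print(candidate, fuzz.ratio(candidate, "J.K. Rowling"))  # near-matches score high, different names score low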
def search_wrapper(search_mode, search_value, min_ave_rating, min_num_ratings, num_books=10):
    if search_mode == "Author2":
        # num_books passed by keyword: author2_search's third positional parameter is ratio
        results = search.author2_search(df, search_value, num_books=num_books)
    elif search_mode == "Title":
        results = search.semantic_search(df, search_value, num_books)
    elif search_mode == "Plot":
        results = search.plot_semantic_search(df, search_value, num_books)
    else:
        results = search.keyword_search(df, search_value, num_books)
    results_filtered = filter(results, min_ave_rating, min_num_ratings)
    return results_filtered
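An illustrative call into the wrapper; the mode string follows the branches above and the query is invented:

hits = search_wrapper("Plot", "a heist that goes sideways", 0.0, 0, num_books=5)
print(hits[['book_title', 'author', 'Book-Rating']])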
Binary file not shown.
Binary file not shown.
This file was deleted.