revert pylint checker to disabled
sue-t-boyd committed Mar 11, 2024
1 parent 2972798 commit 360698b
Showing 7 changed files with 275 additions and 3 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/build_test.yml
@@ -92,8 +92,9 @@ jobs:
# Next step: run pylint. Anything less than 10/10 will fail.
- name: Lint with pylint
run: |
-pylint bookworm/**/*.py
-pylint bookworm/*.py
+pylint bookworm/*.py
+pylint bookworm/**/*.py
+

# Next step: run the unit tests with code coverage.
- name: Unit tests
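The commit title says the pylint checker is reverted to disabled, while this hunk only reorders the two pylint invocations; the disabling itself presumably happens elsewhere. For reference, a minimal sketch of how pylint checks are usually silenced with inline pragmas (illustrative Python, not taken from this repository):

# pylint: disable=invalid-name,missing-module-docstring
# A file-level pragma like the one above silences the named checks for the
# whole module; a "# pylint: skip-file" pragma makes pylint ignore the file entirely.
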
2 changes: 1 addition & 1 deletion bookworm/__init__.py
@@ -1 +1 @@
-# EMPTY FILE
+# EMPTY FILE
29 changes: 29 additions & 0 deletions bookworm/local/InspectData.py
@@ -0,0 +1,29 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
from search import HelperFunctions as H
import numpy as np
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)
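search_wrapper.assemble_data is not part of this diff; presumably it stitches the four CSV shards back into a single dataframe. A minimal sketch under that assumption (hypothetical, not the project's actual code):

# hypothetical sketch of search_wrapper.assemble_data, assuming it simply
# concatenates the CSV shards in order
def assemble_data_sketch(*paths):
    return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)
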

col_to_show = ["book_id", "genre", "book_title", "author"]
#pd.set_option('display.max_colwidth', None) # display entire summary field
# note: some queries are deliberately misspelled, presumably to exercise fuzzy matching
for query in ["Death at La Fenice", "Muder in Grub Street", "A touch of Frost"]:
    idx = H.query_to_index(dat, query, ["book_title"])
    print(f"Index is {idx}")
    book = dat.iloc[idx]
    for col in col_to_show:
        print(f"{col} is {book[col]}")
    print(book["summary"])
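HelperFunctions.query_to_index is defined elsewhere in the package; given that thefuzz is imported alongside it, a plausible reading is a fuzzy scan over the requested columns. A sketch under that assumption (hypothetical, not the project's implementation):

# hypothetical fuzzy lookup: return the positional index of the row whose
# chosen columns best match the query, scored with thefuzz
def query_to_index_sketch(df, query, columns):
    best_pos, best_score = 0, -1
    for pos in range(len(df)):
        row = df.iloc[pos]
        score = max(fuzz.ratio(str(row[c]).lower(), query.lower()) for c in columns)
        if score > best_score:
            best_pos, best_score = pos, score
    return best_pos
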



76 changes: 76 additions & 0 deletions bookworm/local/LocalTestScript.py
@@ -0,0 +1,76 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)

col_to_show = ["book_id", "book_title", "author"]
ALL_COL = ["genre", "book_title", "summary", "author"]

def calculate_ratio(row):
    # NOTE: relies on a module-level `query` variable being set before use
    return fuzz.ratio(row['author'], query)

#print(dat[col_to_show].iloc[15:40])

# authors_to_check = ["Herge", "Isaac Asimov", "Asimov", "Tarkington", "Stephen King",
# "JRR Tolkien", "Tolkien", "Rowling", "Joyce", "James Joyce"]
# # results from keyword author field only
# for author in authors_to_check:
# idx = search.HelperFunctions.query_to_index(dat, author, ["author"])
# auth_result = dat.iloc[idx]["author"]
# print(f"For author {author}, index is {idx} and author is {auth_result}")

# # results from keyword all fields
# for author in authors_to_check:
# columns = ["book_title", "genre", "author", "summary"]
# idx = search.HelperFunctions.query_to_index(dat, author, columns)
# auth_result = dat.iloc[idx]["author"]
# print(f"For author {author}, index is {idx} and author is {auth_result}")

# several of the titles below are deliberately misspelled, presumably to exercise fuzzy matching
books_to_check = ["way of all flesh", "wizard and glass", "winters heart", "winter's heart",
                  "Myth of sisuphus", "Blade Runner", "wolves of the calla", "mary had a little lamb"]
# # results from keyword title field only
results = pd.DataFrame()
results["query"] = books_to_check
idx_one_col = []
title_one_col = []
idx_all_col = []
title_all_col = []


for book in books_to_check:
    idx = search.HelperFunctions.query_to_index(dat, book, ["book_title"])
    title_result = dat.iloc[idx]["book_title"]
    idx_one_col.append(idx)
    title_one_col.append(title_result)
    idx2 = search.HelperFunctions.query_to_index(dat, book, ALL_COL)
    title_result_2 = dat.iloc[idx2]["book_title"]
    idx_all_col.append(idx2)
    title_all_col.append(title_result_2)

results["idx_one_col"] = idx_one_col
results["title_one_col"] = title_one_col
results["idx_all_col"]=idx_all_col
results["title_all_col"] =title_all_col
print(results)
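calculate_ratio above is defined but never called in this script; presumably it was meant for a row-wise scoring pass like the following (hypothetical usage, with `query` set first):

# hypothetical use of calculate_ratio: score every row's author against `query`
query = "Stephen King"
dat["author_score"] = dat.apply(calculate_ratio, axis=1)
print(dat.nlargest(5, "author_score")[["author", "author_score"]])
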






64 changes: 64 additions & 0 deletions bookworm/local/LocalTestScript2.py
@@ -0,0 +1,64 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
import numpy as np
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)
col_to_show = ["book_id", "book_title", "author"]

print(dat[dat["book_id"] == 156489]["summary"])

indices = np.load('../bookworm/data/indices_updated.npy')
print(indices[0:5])

columns = ["book_title"]
query = "Harry Potter and the Order of the Phoenix"
book_index = search.HelperFunctions.query_to_index(dat, query, columns)
print(book_index)

print(dat.iloc[book_index][col_to_show])



semantic_indices = search.HelperFunctions.get_semantic_results(book_index, 5)
if isinstance(semantic_indices, np.ndarray):
    semantic_indices = semantic_indices.tolist()

print(f"semantic indices are {semantic_indices}")

results = dat.loc[semantic_indices].head(5)
print(results)
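HelperFunctions.get_semantic_results is also defined elsewhere; since indices_updated.npy is loaded above as a matrix of indices, a plausible sketch is a lookup into a precomputed nearest-neighbour table (an assumption, not the actual implementation):

# hypothetical lookup, assuming indices_updated.npy holds one row of
# nearest-neighbour indices per book, with the book itself in column 0
def get_semantic_results_sketch(book_index, num_results=5):
    neighbours = np.load('../bookworm/data/indices_updated.npy')
    return neighbours[book_index, 1:num_results + 1]
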




# book = dat.iloc[0, :]
# id = book["book_id"]
# print(book)
# print(id)
# print(type(id))

# book2 = dat[dat["book_id"]== 4081]
# #book2 = dat[dat["book_id"]== 22808]
# #id2 = book2["book_id"]
# print(type(book2))
# #print(id2)

# #print(book[col_to_show].head(1))


# #book = dat[dat["book_id"] == 4081]
# # print(book)




18 changes: 18 additions & 0 deletions bookworm/local/create_test_genre.py
@@ -0,0 +1,18 @@
# create_test_genre

import pandas as pd
f = "../data/genre.csv"
genres = pd.read_csv(f)
print(genres.shape)


test_genre = genres.head(30)
print(test_genre.shape)
print(test_genre.head())
f = "../data/test_data/test_genre.csv"
test_genre.to_csv(f)

genre_types = set(test_genre["generic_genre"].tolist())
print(genre_types)
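As an aside, pandas can produce the distinct genres directly; the following is equivalent to the set construction above:

# equivalent, and avoids the intermediate list
print(set(test_genre["generic_genre"].unique()))
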


84 changes: 84 additions & 0 deletions bookworm/local/prep_text.py
@@ -0,0 +1,84 @@
""" Script to prep text for tokenization
Script takes the data complete_w_ratings.csv and preprocesses the text
fields in preparation for vectorization.
Module should be run from /scripts folder.
"""
import ast
import pandas as pd

def parse_genres(genre_str):
    """
    Parse a genre dictionary string into comma-separated keywords.
    Parameters:
        genre_str: A string encoding a dictionary of genres.
    Returns:
        A string of genre keywords separated by commas, or
        'Unknown Genre' if the string cannot be parsed.
    """
    try:
        genres_dict = ast.literal_eval(genre_str)
        genres_text = ', '.join(genres_dict.values())
    except (ValueError, SyntaxError):
        genres_text = 'Unknown Genre'
    return genres_text



def fill_na(df):
    """
    Fill in missing values in the dataframe.
    Parameters:
        df: A dataframe with fields author, book_title, genre,
            and summary.
    Returns:
        The dataframe with missing values filled in.
    """
    df.fillna({'author': 'Unknown', 'book_title': 'Unknown',
               'genre': 'Unknown', 'summary': 'No Summary Available'},
              inplace=True)
    return df


def preprocess_text(text):
    """
    Convert text to all lowercase.
    Parameters:
        text: A string.
    Returns:
        The lowercased string.
    """
    text = str(text).lower()
    return text

def prep_df(df):
    """Prepare the dataframe for tokenization.
    Parameters:
        df: A dataframe with columns author, book_title, genre, and summary.
    Returns:
        The preprocessed dataframe: text lowercased and missing values filled.
    """
    df = fill_na(df)
    df["genre"] = df['genre'].apply(parse_genres)
    columns = ["author", "book_title", "genre", "summary"]
    for col in columns:
        df[col] = df[col].apply(preprocess_text)
    return df

#f = "../bookworm/data/test_data/test_data.csv"
f = "../bookworm/data/complete_w_ratings.csv"
dat = pd.read_csv(f)
processed_dat = prep_df(dat)
f = "../bookworm/data/complete_w_ratings_preproc.csv"
processed_dat.to_csv(f)
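
A quick sanity check of parse_genres, assuming the genre column stores dictionary literals of the Freebase id -> name form (the example strings below are illustrative):

# illustrative inputs only; the real genre strings come from complete_w_ratings.csv
print(parse_genres('{"/m/02xlf": "Fiction", "/m/0hwz": "Mystery"}'))  # Fiction, Mystery
print(parse_genres("not a dict"))  # Unknown Genre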
