revert pylint checker to disabled
sue-t-boyd committed Mar 11, 2024
1 parent 2972798 commit 360698b
Showing 7 changed files with 275 additions and 3 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/build_test.yml
@@ -92,8 +92,9 @@ jobs:
# Next step: run pylint. Anything less than 10/10 will fail.
- name: Lint with pylint
run: |
-pylint bookworm/**/*.py
-pylint bookworm/*.py
+pylint bookworm/*.py
+pylint bookworm/**/*.py
+

# Next step: run the unit tests with code coverage.
- name: Unit tests
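The commit title says the pylint checker is reverted to disabled, while this hunk only reorders the two pylint invocations; the disabling itself presumably happens elsewhere. For reference, a minimal sketch of how pylint checks are usually silenced with inline pragmas (illustrative Python, not taken from this repository):

# pylint: disable=invalid-name,missing-module-docstring
# A file-level pragma like the one above silences the named checks for the
# whole module; a "# pylint: skip-file" pragma makes pylint ignore the file entirely.
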
2 changes: 1 addition & 1 deletion bookworm/__init__.py
@@ -1 +1 @@
-# EMPTY FILE
+# EMPTY FILE
29 changes: 29 additions & 0 deletions bookworm/local/InspectData.py
@@ -0,0 +1,29 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
from search import HelperFunctions as H
import numpy as np
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)
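search_wrapper.assemble_data is not part of this diff; presumably it stitches the four CSV shards back into a single dataframe. A minimal sketch under that assumption (hypothetical, not the project's actual code):

# hypothetical sketch of search_wrapper.assemble_data, assuming it simply
# concatenates the CSV shards in order
def assemble_data_sketch(*paths):
    return pd.concat([pd.read_csv(p) for p in paths], ignore_index=True)
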

col_to_show = ["book_id", "genre", "book_title", "author"]
#pd.set_option('display.max_colwidth', None) # display entire summary field
# note: some queries are deliberately misspelled, presumably to exercise fuzzy matching
for query in ["Death at La Fenice", "Muder in Grub Street", "A touch of Frost"]:
    idx = H.query_to_index(dat, query, ["book_title"])
    print(f"Index is {idx}")
    book = dat.iloc[idx]
    for col in col_to_show:
        print(f"{col} is {book[col]}")
    print(book["summary"])
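HelperFunctions.query_to_index is defined elsewhere in the package; given that thefuzz is imported alongside it, a plausible reading is a fuzzy scan over the requested columns. A sketch under that assumption (hypothetical, not the project's implementation):

# hypothetical fuzzy lookup: return the positional index of the row whose
# chosen columns best match the query, scored with thefuzz
def query_to_index_sketch(df, query, columns):
    best_pos, best_score = 0, -1
    for pos in range(len(df)):
        row = df.iloc[pos]
        score = max(fuzz.ratio(str(row[c]).lower(), query.lower()) for c in columns)
        if score > best_score:
            best_pos, best_score = pos, score
    return best_pos
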



76 changes: 76 additions & 0 deletions bookworm/local/LocalTestScript.py
@@ -0,0 +1,76 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)

col_to_show = ["book_id", "book_title", "author"]
ALL_COL = ["genre", "book_title", "summary", "author"]

def calculate_ratio(row):
    # NOTE: relies on a module-level `query` variable being set before use
    return fuzz.ratio(row['author'], query)

#print(dat[col_to_show].iloc[15:40])

# authors_to_check = ["Herge", "Isaac Asimov", "Asimov", "Tarkington", "Stephen King",
# "JRR Tolkien", "Tolkien", "Rowling", "Joyce", "James Joyce"]
# # results from keyword author field only
# for author in authors_to_check:
# idx = search.HelperFunctions.query_to_index(dat, author, ["author"])
# auth_result = dat.iloc[idx]["author"]
# print(f"For author {author}, index is {idx} and author is {auth_result}")

# # results from keyword all fields
# for author in authors_to_check:
# columns = ["book_title", "genre", "author", "summary"]
# idx = search.HelperFunctions.query_to_index(dat, author, columns)
# auth_result = dat.iloc[idx]["author"]
# print(f"For author {author}, index is {idx} and author is {auth_result}")

# several of the titles below are deliberately misspelled, presumably to exercise fuzzy matching
books_to_check = ["way of all flesh", "wizard and glass", "winters heart", "winter's heart",
                  "Myth of sisuphus", "Blade Runner", "wolves of the calla", "mary had a little lamb"]
# # results from keyword title field only
results = pd.DataFrame()
results["query"] = books_to_check
idx_one_col = []
title_one_col = []
idx_all_col = []
title_all_col = []


for book in books_to_check:
    idx = search.HelperFunctions.query_to_index(dat, book, ["book_title"])
    title_result = dat.iloc[idx]["book_title"]
    idx_one_col.append(idx)
    title_one_col.append(title_result)
    idx2 = search.HelperFunctions.query_to_index(dat, book, ALL_COL)
    title_result_2 = dat.iloc[idx2]["book_title"]
    idx_all_col.append(idx2)
    title_all_col.append(title_result_2)

results["idx_one_col"] = idx_one_col
results["title_one_col"] = title_one_col
results["idx_all_col"]=idx_all_col
results["title_all_col"] =title_all_col
print(results)
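calculate_ratio above is defined but never called in this script; presumably it was meant for a row-wise scoring pass like the following (hypothetical usage, with `query` set first):

# hypothetical use of calculate_ratio: score every row's author against `query`
query = "Stephen King"
dat["author_score"] = dat.apply(calculate_ratio, axis=1)
print(dat.nlargest(5, "author_score")[["author", "author_score"]])
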






64 changes: 64 additions & 0 deletions bookworm/local/LocalTestScript2.py
@@ -0,0 +1,64 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
import numpy as np
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)
col_to_show = ["book_id", "book_title", "author"]

print(dat[dat["book_id"] == 156489]["summary"])

indices = np.load('../bookworm/data/indices_updated.npy')
print(indices[0:5])

columns = ["book_title"]
query = "Harry Potter and the Order of the Phoenix"
book_index = search.HelperFunctions.query_to_index(dat, query, columns)
print(book_index)

print(dat.iloc[book_index][col_to_show])



semantic_indices = search.HelperFunctions.get_semantic_results(book_index, 5)
if isinstance(semantic_indices, np.ndarray):
    semantic_indices = semantic_indices.tolist()

print(f"semantic indices are {semantic_indices}")

results = dat.loc[semantic_indices].head(5)
print(results)
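HelperFunctions.get_semantic_results is also defined elsewhere; since indices_updated.npy is loaded above as a matrix of indices, a plausible sketch is a lookup into a precomputed nearest-neighbour table (an assumption, not the actual implementation):

# hypothetical lookup, assuming indices_updated.npy holds one row of
# nearest-neighbour indices per book, with the book itself in column 0
def get_semantic_results_sketch(book_index, num_results=5):
    neighbours = np.load('../bookworm/data/indices_updated.npy')
    return neighbours[book_index, 1:num_results + 1]
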




# book = dat.iloc[0, :]
# id = book["book_id"]
# print(book)
# print(id)
# print(type(id))

# book2 = dat[dat["book_id"]== 4081]
# #book2 = dat[dat["book_id"]== 22808]
# #id2 = book2["book_id"]
# print(type(book2))
# #print(id2)

# #print(book[col_to_show].head(1))


# #book = dat[dat["book_id"] == 4081]
# # print(book)




18 changes: 18 additions & 0 deletions bookworm/local/create_test_genre.py
@@ -0,0 +1,18 @@
# create_test_genre

import pandas as pd
f = "../data/genre.csv"
genres = pd.read_csv(f)
print(genres.shape)


test_genre = genres.head(30)
print(test_genre.shape)
print(test_genre.head())
f = "../data/test_data/test_genre.csv"
test_genre.to_csv(f)

genre_types = set(test_genre["generic_genre"].tolist())
print(genre_types)
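As an aside, pandas can produce the distinct genres directly; the following is equivalent to the set construction above:

# equivalent, and avoids the intermediate list
print(set(test_genre["generic_genre"].unique()))
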


84 changes: 84 additions & 0 deletions bookworm/local/prep_text.py
@@ -0,0 +1,84 @@
""" Script to prep text for tokenization
Script takes the data complete_w_ratings.csv and preprocesses the text
fields in preparation for vectorization.
Module should be run from /scripts folder.
"""
import ast
import pandas as pd

def parse_genres(genre_str):
    """
    Parse a genre dictionary string into comma-separated keywords.
    Parameters:
        genre_str: A string encoding a dictionary of genres.
    Returns:
        A string of genre keywords separated by commas, or
        'Unknown Genre' if the string cannot be parsed.
    """
    try:
        genres_dict = ast.literal_eval(genre_str)
        genres_text = ', '.join(genres_dict.values())
    except (ValueError, SyntaxError):
        genres_text = 'Unknown Genre'
    return genres_text



def fill_na(df):
    """
    Fill in missing values in the dataframe.
    Parameters:
        df: A dataframe with fields author, book_title, genre,
            and summary.
    Returns:
        The dataframe with missing values filled in.
    """
    df.fillna({'author': 'Unknown', 'book_title': 'Unknown',
               'genre': 'Unknown', 'summary': 'No Summary Available'},
              inplace=True)
    return df


def preprocess_text(text):
    """
    Convert text to all lowercase.
    Parameters:
        text: A string.
    Returns:
        The lowercased string.
    """
    text = str(text).lower()
    return text

def prep_df(df):
    """Prepare the dataframe for tokenization.
    Parameters:
        df: A dataframe with columns author, book_title, genre, and summary.
    Returns:
        The preprocessed dataframe: text lowercased and missing values filled.
    """
    df = fill_na(df)
    df["genre"] = df['genre'].apply(parse_genres)
    columns = ["author", "book_title", "genre", "summary"]
    for col in columns:
        df[col] = df[col].apply(preprocess_text)
    return df

#f = "../bookworm/data/test_data/test_data.csv"
f = "../bookworm/data/complete_w_ratings.csv"
dat = pd.read_csv(f)
processed_dat = prep_df(dat)
f = "../bookworm/data/complete_w_ratings_preproc.csv"
processed_dat.to_csv(f)
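
A quick sanity check of parse_genres, assuming the genre column stores dictionary literals of the Freebase id -> name form (the example strings below are illustrative):

# illustrative inputs only; the real genre strings come from complete_w_ratings.csv
print(parse_genres('{"/m/02xlf": "Fiction", "/m/0hwz": "Mystery"}'))  # Fiction, Mystery
print(parse_genres("not a dict"))  # Unknown Genre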
