Commit 360698b (parent: 2972798). Showing 7 changed files with 275 additions and 3 deletions.
@@ -1 +1 @@
-# EMPTY FILE
+# EMPTY FILE
@@ -0,0 +1,29 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
from search import HelperFunctions as H
import numpy as np
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)

col_to_show = ["book_id", "genre", "book_title", "author"]
#pd.set_option('display.max_colwidth', None)  # display entire summary field

# Title lookups; the misspelled query exercises the fuzzy matching
for query in ["Death at La Fenice", "Muder in Grub Street", "A touch of Frost"]:
    idx = H.query_to_index(dat, query, ["book_title"])
    print(f"Index is {idx}")
    book = dat.iloc[idx]
    for col in col_to_show:
        print(f"{col} is {book[col]}")
    print(book["summary"])
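Neither sw.assemble_data nor search.HelperFunctions.query_to_index is included in this commit, so their exact behavior is not shown here. A minimal sketch of what these scripts appear to assume (split-CSV concatenation, and a best-fuzzy-match row lookup via thefuzz); the real implementations may differ:

import pandas as pd
from thefuzz import fuzz

def assemble_data(*paths):
    # Read the split CSV parts and stitch them back into one frame.
    return pd.concat((pd.read_csv(p) for p in paths), ignore_index=True)

def query_to_index(df, query, columns):
    # Return the positional index of the row whose text columns
    # best fuzzy-match the query (0-100 Levenshtein-based score).
    best_pos, best_score = 0, -1
    for pos in range(len(df)):
        row = df.iloc[pos]
        score = max(fuzz.ratio(str(row[col]).lower(), str(query).lower())
                    for col in columns)
        if score > best_score:
            best_pos, best_score = pos, score
    return best_pos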
@@ -0,0 +1,76 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)

col_to_show = ["book_id", "book_title", "author"]
ALL_COL = ["genre", "book_title", "summary", "author"]

def calculate_ratio(row):
    # Note: relies on a global `query`, which this script never defines;
    # the function is currently unused.
    return fuzz.ratio(row['author'], query)

#print(dat[col_to_show].iloc[15:40])

# authors_to_check = ["Herge", "Isaac Asimov", "Asimov", "Tarkington", "Stephen King",
#                     "JRR Tolkien", "Tolkien", "Rowling", "Joyce", "James Joyce"]
# # results from keyword author field only
# for author in authors_to_check:
#     idx = search.HelperFunctions.query_to_index(dat, author, ["author"])
#     auth_result = dat.iloc[idx]["author"]
#     print(f"For author {author}, index is {idx} and author is {auth_result}")

# # results from keyword all fields
# for author in authors_to_check:
#     columns = ["book_title", "genre", "author", "summary"]
#     idx = search.HelperFunctions.query_to_index(dat, author, columns)
#     auth_result = dat.iloc[idx]["author"]
#     print(f"For author {author}, index is {idx} and author is {auth_result}")

# Title queries; several are deliberately inexact or misspelled to
# exercise the fuzzy matching.
books_to_check = ["way of all flesh", "wizard and glass", "winters heart", "winter's heart",
                  "Myth of sisuphus", "Blade Runner", "wolves of the calla", "mary had a little lamb"]

# Compare results from searching the title field only vs. all fields
results = pd.DataFrame()
results["query"] = books_to_check
idx_one_col = []
title_one_col = []
idx_all_col = []
title_all_col = []

for book in books_to_check:
    idx = search.HelperFunctions.query_to_index(dat, book, ["book_title"])
    title_result = dat.iloc[idx]["book_title"]
    idx_one_col.append(idx)
    title_one_col.append(title_result)
    idx2 = search.HelperFunctions.query_to_index(dat, book, ALL_COL)
    title_result_2 = dat.iloc[idx2]["book_title"]
    idx_all_col.append(idx2)
    title_all_col.append(title_result_2)

results["idx_one_col"] = idx_one_col
results["title_one_col"] = title_one_col
results["idx_all_col"] = idx_all_col
results["title_all_col"] = title_all_col
print(results)
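For reference, thefuzz's ratio is a Levenshtein-based similarity score from 0 to 100, which is why the deliberately inexact titles above can still resolve to the right book. A quick standalone check:

from thefuzz import fuzz

# Full-string similarity: misspellings lose a few points but stay high
print(fuzz.ratio("myth of sisuphus", "the myth of sisyphus"))
# Substring-tolerant variant: useful when the query drops or alters words
print(fuzz.partial_ratio("winters heart", "winter's heart"))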
@@ -0,0 +1,64 @@
# Sue's Local Search Script

import pandas as pd
import search_wrapper as sw
import search
import numpy as np
from thefuzz import fuzz

# Assemble data
path_root = "data/complete_w_embeddings/complete_w_embeddings.csv"
path1 = path_root + "_part_1.csv"
path2 = path_root + "_part_2.csv"
path3 = path_root + "_part_3.csv"
path4 = path_root + "_part_4.csv"
dat = sw.assemble_data(path1, path2, path3, path4)
col_to_show = ["book_id", "book_title", "author"]

print(dat[dat["book_id"] == 156489]["summary"])

indices = np.load('../bookworm/data/indices_updated.npy')
print(indices[0:5])

columns = ["book_title"]
query = "Harry Potter and the Order of the Phoenix"
book_index = search.HelperFunctions.query_to_index(dat, query, columns)
print(book_index)

print(dat.iloc[book_index][col_to_show])

semantic_indices = search.HelperFunctions.get_semantic_results(book_index, 5)
semantic_indices = semantic_indices.tolist() if \
    isinstance(semantic_indices, np.ndarray) else semantic_indices

print(f"semantic indices are {semantic_indices}")

results = dat.loc[semantic_indices].head(5)
print(results)

# book = dat.iloc[0, :]
# id = book["book_id"]
# print(book)
# print(id)
# print(type(id))

# book2 = dat[dat["book_id"] == 4081]
# #book2 = dat[dat["book_id"] == 22808]
# #id2 = book2["book_id"]
# print(type(book2))
# #print(id2)

# #print(book[col_to_show].head(1))

# #book = dat[dat["book_id"] == 4081]
# # print(book)
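get_semantic_results is also defined outside this commit. Given the precomputed indices_updated.npy loaded above, it presumably maps a book's position to its nearest neighbors in embedding space. A hedged sketch of that idea using cosine similarity (the embeddings argument and the signature are assumptions for illustration, not the repo's actual API):

import numpy as np

def get_semantic_results(book_index, n, embeddings):
    # Normalize rows, then rank all books by cosine similarity
    # to the query book; return the n closest, excluding itself.
    emb = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = emb @ emb[book_index]
    order = np.argsort(-sims)
    return order[order != book_index][:n]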
@@ -0,0 +1,18 @@
# create_test_genre

import pandas as pd

f = "../data/genre.csv"
genres = pd.read_csv(f)
print(genres.shape)

test_genre = genres.head(30)
print(test_genre.shape)
print(test_genre.head())  # preview the first rows
f = "../data/test_data/test_genre.csv"
test_genre.to_csv(f)

genre_types = set(test_genre["generic_genre"].tolist())
print(genre_types)
@@ -0,0 +1,84 @@
""" Script to prep text for tokenization | ||
Script takes the data complete_w_ratings.csv and preprocesses the text | ||
fields in preparation for vectorization. | ||
Module should be run from /scripts folder. | ||
""" | ||
import ast | ||
import pandas as pd | ||
|
||
def parse_genres(genre_str): | ||
""" | ||
Parse genre dictionaries into keywords joined by commas | ||
Paramters: | ||
Genres: A dictinonary of genres. | ||
Return | ||
A string with genres expressed as keywords, separated | ||
by commas. | ||
""" | ||
|
||
try: | ||
genres_dict = ast.literal_eval(genre_str) | ||
genres_text = ', '.join(genres_dict.values()) | ||
except (ValueError, SyntaxError): | ||
genres_text = 'Unknown Genre' | ||
return genres_text | ||
|
||
|
||
|
||
def fill_na(df): | ||
""" | ||
Fill in missing values in dataframe | ||
Paramaters | ||
df: A dataframe with fields author, book_title, genre | ||
and summary. | ||
Return: | ||
A dataframe with missing values filled in. | ||
""" | ||
|
||
df.fillna({'author': 'Unknown', 'book_title': 'Unknown', | ||
'genre': 'Unknown', 'summary': 'No Summary Available'}, | ||
inplace=True) | ||
return df | ||
|
||
|
||
def preprocess_text(text): | ||
""" | ||
Convert text to all lowercase. | ||
Parameters: | ||
Text: A string | ||
Returns | ||
A lower case string | ||
""" | ||
text = str(text).lower() | ||
return text | ||
|
||
def prep_df(df): | ||
|
||
""""Prepares dataframe for tokenization. | ||
Paramaters: | ||
df: A dataframe with columns author, book_title, genre, and summary | ||
Returns: | ||
Preprocessed dataframe. Text all losercase and missing values filled. | ||
""" | ||
|
||
df = fill_na(df) | ||
df["genre"] = df['genre'].apply(parse_genres) | ||
columns = ["author", "book_title", "genre", "summary"] | ||
for col in columns: | ||
df[col] = df[col].apply(preprocess_text) | ||
return(df) | ||
|
||
#f = "../bookworm/data/test_data/test_data.csv" | ||
f = "../bookworm/data/complete_w_ratings.csv" | ||
dat = pd.read_csv(f) | ||
processed_dat = prep_df(dat) | ||
f = "../bookworm/data/complete_w_ratings_preproc.csv" | ||
processed_dat.to_csv(f) |
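As a quick sanity check of parse_genres: it accepts any string that ast.literal_eval can turn into a dict and falls back to 'Unknown Genre' otherwise. The sample string below is illustrative, not taken from the dataset:

sample = "{'/m/01jfsb': 'Thriller', '/m/02xlf': 'Fiction'}"
print(parse_genres(sample))        # Thriller, Fiction
print(parse_genres("not a dict"))  # Unknown Genre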