Commit 5a34786
This includes all files used for model development as well as the final submission files. Note that the raw data is not included.
1 parent e7f3d69
Showing 275 changed files with 748 additions and 0 deletions.
@@ -1,2 +1,6 @@
# Amazon product rating
Project for rating products on Amazon based on reviewer/product data.

Please see https://kasra-eshaghi.github.io/blog/software/amazon-product-rating for more details.

The data for this project can be found at https://www.kaggle.com/c/csc2515-rating-prediction
@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-

import tensorflow as tf
from tensorflow import keras

def model_1(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 1 - min validation loss = 0.5619
    input_time = keras.layers.Input(shape=[train_review_time.shape[1]], name='time_input')
    input_cat = keras.layers.Input(shape=[train_categories.shape[1]], name='cat_input')
    input_summary = keras.layers.Input(shape=[train_summaries.shape[1]], name='summary_input')
    input_review = keras.layers.Input(shape=[train_reviews.shape[1]], name='review_input')

    # embed summary and review tokens (+1 leaves index 0 free for padding)
    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=5, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)

    flatten_summary = keras.layers.Flatten()(embed_summary)
    flatten_review = keras.layers.Flatten()(embed_review)

    concat_all = keras.layers.concatenate([input_time, input_cat, flatten_summary, flatten_review])

    hidden_1 = keras.layers.Dense(units=100, activation='relu', name='hidden')(concat_all)
    output = keras.layers.Dense(units=1, name='output')(hidden_1)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])
    model.compile(loss='mse', optimizer='Adam')

    return model

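# Hedged usage sketch (not part of this file): how a multi-input model built by
# model_1 might be trained. The preprocessed arrays train_review_time,
# train_categories, train_summaries, train_reviews and the target vector
# train_ratings (presumably the 'overall' column) are assumptions here.
#
#   model = model_1(train_review_time, train_categories, train_summaries,
#                   train_reviews, cv_summary_fit, cv_review_fit)
#   model.fit({'time_input': train_review_time, 'cat_input': train_categories,
#              'summary_input': train_summaries, 'review_input': train_reviews},
#             train_ratings, validation_split=0.2, epochs=10)
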
def model_2(train_reviews, cv_review_fit):
    # model 2 - min validation loss = 0.6
    model = keras.models.Sequential([
        keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review'),
        keras.layers.LSTM(units=128),
        keras.layers.Dense(units=1, name='output'),
    ])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_3(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 3 - min validation loss = 0.58
    input_time = keras.layers.Input(shape=[train_review_time.shape[1]], name='time_input')
    input_cat = keras.layers.Input(shape=[train_categories.shape[1]], name='cat_input')
    input_summary = keras.layers.Input(shape=[train_summaries.shape[1]], name='summary_input')
    input_review = keras.layers.Input(shape=[train_reviews.shape[1]], name='review_input')

    # embed the category ids (5 music categories) as well as the text tokens
    embed_cat = keras.layers.Embedding(input_dim=5, output_dim=5, input_length=train_categories.shape[1], name='embed_cat')(input_cat)
    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=5, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)

    flatten_cat = keras.layers.Flatten()(embed_cat)
    flatten_summary = keras.layers.Flatten()(embed_summary)
    flatten_review = keras.layers.Flatten()(embed_review)

    concat_all = keras.layers.concatenate([input_time, flatten_cat, flatten_summary, flatten_review])

    hidden_1 = keras.layers.Dense(units=100, activation='relu', name='hidden')(concat_all)
    output = keras.layers.Dense(units=1, name='output')(hidden_1)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])
    model.compile(loss='mse', optimizer='Adam')

    return model

def model_4(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 4 - min validation loss = 0.86
    input_time = keras.layers.Input(shape=(train_review_time.shape[1],), name='time_input')
    input_cat = keras.layers.Input(shape=(train_categories.shape[1],), name='cat_input')

    # time/category branch
    embed_cat = keras.layers.Embedding(input_dim=5, output_dim=5, input_length=train_categories.shape[1], name='embed_cat')(input_cat)
    flatten_cat = keras.layers.Flatten(name='flatten_cat')(embed_cat)
    concat_time_cat = keras.layers.concatenate([input_time, flatten_cat], name='concat_time_cat')
    dense_time_cat = keras.layers.Dense(units=10, activation='relu')(concat_time_cat)

    # summary branch
    input_summary = keras.layers.Input(shape=(train_summaries.shape[1],), name='summary_input')
    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=32, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    lstm_summary = keras.layers.LSTM(units=128)(embed_summary)
    dense_summary = keras.layers.Dense(units=1)(lstm_summary)

    # review branch
    input_review = keras.layers.Input(shape=(train_reviews.shape[1],), name='review_input')
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)
    lstm_review = keras.layers.LSTM(units=128)(embed_review)
    dense_review = keras.layers.Dense(units=1)(lstm_review)

    concat_all = keras.layers.concatenate([dense_time_cat, dense_summary, dense_review])

    output = keras.layers.Dense(units=1, name='output')(concat_all)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])
    model.compile(loss='mse', optimizer='Adam')

    return model

def model_5(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 5 - min validation loss = 0.55
    input_time = keras.layers.Input(shape=[train_review_time.shape[1]], name='time_input')
    input_cat = keras.layers.Input(shape=[train_categories.shape[1]], name='cat_input')
    input_summary = keras.layers.Input(shape=[train_summaries.shape[1]], name='summary_input')
    input_review = keras.layers.Input(shape=[train_reviews.shape[1]], name='review_input')

    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=16, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)

    # 1-D convolutions over the embedded token sequences
    conv1d_summary = keras.layers.Conv1D(filters=128, kernel_size=5)(embed_summary)
    conv1d_review = keras.layers.Conv1D(filters=128, kernel_size=5)(embed_review)

    flatten_summary = keras.layers.Flatten()(conv1d_summary)
    flatten_review = keras.layers.Flatten()(conv1d_review)

    concat_all = keras.layers.concatenate([input_time, input_cat, flatten_summary, flatten_review])

    hidden_1 = keras.layers.Dense(units=100, activation='relu', name='hidden')(concat_all)
    output = keras.layers.Dense(units=1, name='output')(hidden_1)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])
    model.compile(loss='mse', optimizer='Adam')

    return model

def model_6(train_summaries, train_reviews, vocab_size, max_sent_len):
    # model 6 - validation error = 0.63
    input_reviews = keras.layers.Input(shape=(train_reviews.shape[1],), name='review_input')
    input_summaries = keras.layers.Input(shape=(train_summaries.shape[1],), name='summary_input')

    # a single embedding layer shared between reviews and summaries
    embed_words = keras.layers.Embedding(input_dim=vocab_size, output_dim=8, input_length=max_sent_len, mask_zero=True, name='word_embedding')

    input_reviews_encoded = embed_words(input_reviews)
    input_summaries_encoded = embed_words(input_summaries)

    concat_embedded_sentences = keras.layers.Concatenate(axis=-1, name='summary_and_review')([input_reviews_encoded, input_summaries_encoded])

    lstm_sentences = keras.layers.LSTM(64, return_sequences=False, name='lstm_out')(concat_embedded_sentences)

    output = keras.layers.Dense(1)(lstm_sentences)

    model = keras.Model(inputs=[input_reviews, input_summaries], outputs=[output])
    model.compile(loss='mse', optimizer='Adam')

    return model

def model_7(train_reviews, vocab_size, max_sent_len):
    # model 7 - validation error = 0.5
    input_reviews = keras.layers.Input(shape=(train_reviews.shape[1],), name='review_input')

    embed_review = keras.layers.Embedding(input_dim=vocab_size, output_dim=32, input_length=max_sent_len, mask_zero=False)(input_reviews)

    gru_review = keras.layers.GRU(units=128, return_sequences=False)(embed_review)
    # gru_review_2 = keras.layers.GRU(units=128)(gru_review)

    output = keras.layers.Dense(1)(gru_review)

    model = keras.Model(inputs=[input_reviews], outputs=[output])
    model.compile(loss='mse', optimizer='Adam')

    return model

def build_model(n_hidden=1, n_neurons=50, learning_rate=3e-3, inputshape=5):
    model = keras.models.Sequential()
    # add input layer
    model.add(keras.layers.InputLayer(input_shape=(inputshape,)))

    model.add(keras.layers.Dense(n_neurons, activation='relu'))
    # add desired number of hidden layers:
    for layer in range(n_hidden - 1):
        model.add(keras.layers.Dense(n_neurons, activation='relu'))

    # add last dense layer to get output:
    model.add(keras.layers.Dense(1))

    # compile model:
    model.compile(loss='mse', optimizer=keras.optimizers.SGD(learning_rate=learning_rate))

    return model
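
Since build_model exposes its hyperparameters as keyword arguments, it lends itself to a randomized hyperparameter search. A minimal sketch, assuming the TF 2.x scikit-learn wrapper (removed in later TensorFlow releases) and hypothetical feature/target arrays X_train and y_train that do not appear in this file:

# Hedged sketch: hyperparameter search over build_model. Assumes TF 2.x, where
# keras.wrappers.scikit_learn.KerasRegressor is still available, and
# hypothetical arrays X_train, y_train with inputshape=5 features.
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

keras_reg = KerasRegressor(build_model)
param_distribs = {
    'n_hidden': [1, 2, 3],
    'n_neurons': [25, 50, 100],
    'learning_rate': reciprocal(3e-4, 3e-2),
}
search = RandomizedSearchCV(keras_reg, param_distribs, n_iter=10, cv=3,
                            scoring='neg_mean_squared_error')
search.fit(X_train, y_train, epochs=20, verbose=0)
print(search.best_params_)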
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 6 17:43:10 2020

@author: Kasra
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# import data
df = pd.read_json('train.json', lines=True)
# drop unnecessary columns
df.drop('image', inplace=True, axis=1)
df.drop('reviewTime', inplace=True, axis=1)
df.drop('reviewHash', inplace=True, axis=1)

# clean up price data (strip the leading dollar sign):
df['price'] = df['price'].replace({r'\$': ''}, regex=True)

# clean up review text and summary text (replace missing entries with empty strings):
df['reviewText'] = df['reviewText'].replace(np.nan, '', regex=True)
df['summary'] = df['summary'].replace(np.nan, '', regex=True)

#%% Look at rating for each music category over time
categories = df['category'].unique()
colors = ['red', 'blue', 'green', 'magenta', 'gold']
for category in categories:
    data = []
    for overall in range(1, 6, 1):
        # find dataframe for specific overall and category:
        temp_df = df.loc[(df['category'] == category) & (df['overall'] == overall)]
        data.append(temp_df.unixReviewTime)

    plt.hist(data, color=colors, label=['1', '2', '3', '4', '5'],
             bins=[0.9e9, 1e9, 1.1e9, 1.2e9, 1.3e9, 1.4e9, 1.5e9, 1.6e9])
    plt.title(category)
    plt.xlim(0.9e9, 1.6e9)
    plt.ylim(0, 30000)
    plt.xlabel('unixReviewTime')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

#%% Look at statistics for review and summary data:
cv = CountVectorizer(ngram_range=(1, 1), lowercase=True, analyzer='word', binary=True)
analyzer = cv.build_analyzer()

# count number of words in the summaries and reviews
summary_texts = list(df['summary'])
summary_lengths = []
for summary in summary_texts:
    # tokenize:
    summary_tok = analyzer(summary)
    summary_lengths.append(len(summary_tok))

review_texts = list(df['reviewText'])
review_lengths = []
for review in review_texts:
    # tokenize:
    review_tok = analyzer(review)
    review_lengths.append(len(review_tok))

plt.boxplot([summary_lengths], vert=False, labels=['Summary'])
plt.title('Summary Length')
plt.xlim(0, 40)
plt.show()

plt.boxplot([review_lengths], vert=False, labels=['Review'])
plt.title('Review Length')
plt.xlim(0, 5500)
plt.show()

# count number of unique words in summaries and reviews
cv_fit_summary = cv.fit(summary_texts)
print('number of unique words in summaries:', len(cv_fit_summary.vocabulary_.keys()))

cv_fit_review = cv.fit(review_texts)
print('number of unique words in reviews:', len(cv_fit_review.vocabulary_.keys()))

#%% look at common words in summary and review data:
# get common stop words
custom_stop_words = set(stopwords.words('english'))
# add additional stop words (the music category names, in lemmatized and stemmed form):
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
additional_stop_words = {'pop', 'classical', 'jazz', 'dance', 'electronic', 'rock'}
additional_stop_words_normalized = set()
for additional_word in additional_stop_words:
    category_lem = lemmatizer.lemmatize(additional_word)
    category_stem = ps.stem(additional_word)
    additional_stop_words_normalized.update({category_lem, category_stem})

custom_stop_words.update(additional_stop_words_normalized)
summary_texts = list(df['summary'])

cv_summary = CountVectorizer(ngram_range=(1, 1), lowercase=True, stop_words=list(custom_stop_words), analyzer='word', binary=True, max_features=30)
cv_summary_fit = cv_summary.fit_transform(summary_texts)

word_count = np.sum(cv_summary_fit.toarray(), axis=0)

df_temp = pd.DataFrame({'word': cv_summary.get_feature_names(), 'word_count': word_count})
df_temp.sort_values(by='word_count', ascending=False, inplace=True)
df_temp.plot(x='word', y='word_count', kind='bar', legend=False)
plt.xlabel('Top 30 Frequent Unigrams')
plt.ylabel('Frequency')
plt.title('Summary Word Distribution')
plt.ylim(0, 30000)
plt.show()
#%%
review_texts = list(df['reviewText'])
cv_review = CountVectorizer(ngram_range=(1, 1), lowercase=True, stop_words=list(custom_stop_words), analyzer='word', binary=True, max_features=30)
cv_review_fit = cv_review.fit_transform(review_texts)

word_count = np.sum(cv_review_fit.toarray(), axis=0)

df_temp = pd.DataFrame({'word': cv_review.get_feature_names(), 'word_count': word_count})
df_temp.sort_values(by='word_count', ascending=False, inplace=True)
df_temp.plot(x='word', y='word_count', kind='bar', legend=False)
plt.xlabel('Top 30 Frequent Unigrams')
plt.ylabel('Frequency')
plt.ylim(0, 70000)
plt.title('Review Word Distribution')
plt.show()
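
The models file above sizes each Embedding as len(vocabulary_) + 1, which suggests the fitted vectorizer's words are mapped to 1-based token ids with id 0 reserved for padding. A minimal sketch of that bridge, assuming a hypothetical helper (texts_to_padded_ids does not appear in this commit) and assumed sequence lengths:

# Hedged sketch: turn the fitted CountVectorizer vocabulary into the padded
# integer sequences the embedding models expect. texts_to_padded_ids is a
# hypothetical helper, not part of this commit; max_len values are assumptions
# loosely guided by the length boxplots above.
from tensorflow.keras.preprocessing.sequence import pad_sequences

def texts_to_padded_ids(texts, cv, max_len):
    analyzer = cv.build_analyzer()
    vocab = cv.vocabulary_           # word -> 0-based column index
    seqs = [[vocab[tok] + 1          # shift by 1 so id 0 can mean "padding"
             for tok in analyzer(t) if tok in vocab]
            for t in texts]
    return pad_sequences(seqs, maxlen=max_len, padding='post')

train_summaries_ids = texts_to_padded_ids(summary_texts, cv_summary, max_len=30)
train_reviews_ids = texts_to_padded_ids(review_texts, cv_review, max_len=500)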