Commit
This includes all files used for model development as well as the final submission files. Note that the raw data is not included.
Kasra-1374 committed Feb 5, 2024
1 parent e7f3d69 commit 5a34786
Showing 275 changed files with 748 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -1,2 +1,6 @@
# Amazon product rating
Project for predicting product ratings on Amazon from reviewer/product data

Please see https://kasra-eshaghi.github.io/blog/software/amazon-product-rating for more details.

The data for this project can be found at https://www.kaggle.com/c/csc2515-rating-prediction
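
For orientation, here is a minimal sketch of how the Kaggle data can be loaded, mirroring the pd.read_json call used in data_analysis.py below; it assumes train.json has been downloaded from the competition page into the working directory:

import pandas as pd

# each line of train.json is one JSON-encoded review record
df = pd.read_json('train.json', lines=True)
print(df.columns)
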
183 changes: 183 additions & 0 deletions all_models.py
@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-

import tensorflow as tf
from tensorflow import keras

def model_1(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 1 - min val = 0.5619
    input_time = keras.layers.Input(shape=[train_review_time.shape[1]], name='time_input')
    input_cat = keras.layers.Input(shape=[train_categories.shape[1]], name='cat_input')
    input_summary = keras.layers.Input(shape=[train_summaries.shape[1]], name='summary_input')
    input_review = keras.layers.Input(shape=[train_reviews.shape[1]], name='review_input')

    # embed the tokenized summary and review sequences
    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=5, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)

    flatten_summary = keras.layers.Flatten()(embed_summary)
    flatten_review = keras.layers.Flatten()(embed_review)

    concat_all = keras.layers.concatenate([input_time, input_cat, flatten_summary, flatten_review])

    hidden_1 = keras.layers.Dense(units=100, activation='relu', name='hidden')(concat_all)

    output = keras.layers.Dense(units=1, name='output')(hidden_1)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_2(train_reviews, cv_review_fit):
    # min val = 0.6
    model = keras.models.Sequential([
        keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review'),
        keras.layers.LSTM(units=128),
        keras.layers.Dense(units=1, name='output'),
    ])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_3(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 3 - min val = 0.58
    input_time = keras.layers.Input(shape=[train_review_time.shape[1]], name='time_input')
    input_cat = keras.layers.Input(shape=[train_categories.shape[1]], name='cat_input')
    input_summary = keras.layers.Input(shape=[train_summaries.shape[1]], name='summary_input')
    input_review = keras.layers.Input(shape=[train_reviews.shape[1]], name='review_input')

    # embed the category index as well as the token sequences
    embed_cat = keras.layers.Embedding(input_dim=5, output_dim=5, input_length=train_categories.shape[1], name='embed_cat')(input_cat)
    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=5, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)

    flatten_cat = keras.layers.Flatten()(embed_cat)
    flatten_summary = keras.layers.Flatten()(embed_summary)
    flatten_review = keras.layers.Flatten()(embed_review)

    concat_all = keras.layers.concatenate([input_time, flatten_cat, flatten_summary, flatten_review])

    hidden_1 = keras.layers.Dense(units=100, activation='relu', name='hidden')(concat_all)

    output = keras.layers.Dense(units=1, name='output')(hidden_1)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_4(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 4 - min val = 0.86
    input_time = keras.layers.Input(shape=(train_review_time.shape[1],), name='time_input')
    input_cat = keras.layers.Input(shape=(train_categories.shape[1],), name='cat_input')

    # time/category branch
    embed_cat = keras.layers.Embedding(input_dim=5, output_dim=5, input_length=train_categories.shape[1], name='embed_cat')(input_cat)
    flatten_cat = keras.layers.Flatten(name='flatten_cat')(embed_cat)
    concat_time_cat = keras.layers.concatenate([input_time, flatten_cat], name='concat_time_cat')
    dense_time_cat = keras.layers.Dense(units=10, activation='relu')(concat_time_cat)

    # summary branch
    input_summary = keras.layers.Input(shape=(train_summaries.shape[1],), name='summary_input')
    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=32, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    lstm_summary = keras.layers.LSTM(units=128)(embed_summary)
    dense_summary = keras.layers.Dense(units=1)(lstm_summary)

    # review branch
    input_review = keras.layers.Input(shape=(train_reviews.shape[1],), name='review_input')
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)
    lstm_review = keras.layers.LSTM(units=128)(embed_review)
    dense_review = keras.layers.Dense(units=1)(lstm_review)

    concat_all = keras.layers.concatenate([dense_time_cat, dense_summary, dense_review])

    output = keras.layers.Dense(units=1, name='output')(concat_all)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_5(train_review_time, train_categories, train_summaries, train_reviews, cv_summary_fit, cv_review_fit):
    # model 5 - min val = 0.55
    input_time = keras.layers.Input(shape=[train_review_time.shape[1]], name='time_input')
    input_cat = keras.layers.Input(shape=[train_categories.shape[1]], name='cat_input')
    input_summary = keras.layers.Input(shape=[train_summaries.shape[1]], name='summary_input')
    input_review = keras.layers.Input(shape=[train_reviews.shape[1]], name='review_input')

    embed_summary = keras.layers.Embedding(input_dim=len(cv_summary_fit.vocabulary_) + 1, output_dim=16, input_length=train_summaries.shape[1], name='embed_summary')(input_summary)
    embed_review = keras.layers.Embedding(input_dim=len(cv_review_fit.vocabulary_) + 1, output_dim=32, input_length=train_reviews.shape[1], name='embed_review')(input_review)

    # 1D convolutions over the embedded token sequences
    conv1d_summary = keras.layers.Conv1D(filters=128, kernel_size=5)(embed_summary)
    conv1d_review = keras.layers.Conv1D(filters=128, kernel_size=5)(embed_review)

    flatten_summary = keras.layers.Flatten()(conv1d_summary)
    flatten_review = keras.layers.Flatten()(conv1d_review)

    concat_all = keras.layers.concatenate([input_time, input_cat, flatten_summary, flatten_review])

    hidden_1 = keras.layers.Dense(units=100, activation='relu', name='hidden')(concat_all)

    output = keras.layers.Dense(units=1, name='output')(hidden_1)

    model = keras.Model(inputs=[input_time, input_cat, input_summary, input_review], outputs=[output])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_6(train_summaries, train_reviews, vocab_size, max_sent_len):
    # create model, val error = 0.63
    input_reviews = keras.layers.Input(shape=(train_reviews.shape[1],), name='review_input')
    input_summaries = keras.layers.Input(shape=(train_summaries.shape[1],), name='summary_input')

    # share a single embedding layer between reviews and summaries
    embed_words = keras.layers.Embedding(input_dim=vocab_size, output_dim=8, input_length=max_sent_len, mask_zero=True, name='word_embedding')

    input_reviews_encoded = embed_words(input_reviews)
    input_summaries_encoded = embed_words(input_summaries)

    concat_embedded_sentences = keras.layers.Concatenate(axis=-1, name='summary_and_review')([input_reviews_encoded, input_summaries_encoded])

    lstm_sentences = keras.layers.LSTM(64, return_sequences=False, name='lstm_out')(concat_embedded_sentences)

    output = keras.layers.Dense(1)(lstm_sentences)

    model = keras.Model(inputs=[input_reviews, input_summaries], outputs=[output])

    model.compile(loss='mse', optimizer='Adam')

    return model

def model_7(train_reviews, vocab_size, max_sent_len):
    # create model, val error = 0.5
    input_reviews = keras.layers.Input(shape=(train_reviews.shape[1],), name='review_input')

    embed_review = keras.layers.Embedding(input_dim=vocab_size, output_dim=32, input_length=max_sent_len, mask_zero=False)(input_reviews)

    gru_review = keras.layers.GRU(units=128, return_sequences=False)(embed_review)
    # gru_review_2 = keras.layers.GRU(units=128)(gru_review)

    output = keras.layers.Dense(1)(gru_review)

    model = keras.Model(inputs=[input_reviews], outputs=[output])

    model.compile(loss='mse', optimizer='Adam')

    return model

def build_model(n_hidden=1, n_neurons=50, learning_rate=3e-3, inputshape=5):
    model = keras.models.Sequential()
    # add input layer
    model.add(keras.layers.InputLayer(input_shape=(inputshape,)))

    # add desired number of hidden layers:
    for _ in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation='relu'))

    # add last dense layer to get output:
    model.add(keras.layers.Dense(1))

    # compile model:
    model.compile(loss='mse', optimizer=keras.optimizers.SGD(learning_rate=learning_rate))

    return model
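
As a usage note, here is a minimal sketch of how build_model could be driven; the arrays X_train and y_train and the chosen hyperparameter values are hypothetical, not part of this commit:

import numpy as np

# hypothetical data: 5 feature columns to match inputshape=5, ratings in 1..5
X_train = np.random.rand(1000, 5).astype('float32')
y_train = np.random.randint(1, 6, size=(1000,)).astype('float32')

model = build_model(n_hidden=2, n_neurons=50, learning_rate=3e-3, inputshape=5)
model.fit(X_train, y_train, epochs=10, validation_split=0.1)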

147 changes: 147 additions & 0 deletions data_analysis.py
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 6 17:43:10 2020
@author: Kasra
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# import data
df = pd.read_json('train.json', lines=True)
# drop unnecessary columns
df.drop('image', inplace = True, axis = 1)
df.drop('reviewTime', inplace = True, axis = 1)
df.drop('reviewHash', inplace = True, axis = 1)

# clean up price data:
df['price'] = df['price'].replace({r'\$': ''}, regex=True)  # raw string avoids an invalid-escape warning

# clean up review text and summary text:
df['reviewText'] = df['reviewText'].replace(np.nan,'', regex = True)
df['summary'] = df['summary'].replace(np.nan,'', regex = True)

#%% Look at rating for each music category over time
categories = df['category'].unique()
colors = ['red', 'blue', 'green', 'magenta', 'gold']
for category in categories:
    data = []
    for overall in range(1, 6):
        # find dataframe for specific overall and category:
        temp_df = df.loc[(df['category'] == category) & (df['overall'] == overall)]
        data.append(temp_df.unixReviewTime)

    # one histogram per category, with one color per rating value
    plt.hist(data, color=colors, label=['1', '2', '3', '4', '5'], bins=[0.9e9, 1e9, 1.1e9, 1.2e9, 1.3e9, 1.4e9, 1.5e9, 1.6e9])
    plt.title(category)
    plt.xlim(0.9e9, 1.6e9)
    plt.ylim(0, 30000)
    plt.xlabel('unixReviewTime')
    plt.ylabel('Frequency')
    plt.show()

#%% Look at statistics for review and summary data:
cv = CountVectorizer(ngram_range=(1, 1), lowercase=True, analyzer='word', binary=True)
analyzer = cv.build_analyzer()

# count number of words in the summaries and reviews
summary_texts = list(df['summary'])
summary_lengths = []
for summary in summary_texts:
    # tokenize:
    summary_tok = analyzer(summary)
    summary_lengths.append(len(summary_tok))

review_texts = list(df['reviewText'])
review_lengths = []
for review in review_texts:
    # tokenize:
    review_tok = analyzer(review)
    review_lengths.append(len(review_tok))

plt.boxplot([summary_lengths], vert=False, labels=['Summary'])
plt.title('Summary Length')
plt.xlim(0, 40)
plt.show()

plt.boxplot([review_lengths], vert=False, labels=['Review'])
plt.title('Review Length')
plt.xlim(0, 5500)
plt.show()

# count number of unique words in summary and review
cv_fit_summary = cv.fit(summary_texts)
print('number of unique words in summaries:', len(cv_fit_summary.vocabulary_.keys()))

cv_fit_review = cv.fit(review_texts)
print('number of unique words in reviews:', len(cv_fit_review.vocabulary_.keys()))

#%% look at common words in summary and review data:
# get common stop words
custom_stop_words = set(stopwords.words('english'))
# add the genre names as additional stop words, in lemmatized and stemmed form:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
additional_stop_words = {'pop', 'classical', 'jazz', 'dance', 'electronic', 'rock'}
additional_stop_words_normalized = set()
for additional_word in additional_stop_words:
    category_lem = lemmatizer.lemmatize(additional_word)
    category_stem = ps.stem(additional_word)
    additional_stop_words_normalized.update({category_lem, category_stem})

custom_stop_words.update(additional_stop_words_normalized)
summary_texts = list(df['summary'])

cv_summary = CountVectorizer(ngram_range=(1, 1), lowercase=True, stop_words=list(custom_stop_words), analyzer='word', binary=True, max_features=30)
cv_summary_fit = cv_summary.fit_transform(summary_texts)

word_count = np.sum(cv_summary_fit.toarray(), axis=0)

# note: get_feature_names() was later renamed get_feature_names_out() in scikit-learn
df_temp = pd.DataFrame({'word': cv_summary.get_feature_names(), 'word_count': word_count})
df_temp.sort_values(by='word_count', ascending=False, inplace=True)
df_temp.plot(x='word', y='word_count', kind='bar', legend=False)
plt.xlabel('Top 30 Frequent Unigrams')
plt.ylabel('Frequency')
plt.title('Summary Word Distribution')
plt.ylim(0, 30000)
plt.show()
#%%
review_texts = list(df['reviewText'])
cv_review = CountVectorizer(ngram_range=(1, 1), lowercase=True, stop_words=list(custom_stop_words), analyzer='word', binary=True, max_features=30)
cv_review_fit = cv_review.fit_transform(review_texts)

word_count = np.sum(cv_review_fit.toarray(), axis=0)

df_temp = pd.DataFrame({'word': cv_review.get_feature_names(), 'word_count': word_count})
df_temp.sort_values(by='word_count', ascending=False, inplace=True)
df_temp.plot(x='word', y='word_count', kind='bar', legend=False)
plt.xlabel('Top 30 Frequent Unigrams')
plt.ylabel('Frequency')
plt.ylim(0, 70000)
plt.title('Review Word Distribution')
plt.show()
Binary file added final_model.h5
Binary file not shown.
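
Assuming final_model.h5 was written with Keras's model.save, it could be reloaded with the standard call below (a sketch; the file itself is not shown here):

from tensorflow import keras

# restore the trained model from the committed HDF5 file
final_model = keras.models.load_model('final_model.h5')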
