This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
MovieLens Matrix Factorization
Makoto YUI edited this page May 23, 2015
·
21 revisions
This page explains how to run matrix factorization on the MovieLens 1M dataset.
Caution: Matrix factorization is supported in Hivemall v0.3 or later.
use movielens;

-- mean of the rating column over the training table
select
  avg(rating)
from
  training;
3.593565
-- mean rating of the training table (the avg(rating) result), passed to train_mf_sgd as -mu and to mf_predict
set hivevar:mu=3.593565;
-- number of factors, passed to train_mf_sgd as -factor
set hivevar:factor=10;
-- maximum number of training iterations, passed to train_mf_sgd as -iter
set hivevar:iters=50;
See this article or OnlineMatrixFactorizationUDTF#getOption() to get the details of options.
Note that there is no need to set an exact value for $mu. Training actually works without setting $mu, but setting it is recommended to get better predictions.
Due to a bug in Hive, do not enter comments in the Hive CLI; leave out the `--` comment lines when pasting the statements on this page.
-- Build sgd_model: run train_mf_sgd over the training table, then average
-- the emitted rank vectors and biases per idx
create table sgd_model as
select
  idx,
  array_avg(u_rank) as Pu,
  array_avg(m_rank) as Qi,
  avg(u_bias) as Bu,
  avg(m_bias) as Bi
from (
  select
    train_mf_sgd(
      userid,
      movieid,
      rating,
      "-factor ${factor} -mu ${mu} -iter ${iters}"
    ) as (idx, u_rank, m_rank, u_bias, m_bias)
  from
    training
) trained
group by
  idx;
Note: Hivemall also provides train_mf_adagrad for training using AdaGrad.
-- Predict a rating for every row of testing: the first join picks up the
-- user side of the model (Pu, Bu) by userid, the second picks up the movie
-- side (Qi, Bi) by movieid
select
  t.rating as actual,
  mf_predict(u.Pu, m.Qi, u.Bu, m.Bi, ${mu}) as predicted
from
  testing t
  LEFT OUTER JOIN sgd_model u
    ON (t.userid = u.idx)
  LEFT OUTER JOIN sgd_model m
    ON (t.movieid = m.idx);
-- Evaluate the model on testing: score every row with mf_predict, then
-- aggregate the (predicted, actual) pairs with mae and rmse
select
  mae(predicted, actual) as mae,
  rmse(predicted, actual) as rmse
from (
  select
    t.rating as actual,
    mf_predict(u.Pu, m.Qi, u.Bu, m.Bi, ${mu}) as predicted
  from
    testing t
    LEFT OUTER JOIN sgd_model u
      ON (t.userid = u.idx)
    LEFT OUTER JOIN sgd_model m
      ON (t.movieid = m.idx)
) scored;
0.6728969407733578 (MAE)
0.8584162122694449 (RMSE)
Recommend the top-k movies that a user has never seen.
-- id of the user to generate recommendations for
set hivevar:userid=1;
-- number of movies to return
set hivevar:topk=5;
-- Recommend the ${topk} highest-scoring movies for user ${userid}.
-- t1: model rows whose idx is not a movieid that the user has in training.
--     Written as an anti-join (LEFT OUTER JOIN plus IS NULL) instead of
--     NOT IN with a subquery, so a NULL movieid in training cannot empty
--     the result and no WHERE-clause subquery support is required.
-- t2: the model row of the target user, cross joined onto every candidate.
select
  t1.movieid,
  mf_predict(t2.Pu, t1.Qi, t2.Bu, t1.Bi, ${mu}) as predicted
from (
  select
    p.idx as movieid,
    p.Qi,
    p.Bi
  from
    sgd_model p
    LEFT OUTER JOIN (
      select distinct
        movieid
      from
        training
      where
        userid = ${userid}
    ) seen
      ON (p.idx = seen.movieid)
  where
    seen.movieid is null
) t1 CROSS JOIN (
  select
    Pu,
    Bu
  from
    sgd_model
  where
    idx = ${userid}
) t2
order by
  predicted DESC
limit ${topk};
movieid | predicted |
---|---|
318 | 4.8051853 |
2503 | 4.788541 |
53 | 4.7518783 |
904 | 4.7463417 |
953 | 4.732769 |