This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
news20 binary classification on Amazon Elastic MapReduce
Makoto YUI edited this page Oct 31, 2013
·
40 revisions
Learn how to use Hive with Elastic MapReduce (EMR).
http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-hive.html
Launch EMR with Hive in interactive mode. I usually launch EMR clusters on cheap Spot instances through the CLI as follows:
# Create an EMR cluster named "Hive cluster" that stays up after creation
# (--alive) with Hive set up for interactive use, using the hive-site.xml
# stored in the bucket.
# Instance groups: 1 x m1.medium master and 3 x m1.large core nodes, each
# given a Spot bid price via --bid-price.
# Debugging is enabled with logs written to s3n://${s3bucket}/emr/logs, and
# the install-ganglia bootstrap action is run.
# The command references ${s3bucket}; set that shell variable to your bucket
# name before running it.
./elastic-mapreduce --create --alive \
--name "Hive cluster" \
--hive-interactive --hive-versions latest \
--hive-site=s3://${s3bucket}/emr/conf/hive-site.xml \
--ami-version latest \
--instance-group master --instance-type m1.medium --instance-count 1 --bid-price 0.175 \
--instance-group core --instance-type m1.large --instance-count 3 --bid-price 0.35 \
--enable-debugging --log-uri s3n://${s3bucket}/emr/logs \
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/install-ganglia
To use YARN instead of old Hadoop, specify "--ami-version 3.0.0". Hivemall works on both old Hadoop and YARN.
Or, launch EMR instances using the EMR GUI wizard.
# Download the Hivemall jar and its define-all.hive script into ~/tmp.
# The Hive session started at the end loads them through the relative paths
# ./tmp/..., so tmp has to live in the directory hive is started from (the
# home directory). -p keeps mkdir from failing when ~/tmp already exists.
mkdir -p ~/tmp
cd ~/tmp
# NOTE(review): --no-check-certificate disables TLS certificate verification;
# drop it if this host's wget can validate GitHub's certificate.
wget --no-check-certificate https://github.com/myui/hivemall/raw/master/target/hivemall.jar https://github.com/myui/hivemall/raw/master/scripts/ddl/define-all.hive
# Start the Hive CLI from the home directory so ./tmp resolves to ~/tmp.
cd ~
hive
Put the training and test data in TSV format on Amazon S3, and create ${s3bucket}/emr/outputs for the outputs.
-- Create the news20 database if it is not there yet and switch to it.
-- IF NOT EXISTS lets this setup be re-run in a later session without failing.
CREATE DATABASE IF NOT EXISTS news20;
USE news20;
-- Register the Hivemall jar and run its define-all.hive DDL script; both
-- paths are relative to the directory the Hive CLI was started from.
add jar ./tmp/hivemall.jar;
source ./tmp/define-all.hive;
-- Training set, read in place from S3 as tab-separated text:
-- (rowid, label, features), where features is a comma-separated list of strings.
CREATE EXTERNAL TABLE news20b_train (
    rowid    INT,
    label    INT,
    features ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION 's3n://${s3bucket}/datasets/news20b/train';
-- Test set, read in place from S3 as tab-separated text:
-- (rowid, label, features), where features is a comma-separated list of strings.
CREATE EXTERNAL TABLE news20b_test (
    rowid    INT,
    label    INT,
    features ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION 's3n://${s3bucket}/datasets/news20b/test';
-- View over news20b_train whose rows come from amplify(3, *), exposed as
-- (rowid, label, features) and distributed/sorted by three seeded rand() keys.
-- NOTE(review): amplify(3, *) presumably emits each training row 3 times
-- (hence the _x3 suffix) and the CLUSTER BY presumably shuffles them; confirm
-- against the Hivemall documentation.
CREATE OR REPLACE VIEW news20b_train_x3 AS
SELECT
    *
FROM (
    SELECT
        amplify(3, *) AS (rowid, label, features)
    FROM
        news20b_train
) t
CLUSTER BY
    CAST(rand(47) * 100 AS INT),
    CAST(rand(49) * 100 AS INT),
    CAST(rand(50) * 100 AS INT);
-- Materialize the test set with one row per (rowid, feature entry):
-- every element of addBias(features) is split on ":" into an INT feature id
-- (part 0) and a FLOAT value (part 1).
CREATE TABLE news20b_test_exploded AS
SELECT
    rowid,
    label,
    CAST(split(feature, ":")[0] AS INT)   AS feature,
    CAST(split(feature, ":")[1] AS FLOAT) AS value
FROM
    news20b_test
    LATERAL VIEW explode(addBias(features)) t AS feature;
-- (Re)create the external table that stores the trained model as
-- tab-separated (feature, weight) text files on S3.
-- IF EXISTS makes the drop a no-op when the table has not been created yet,
-- independent of the hive.exec.drop.ignorenonexistent setting.
DROP TABLE IF EXISTS news20b_arow_model1;
CREATE EXTERNAL TABLE IF NOT EXISTS news20b_arow_model1 (
    feature STRING,
    weight  FLOAT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION 's3://${s3bucket}/emr/outputs/news20b_arow_model1';
-- Train on news20b_train_x3 and overwrite the model table with one weight
-- per feature: train_arow emits (feature, weight) rows, which are combined
-- per feature with voted_avg and cast to FLOAT.
INSERT OVERWRITE TABLE news20b_arow_model1
SELECT
    feature,
    CAST(voted_avg(weight) AS FLOAT) AS weight
FROM (
    SELECT
        train_arow(addBias(features), label) AS (feature, weight)
    FROM
        news20b_train_x3
) t
GROUP BY
    feature;
-- Per test row, sum weight * value over its features and predict label 1
-- when that sum is positive, otherwise -1.
-- The LEFT OUTER JOIN keeps test features that have no row in the model;
-- their product is NULL and therefore does not contribute to the sum.
CREATE OR REPLACE VIEW news20b_arow_predict1 AS
SELECT
    t.rowid,
    sum(m.weight * t.value) AS total_weight,
    CASE
        WHEN sum(m.weight * t.value) > 0.0 THEN 1
        ELSE -1
    END AS label
FROM
    news20b_test_exploded t
    LEFT OUTER JOIN news20b_arow_model1 m
        ON (t.feature = m.feature)
GROUP BY
    t.rowid;
-- Pair each test row's actual label with its predicted label, matched on rowid.
CREATE OR REPLACE VIEW news20b_arow_submit1 AS
SELECT
    t.rowid,
    t.label  AS actual,
    pd.label AS predicted
FROM
    news20b_test t
    JOIN news20b_arow_predict1 pd
        ON (t.rowid = pd.rowid);
-- Accuracy: number of rows whose predicted label equals the actual label,
-- divided by 4996.
-- NOTE(review): 4996 is hard-coded, presumably the number of rows in
-- news20b_test; confirm it matches your test set.
SELECT
    count(1) / 4996
FROM
    news20b_arow_submit1
WHERE
    actual = predicted;
0.9659727782225781
-- Clean up the model table and the prediction/evaluation views.
-- IF EXISTS lets the cleanup be re-run after a partial or repeated run
-- without failing on objects that are already gone.
DROP TABLE IF EXISTS news20b_arow_model1;
DROP VIEW IF EXISTS news20b_arow_predict1;
DROP VIEW IF EXISTS news20b_arow_submit1;