This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
kdd2010b binary classification dataset
Makoto YUI edited this page Jul 25, 2014
·
3 revisions
Dataset source: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (bridge to algebra)
-- Register the Hivemall jar and load its function definitions for this session.
add jar ./tmp/hivemall.jar;
source ./tmp/define-all.hive;

-- Create the working database only when it is missing, so that the
-- walkthrough can be re-run without an "already exists" error.
create database if not exists kdd2010;
use kdd2010;
-- Training set, read in place from text files under /dataset/kdd10b/train.
-- Each row is tab-delimited: rowid, label, then a comma-separated feature list.
-- NOTE(review): each feature is presumably an index:value string, as the test
-- set is split on a colon further down. Confirm against conv.awk.
-- IF NOT EXISTS keeps the statement re-runnable.
create external table if not exists kdd10b_train (
  rowid int,
  label int,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10b/train';
-- Test set, read in place from text files under /dataset/kdd10b/test.
-- Same layout as the training table: tab-delimited rowid, label and a
-- comma-separated feature list.
-- IF NOT EXISTS keeps the statement re-runnable.
create external table if not exists kdd10b_test (
  rowid int,
  label int,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10b/test';
# Run these from a shell, not from the Hive CLI.
# Each input file is passed through conv.awk and the result is streamed into
# HDFS. The "-" argument makes "hadoop fs -put" read from standard input.
# The target directories are the LOCATION paths of the kdd10b_train and
# kdd10b_test external tables.
# NOTE(review): conv.awk is not shown here. It presumably rewrites each line
# into the tab-delimited rowid, label, features layout - confirm.
awk -f conv.awk kddb | hadoop fs -put - /dataset/kdd10b/train/kddb
awk -f conv.awk kddb.t | hadoop fs -put - /dataset/kdd10b/test/kddb.t
-- Flatten the test set to one row per (rowid, feature) pair.
-- Every element of addBias(features) is split on a colon: the part before it
-- becomes the feature name and the part after it is cast to a float value.
-- NOTE(review): addBias comes from the Hivemall definitions loaded earlier and
-- presumably appends a bias feature to the array - confirm.
create table kdd10b_test_exploded as
select
  tst.rowid,
  tst.label,
  split(t.feature, ":")[0] as feature,
  cast(split(t.feature, ":")[1] as float) as value
from
  kdd10b_test tst
  LATERAL VIEW explode(addBias(tst.features)) t AS feature;
-- Tunables passed to rand_amplify below.
-- NOTE(review): going by the names, xtimes is the amplification factor and
-- shufflebuffersize is the size of the shuffle buffer - confirm against the
-- Hivemall documentation.
set hivevar:xtimes=3;
set hivevar:shufflebuffersize=1000;

-- View over the training table that passes every column through rand_amplify
-- and exposes the output as rowid, label, features.
create or replace view kdd10b_train_x3 as
select
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features)
from
  kdd10b_train;