-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimp_commands.txt
134 lines (95 loc) · 3.3 KB
/
imp_commands.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// To start pig in a local mode from where we can access only the Local File System.
pig -x local
// To start Pig in HDFS (MapReduce) mode, from where we can access the HDFS file system
pig -x mapreduce
// Pig will never store data physically
// Pig is comparatively slow compared to raw MapReduce because it internally builds MapReduce programs
--- PIG DATA TYPES ---
int
long
float
double
chararray
bytearray - the DEFAULT datatype (used when no type is declared)
bag - a collection of tuples (similar to a table)
tuple - an ordered set of fields (like a whole record/row)
atom - a single value of any simple type
pig -help
--- APACHE PIG - Word Count ---
lines = LOAD 'user/root/pig_demo.txt' AS (line:chararray);
words = FOREACH lines GENERATE FLATTEN(TOKENIZE(line)) AS word;
grouped = GROUP words BY word;
wordcount = FOREACH grouped GENERATE group, COUNT(words);
DUMP wordcount;
DESCRIBE grouped;
EXPLAIN wordcount;
--- Counting number of records in Pig ---
data1 = LOAD 'user/root/pig_demo.txt' AS (line:chararray);
wrk_one = foreach data1 generate 1 as one;
wrk_group = group wrk_one by one;
wrk_count = foreach wrk_group generate group, COUNT(wrk_one.one);
dump wrk_count;
-- Load data into pig without a schema
departments = LOAD 'user/root/sqoop-import/departments' USING PigStorage(',') AS (department_id:int, departmentName:chararray);
DESCRIBE departments;
department_id = FOREACH departments GENERATE (int) $0;
DUMP departments;
--- Load Data into Pig relation with a schema
departments = LOAD 'user/root/sqoop-import/departments' USING PigStorage(',') AS (department_id:int, departmentName:chararray);
department_id = FOREACH departments GENERATE department_id;
DUMP department_id;
--- FILTER Command ---
filter1 = FILTER climate BY year == 2007;
DUMP filter1;
filter2 = FILTER climate BY $1 eq 2007;
DUMP filter2;
filter3 = FILTER climate BY $1 > 2007;
DUMP filter3;
filter4 = FILTER climate BY $1 < 2007;
DUMP filter4;
filter5 = FILTER climate BY $1 <gt/lt/eq> 2007;
DUMP filter5;
filter6 = FILTER climate BY NOT $1 == 2007;
DUMP filter6;
filter7 = FILTER climate BY $1 IN (2006, 2007);
DUMP filter7;
--- Matched the records of China & Canada ---
filter8 = FILTER climate BY country MATCHES 'C.*a';
DUMP filter8;
--- Matched the records of Indonesia and China ---
filter9 = FILTER climate BY country MATCHES '.*(done|hin).*';
DUMP filter9;
--- LIMIT Keyword ---
limit1 = LIMIT climate 100;
DUMP limit1;
--- SPLIT Keyword ---
// minimum 2 relations should be there
SPLIT climate into B1992 if $1 == 1992, B2002 if $1 == 2002;
DUMP B1992;
DUMP B2002;
--- DISTINCT Keyword ---
distinct1 = DISTINCT redundant1;
DUMP distinct1;
--- SAMPLE Keyword ---
// will generate some %age of the data present in a relation.
// below command will generate approx. 0.2% of the data which is present in the climate relation.
sample1 = SAMPLE climate 0.2;
DUMP sample1;
--- ORDER Keyword ---
order1 = ORDER climate BY year asc;
DUMP order1;
order2 = ORDER climate BY year desc;
DUMP order2;
order3 = ORDER climate BY year asc, temp desc;
DUMP order3;
--- GROUP BY Command ---
group1 = GROUP climate BY year;
DUMP group1;
group2 = GROUP climate BY (year, temp);
DUMP group2;
--- COGROUP Command- like JOINS in SQL---
// It will combine the similar tuples into one group from 2 or more relations based on the grouping column
cogroup1 = COGROUP a BY a1, b BY b1;
DUMP cogroup1;
cogroup2 = COGROUP a BY a1 INNER, b BY b1 INNER;
DUMP cogroup2;