-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3. Athena Queries Thanksgiving.sql
80 lines (75 loc) · 3.35 KB
/
3. Athena Queries Thanksgiving.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#Load Table into Athena from Bucket
DROP TABLE IF EXISTS thanksgiving_key_words;
CREATE EXTERNAL TABLE IF NOT EXISTS `ptb3`.`thanksgiving_key_words` (
`id` string,
`name` string,
`username` string,
`tweet` string,
`followers_count` int,
`location` string,
`geo` string,
`created_at` string,
`tweet_clean` string,
`filtered` array < string >,
`prediction` double
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 's3://ptp3-frankiebromage/bigdataproject/thanksgiving/predicted_data.parquet/'
TBLPROPERTIES ('classification' = 'parquet');
#Create Table with Key Words, Totals, Username and Date from first table so I can create word clouds with key words
DROP TABLE IF EXISTS words_cloud_sentiment;
CREATE TABLE words_cloud_sentiment AS (
SELECT word,username,
CAST(SUBSTR(created_at, 27, 4) || '-' ||
(CASE WHEN SUBSTR(created_at, 5, 3) = 'Dec' THEN '12'
WHEN SUBSTR(created_at, 5, 3) = 'Nov' THEN '11'
END)||
'-' || SUBSTR(created_at, 9, 2) AS date) AS date_, (CASE WHEN prediction = 0 THEN 'Negative'
WHEN prediction = 1 THEN 'Positive'
END) AS Sentiment,
COUNT(word) AS TOTAL
FROM thanksgiving_key_words, UNNEST(filtered) AS t(word)
GROUP BY username, CAST(SUBSTR(created_at, 27, 4) || '-' ||
(CASE WHEN SUBSTR(created_at, 5, 3) = 'Dec' THEN '12'
WHEN SUBSTR(created_at, 5, 3) = 'Nov' THEN '11'
END)||
'-' || SUBSTR(created_at, 9, 2) AS date), (CASE WHEN prediction = 0 THEN 'Negative'
WHEN prediction = 1 THEN 'Positive'
END), word
ORDER BY TOTAL DESC);
#Create Table with cleaned dates and sentiment
#To create datetime I only need to use months November and December because this is the period of all the tweets.
#I filter out rows where year does not equal 2022, because some rows have errors with data not in the right place.
DROP TABLE IF EXISTS thanksgiving_dates;
CREATE TABLE thanksgiving_dates AS
(SELECT id,
tweet,
name,
username,
followers_count,
location,
geo,
tweet_clean,
prediction,
(CASE WHEN prediction = 0 THEN 'Negative'
WHEN prediction = 1 THEN 'Positive'
END) AS Sentiment,
SUBSTR(created_at, 1, 3) AS day_of_week,
SUBSTR(created_at, 5, 3) AS month_,
SUBSTR(created_at, 9, 2) AS day_,
SUBSTR(created_at, 12, 8) AS time_,
SUBSTR(created_at, 27, 4) AS year_,
CAST(SUBSTR(created_at, 27, 4) || '-' ||
(CASE WHEN SUBSTR(created_at, 5, 3) = 'Dec' THEN '12'
WHEN SUBSTR(created_at, 5, 3) = 'Nov' THEN '11'
END)||
'-' || SUBSTR(created_at, 9, 2)|| ' ' ||SUBSTR(created_at, 12, 8) AS timestamp) AS date_time,
CAST(SUBSTR(created_at, 27, 4) || '-' ||
(CASE WHEN SUBSTR(created_at, 5, 3) = 'Dec' THEN '12'
WHEN SUBSTR(created_at, 5, 3) = 'Nov' THEN '11'
END)||
'-' || SUBSTR(created_at, 9, 2) AS date) AS date_,
SUBSTR(created_at, 12, 2) AS hour
FROM thanksgiving_key_words
WHERE SUBSTR(created_at, 27, 4) = '2022');