-
Notifications
You must be signed in to change notification settings - Fork 0
/
line.cpp
101 lines (90 loc) · 2.8 KB
/
line.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Processes and extracts information from individual lines
#include <iostream>
#include <regex>
#include <set>
#include <unordered_map>
#include "include/rapidjson/document.h"
using namespace std;
using namespace rapidjson;
// Forward declaration: to_lower() is defined further down in this file but is
// called from process_line(), which appears first.
string to_lower(string in);
// Hashtag pattern: a literal '#' followed by one or more characters matched by
// \d or \w. Kept as a separate string so the regex source is easy to inspect.
string pattern = "#[\\d\\w]+";
// Compiled form of `pattern`, built once during static initialization and
// used by process_line() for every line.
regex pattern_hashtag(pattern);
/**
* Extract language and hashtags from line, and calculate frequencies.
* @param line line (string), e.g.: "This is a tweet!"
* @param lang_freq_map frequency map of languages (unordered_map), e.g.:
* lang_freq_map["en"] -> 42
* @param hashtag_freq_map frequency map of hashtags (unordered_map), e.g.:
* hashtag_freq_map["#hashtag"] -> 43
*/
void process_line(const string& line,
unordered_map<string, unsigned long>& lang_freq_map,
unordered_map<string, unsigned long>& hashtag_freq_map) {
try {
// Parse into JSON DOM
Document d;
d.Parse(line.c_str());
// Extract hash tags
set<string> unique_hashtags;
smatch matched_strings;
// Extract hash tags from tweet text
string tweet_text;
tweet_text = d["doc"]["text"].GetString();
regex_search(tweet_text, matched_strings, pattern_hashtag);
for (auto matched : matched_strings) {
if (matched.length()) {
string matched_lower = to_lower(matched);
unique_hashtags.insert(matched_lower);
}
}
// Extract hash tags from doc->entities->hashtags
const Value& hashtags = d["doc"]["entities"]["hashtags"];
assert(hashtags.IsArray());
for (auto& v : hashtags.GetArray()) {
string hashtag = "#";
hashtag.append(v["text"].GetString());
regex_search(hashtag, matched_strings, pattern_hashtag);
for (auto filtered : matched_strings) {
if (filtered.length() == hashtag.length()) {
string filtered_lower = to_lower(filtered);
unique_hashtags.insert(filtered_lower);
}
}
}
// Count freq
for (const auto& unique_hashtag : unique_hashtags) {
// Whether contains key
if (hashtag_freq_map.end() !=
hashtag_freq_map.find(unique_hashtag)) {
hashtag_freq_map[unique_hashtag] += 1;
} else {
hashtag_freq_map[unique_hashtag] = 1;
}
}
// Extract language
string lang = d["doc"]["lang"].GetString();
if (lang_freq_map.find(lang) != lang_freq_map.end()) {
lang_freq_map[lang] += 1;
} else {
lang_freq_map[lang] = 1;
}
} catch (const std::regex_error& e) {
std::cout << "regex_error caught: " << e.what() << std::endl;
if (e.code() == std::regex_constants::error_brack) {
std::cout << "The code was error_brack" << std::endl;
}
}
};
/**
 * Produce a lower-case copy of a string.
 * Only the ASCII letters 'A'..'Z' are converted; every other character
 * (digits, punctuation, '#', '_', non-ASCII bytes) is left untouched.
 * @param in input string (string), e.g.: "#ANice_Day"
 * @return lower case equivalent of input string (string), e.g.: "#anice_day"
 */
string to_lower(string in) {
    // Distance between an upper-case ASCII letter and its lower-case form
    constexpr char case_offset = 'a' - 'A';
    for (string::size_type pos = 0; pos < in.size(); ++pos) {
        const char ch = in[pos];
        const bool is_upper = !(ch < 'A' || ch > 'Z');
        if (is_upper) {
            in[pos] = static_cast<char>(ch + case_offset);
        }
    }
    return in;
}