# --------------------------------------------------
# twitter_util.py
#
# Utility functions for working with the twitter dataset.
# --------------------------------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import json
import string, nltk, os, re
# Project-wide constants, file paths, etc.
import settings
# You might get an error from nltk about missing data.
# It can be resolved by running the following lines once:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
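# A minimal sketch of checking for these resources programmatically instead
# of downloading unconditionally (the paths below are the standard NLTK
# data locations):
#
#   for resource, path in [("stopwords", "corpora/stopwords"),
#                          ("punkt", "tokenizers/punkt"),
#                          ("wordnet", "corpora/wordnet")]:
#       try:
#           nltk.data.find(path)
#       except LookupError:
#           nltk.download(resource)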
# Read the stop list from stop_list.txt.
with open(settings.STOP_LIST, 'r') as fobj:
    stop_list = set(l.strip() for l in fobj)
if settings.DEBUG:
    print(f"Loaded stop list: {sorted(stop_list)}")
def download_twitter(path='./TwitterDataset'):
    '''
    Downloads the twitter dataset by cloning the git repository:
        https://github.com/VeronicaSalm/TwitterDataset

    Arguments:
        - path (string): an absolute or relative path to the directory where the
                         Twitter repository should be cloned, defaults to
                         './TwitterDataset' in the current directory
    '''
    os.system(f"git clone https://github.com/VeronicaSalm/TwitterDataset {path}")
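# Example usage (a sketch; requires git on the PATH, and the target
# directory should not already contain a clone):
#   download_twitter(path="./TwitterDataset")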
def load_data_twitter(twitter_dir):
    '''
    Load in the Twitter data from the jsonl files.

    Note that since the Twitter data is unlabelled, None is used as
    the label for every item.

    Arguments:
        - twitter_dir (string): the path to a directory containing the twitter jsonl files

    Return Values:
        - train (list): List of training examples (tweets) with dummy labels.
        - test (list): Empty list, as there is no test data for the Twitter dataset.
        - classes (list): Empty list, as there are no class labels.
    '''
    if settings.DEBUG: print('Loading in the twitter dataset.')

    if twitter_dir is None:
        raise Exception("Must specify the path to the twitter directory.")

    # First, make sure the directory exists on the system.
    if not os.path.isdir(twitter_dir):
        raise Exception('Cannot load in training data, files do not exist.')

    classes, train, test = [], [], []

    # Read in the training data, one tweet per line.
    for f in sorted(os.listdir(twitter_dir)):
        fpath = os.path.join(twitter_dir, f)
        if settings.DEBUG: print(f"Loading {fpath}")
        with open(fpath, "r") as json_file:
            for line in json_file:
                d = json.loads(line)
                tweetID = d["id"]
                text = d["full_text"]
                date = d["created_at"]
                # Store the tweetID and date in case we need them later.
                train.append([None, text, tweetID, date])
    return (train, test, classes)
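# Example usage (a sketch; assumes the dataset was cloned to ./TwitterDataset
# and that the directory contains only tweet jsonl files):
#   train, test, classes = load_data_twitter("./TwitterDataset")
#   # Each train entry is [None, full_text, tweetID, created_at];
#   # test and classes are both [].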
def tokenize_twitter(text, remove_stopwords=True):
    '''
    Tokenize a given text. Also removes URLs and
    optionally removes stop words.

    Arguments:
        - text (string): The text to tokenize.
        - remove_stopwords (boolean): Whether stop words should be removed.

    Return Values:
        - (list): The tokenized text.
    '''
    # First, use a simple regex to remove URLs, then tokenize the text.
    # We remove URLs here as it would be more difficult to do so after
    # normalization.
    text = re.sub(r"http\S+", "", text)

    # Remove all punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)

    # Filter out stop words if needed.
    if remove_stopwords:
        s_words = set(s.lower() for s in stopwords.words('english'))
        tokens = [t for t in tokens if t.lower() not in s_words]
    return tokens
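# Example (a sketch; the exact output depends on the NLTK tokenizer and
# stop word list versions):
#   tokenize_twitter("Check this out! https://t.co/abc #nlp")
#   # -> ['Check', 'nlp']  (URL, punctuation, and stop words removed)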
def normalize_twitter(tokens):
    '''
    Normalize a list of tokens.

    Performs case folding and lemmatization, and removes punctuation,
    stop-list words, and emojis (non-ASCII tokens). Hashtags are preserved
    without the '#' character, and likewise mentions without the '@'
    character.

    Arguments:
        - tokens (list): The tokens to normalize.

    Return Values:
        - (list): The normalized tokens.

    Notes:
        - This function keeps duplicates and numbers. We may or may not want to
          change this.
    '''
    lemmatizer = WordNetLemmatizer()
    valid = [token for token in tokens if token.lower() not in stop_list]
    # Lemmatize the case-folded tokens, dropping punctuation and tokens that
    # are empty once non-ASCII characters (e.g. emojis) are stripped.
    return [lemmatizer.lemmatize(token.lower())
            for token in valid
            if (token not in string.punctuation)
            and (token.encode("ascii", "ignore").decode())]
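if __name__ == "__main__":
    # A minimal end-to-end sketch of the pipeline on a hard-coded tweet;
    # the sample text is illustrative only.
    sample = "Staying home today! Check https://t.co/example #StayHome @friend"
    tokens = tokenize_twitter(sample)
    print("Tokens:    ", tokens)
    print("Normalized:", normalize_twitter(tokens))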