-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_extraction.py
187 lines (153 loc) · 5.52 KB
/
data_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#%%
import re
import urllib.request
import time
import math
from fuzzywuzzy import process
import logging
logger = logging.getLogger(__name__)
###### CONSTANTS
# SLEEP_TIME time is added for politeness policy while crawling; do not reduce.
SLEEP_TIME = 1.1
# Maximum number of characters kept from extracted title/author strings.
STRING_CLIP = 50
# Tiny sentinel score returned when keyword matching fails (see get_keywords_score).
EPSILON = 1e-9
# Minimum fuzzywuzzy score (0-100) for a keyword to count as a match.
KEYWORD_MATCH_THRESHOLD = 80
# REGEX PATTERNS
# Likes accessibility text embedded in the YouTube page source,
# e.g. '"accessibilityText":"1.2K likes"}'.
LIKES_REGEX = re.compile(r'"accessibilityText":"[\d,.KMB]+ likes"}')
# View-count JSON fragment, e.g. '"allowRatings":true,"viewCount":"12345","author"'.
VIEWS_REGEX = re.compile(r'"allowRatings":true,"viewCount":"[\d,.KMB]+","author"')
# Bare digit runs vs. a digit run followed by an uppercase suffix (K/M/B).
DIGIT_WITHOUT_CHAR = re.compile(r'\d+')
DIGIT_PATTERN = re.compile(r'\d+[A-Z]')
# list of stop words
# NOTE(review): open() relies on the platform default encoding — consider
# encoding="utf-8"; also assumes stop_words.txt sits in the working directory.
with open("stop_words.txt", "r") as f:
    stop_words = f.readlines()
stop_words_list = [x.strip('\n') for x in stop_words]
###### helper functions
def process_keywords(strng):
    """Split a comma-separated keyword string into filtered lowercase words.

    Args:
        strng (str): raw comma-separated keyword string (e.g. the content of
            a page's keywords meta tag).

    Returns:
        list: unique keywords longer than three characters, split on spaces
        for multi-word entries, with stop words removed.
    """
    collected = []
    for entry in strng.lower().split(','):
        entry = entry.strip()
        # Skip very short entries entirely (guard clause).
        if len(entry) <= 3:
            continue
        if ' ' in entry:
            collected.extend(entry.split(' '))
        else:
            collected.append(entry)
    # Set difference removes stop words and deduplicates in one step.
    return list(set(collected) - set(stop_words_list))
def get_keywords_score(seed_keywords, match_keywords):
    """Extract a keyword match score.

    Args:
        seed_keywords (listOfStrings): list of processed keywords from the SeedURL
        match_keywords (listOfStrings): list of processed keywords from a crawled url

    Returns:
        keyword_score: 10 to the power of the number of seed keywords whose
            best fuzzywuzzy match in match_keywords scores at least
            KEYWORD_MATCH_THRESHOLD.
        EPSILON: if match_keywords is empty or no match crosses
            KEYWORD_MATCH_THRESHOLD; this has the effect of increasing the
            final score.
    """
    # Guard clause: nothing to match against.
    if not match_keywords:
        return EPSILON
    # Count seed keywords with a sufficiently close fuzzy match.
    # (extractOne returns a (best_match, score) pair; only the score matters.)
    exponent = 0
    for seed_word in seed_keywords:
        _, match_score = process.extractOne(seed_word, match_keywords)
        if match_score >= KEYWORD_MATCH_THRESHOLD:
            exponent += 1
    return 10 ** exponent if exponent > 0 else EPSILON
def extract_from_regex(regex_pattern, strng):
    """Return the first match of regex_pattern in strng, commas removed.

    Args:
        regex_pattern: a compiled regular expression.
        strng (str): text to search.

    Returns:
        str: the matched substring with all commas stripped, or the sentinel
        string "No match found." when there is no match (callers compare
        against this sentinel literally).
    """
    found = regex_pattern.search(strng)
    if found is None:
        return "No match found."
    return found.group(0).replace(",", "")
def convert_to_integer(number_str):
    """Convert a numeric string with an optional K/M/B suffix to an int.

    courtesy #chatGPT

    Args:
        number_str (str): e.g. "123", "1.5", "1.5K", "2M", "3B".

    Returns:
        int: the value scaled by the suffix multiplier (truncated toward zero).
    """
    scales = {
        'K': 1000,
        'M': 1000000,
        'B': 1000000000
    }
    suffix = number_str[-1]
    if suffix in scales:
        return int(float(number_str[:-1]) * scales[suffix])
    return int(float(number_str))
def extract_integers(strng):
    """Parse an integer count out of a comma-stripped YouTube fragment.

    Handles strings like '14 likes', '1.5K likes', '"viewCount":"12345"':
    the decimal point (already split into two digit runs by the regexes)
    is reconstructed, then the K/M/B suffix (if any) is re-attached before
    delegating to convert_to_integer.

    Args:
        strng (str): text containing exactly one number, optionally with a
            decimal point and an uppercase K/M/B suffix. Assumes at least
            one digit run is present — raises IndexError otherwise (callers
            rely on get_data's try/except for that case).

    Returns:
        int: the parsed count.
    """
    # Digit run immediately followed by an uppercase letter (e.g. '5K' in '1.5K').
    number_with_char = DIGIT_PATTERN.findall(strng)
    # All bare digit runs; a decimal like '1.5' shows up as ['1', '5'].
    just_number = DIGIT_WITHOUT_CHAR.findall(strng)
    if len(just_number) == 2:
        # Two runs means the number had a decimal point — rejoin around '.'.
        number_str = ".".join(just_number)
    else:
        number_str = just_number[0]
    if number_with_char:
        # Re-attach the magnitude suffix (last char of e.g. '5K' -> 'K').
        number_str += number_with_char[0][-1]
    return convert_to_integer(number_str)
def get_data(link):
    '''
    Given a link to youtube video, extracts views, likes,
    and other info (title, author, keywords) to return a row of data as dict.

    Args:
        link (str): URL of the video page to fetch.

    Returns:
        dict: keys title, link, final_score, author, views, likes, keywords.
        Fields keep their defaults when extraction fails or the video is
        filtered out (likes string missing, or likes < 100).
    '''
    row = {
        "title": 'NA',
        "link": link,
        "final_score": float('inf'),
        "author": 'NA',
        "views": 0,
        "likes": 0,
        "keywords": "NA"
    }
    time.sleep(SLEEP_TIME)  # politeness delay between requests; do not reduce
    try:
        with urllib.request.urlopen(link) as url:
            logger.info(f"Link = {link}")
            theSite = str(url.read())
        # get title (first <title> tag; punctuation collapsed, then clipped)
        title = re.findall('''<title>(.+?)</title>''', theSite, re.DOTALL)[0]
        row["title"] = re.sub(r'\W+', ' ', title)[:STRING_CLIP]
        # get author — run findall once instead of twice as before
        authors = re.findall('''"author":"(.+?)"''', theSite, re.DOTALL)
        if authors:
            row["author"] = re.sub(r'\W+', ' ', authors[0])[:STRING_CLIP]
        # get likes; bail out early when absent or below the popularity floor
        likes_strng = extract_from_regex(LIKES_REGEX, theSite)
        if likes_strng == "No match found.":
            logger.info("likes string not found")
            return row
        likes = extract_integers(likes_strng)
        if likes < 100:
            logger.info("likes less than hundred")
            return row
        logger.info(f"Likes = {likes}")
        # get views
        views_strng = extract_from_regex(VIEWS_REGEX, theSite)
        if views_strng == "No match found.":
            logger.info("views string not found")
            return row
        views = extract_integers(views_strng)
        logger.info(f"Views = {views}")
        # get keywords
        keywords = re.findall('''<meta name="keywords" content="(.+?)"><''', theSite, re.DOTALL)[0]
        if keywords:
            row["keywords"] = process_keywords(keywords)
        score = views / likes
        # dividing further by log10 of likes to prioritize higher number of likes
        row["final_score"] = score / (math.log10(likes + 10))  # adding 10 to avoid divide by 0
        row["views"] = views
        row["likes"] = likes
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; logger.exception records the traceback for debugging.
        logger.exception(f"Issue extracting data for {link}")
    return row
#%%
########## testing
# print(get_data("https://youtu.be/CVU1Mv9e-0U"))
# # %%
# x = "\"accessibilityText\":\"14 likes\"}"
# # %%
# extract_integers(x)
# %%