# redditScraperDumby.py
import requests
import string
import csv
import time
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

start_time = time.time()

#Opens CSV with subreddit names and search words and sorts them into subs and words respectively
with open('request.csv', newline='') as f:
    search = list(csv.reader(f))
subs = []
words = []
for i in range(len(search)):
    subs.append(search[i][0])
    words.append(search[i][1])

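# Assumed layout of request.csv, inferred from how it is parsed above (the
# subreddit paths and search phrases below are illustrative, not from the repo):
#   r/buildapc,graphics card
#   r/buildapc,power supply
#   r/laptops,battery life
# Column 0 is the subreddit path (with the leading "r/"), column 1 is the search phrase.
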
def listShorten(inputList):
    '''returns inputList with empty strings removed'''
    newlist = []
    for i in range(len(inputList)):
        if inputList[i] != '':
            newlist.append(inputList[i])
    return newlist

def generateURL(subredditName, keywords):
    '''generates search urls for each sub and each search word in request.csv'''
    sort = 'new'  # options are relevance, hot, top, new, comments (most comments)
    SEARCHCSV = []
    SEARCH = []
    subs = listShorten(subredditName)
    words = listShorten(keywords)
    formatted = []
    for i in range(len(words)):
        formatted = formatted + [words[i].replace(" ", "%20")]
    #print(formatted)
    for i in range(len(subs)):
        for k in range(len(words)):
            url = 'https://www.reddit.com/' + subs[i] + '/search/?q=' + formatted[k] + '&restrict_sr=1&sort=' + sort
            SEARCHCSV = SEARCHCSV + [[url]]
            SEARCH = SEARCH + [url]
    exportSearchCSV(SEARCHCSV)
    return SEARCH

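# Example of a URL produced by generateURL (the subreddit and keyword are
# hypothetical, shown only to illustrate the string construction above):
#   https://www.reddit.com/r/buildapc/search/?q=graphics%20card&restrict_sr=1&sort=new
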
def exportSearchCSV(urls):
    '''writes the generated search URLs to searches.csv'''
    rows = urls
    # write the rows using numpy's savetxt
    np.savetxt("searches.csv",
               rows,
               delimiter=", ",
               fmt='%s')

def scrapeResults(urlList, itemTargetCount):
    '''returns up to itemTargetCount post URLs from each search-results page in urlList'''
    urlsCSV = []
    urls = []
    # the obfuscated class name is specific to Reddit's frontend at the time of writing and may need updating
    resultXPath = "//div[@class='_2i5O0KNpb9tDq0bsNOZB_Q']/div/div/a/div/h3"
    for i in range(len(urlList)):
        # instantiate options
        options = webdriver.ChromeOptions()
        # run browser in headless mode (options.headless is deprecated in newer Selenium)
        options.add_argument('--headless=new')
        driver = webdriver.Chrome(service=ChromeService(
            ChromeDriverManager().install()), options=options)
        # get the entire website content
        driver.get(urlList[i])
        #with open('comm.txt', 'w', encoding='utf-8') as f:
        #    f.write(driver.page_source)
        items = []
        # instantiate height of webpage
        last_height = driver.execute_script('return document.body.scrollHeight')
        # scroll down until enough results have loaded or the page stops growing
        while itemTargetCount > len(items):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(1)
            items = driver.find_elements(By.XPATH, resultXPath)
            new_height = driver.execute_script('return document.body.scrollHeight')
            if new_height == last_height:
                break
            last_height = new_height
        # select result titles by XPath
        elements = driver.find_elements(By.XPATH, resultXPath)
        h3_texts = [element.text for element in elements]
        items = h3_texts[0:itemTargetCount]
        for k in range(len(items)):
            element = driver.find_element(By.LINK_TEXT, items[k])
            urlsCSV = urlsCSV + [[element.get_attribute('href')]]
            urls = urls + [element.get_attribute('href')]
        driver.quit()
    #removes duplicates
    #urlsCSV = list(set(urlsCSV))
    urls = list(set(urls))
    print(len(urls))
    print(urls)
    #exportResCSV(urlsCSV)
    return urls

def exportResCSV(inputurls):
    '''writes the scraped post URLs to redditResults.csv'''
    rows = inputurls
    # write the rows using numpy's savetxt
    np.savetxt("redditResults.csv",
               rows,
               delimiter=", ",
               fmt='%s')

#reads csv and creates list of links
#with open('redditResults.csv', newline='') as f:
#    redditResults = list(csv.reader(f))
#URLS = []
#for i in range(len(redditResults)):
#    URLS.append(redditResults[i][0])

def scrapePost(url):
    '''outputs the title and post text from url as a list of strings with only English characters and Arabic numerals'''
    #Pulls page HTML
    page = requests.get(url)
    #creates soup object
    soupPage = BeautifulSoup(page.content, "html.parser")
    #extracting elements
    postTitleHTML = soupPage.find(slot="title")
    postTextHTML = soupPage.find(slot="text-body")
    #ensures title and text have text; uses a placeholder if neither is present
    if postTitleHTML is None and postTextHTML is None:
        allText = 'postcontentwasnottext'
    elif postTitleHTML is None:
        allText = postTextHTML.text.strip()
    elif postTextHTML is None:
        allText = postTitleHTML.text.strip()
    else:
        postTitle = postTitleHTML.text.strip()
        postText = postTextHTML.text.strip()
        allText = postTitle + " " + postText[:-10]  # removes "Read more" and combines title and text
    return textCleaner(allText)

#broken rn
def scrapeComments(url):
    '''outputs all comment text from url as a list of strings with only English characters and Arabic numerals'''
    # instantiate options
    options = webdriver.ChromeOptions()
    # run browser in headless mode (options.headless is deprecated in newer Selenium)
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(service=ChromeService(
        ChromeDriverManager().install()), options=options)
    # get the entire website content
    driver.get(url)
    #with open('readme.txt', 'w', encoding='utf-8') as f:
    #    f.write(driver.page_source)
    comments = []
    # scroll to bottom of webpage
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)
    # select comment bodies by id
    elements = driver.find_elements(By.ID, "-post-rtjson-content")
    comments = [element.text for element in elements]
    driver.quit()
    cleanComments = []
    for i in range(len(comments)):
        cleanComments = cleanComments + textCleaner(comments[i])
    print(cleanComments)
    return cleanComments

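# Illustrative call (the thread URL is hypothetical; the output is a flat list
# of cleaned, lowercased words from all comments found on the page):
#   scrapeComments('https://www.reddit.com/r/buildapc/comments/abc123/example_post/')
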
#possible improvement to account for contractions
#with open('contractions.csv', newline='') as c:
#    contractions = list(csv.reader(c))
def textCleaner(inputString):
    '''returns list of one-word strings without any extra spaces, line breaks, or special characters.'''
    #remove punctuation and convert to all lowercase
    noPunc = inputString.translate(str.maketrans('', '', string.punctuation)).lower()
    #removes extra spaces and line breaks (two passes)
    res = ""
    res2 = ""
    for i in range(len(noPunc)):
        if (i > 0 and noPunc[i] == " " and noPunc[i-1] == " ") or ord(noPunc[i]) == 10:
            pass
        else:
            res += noPunc[i]
    for i in range(len(res)):
        if (i > 0 and res[i] == " " and res[i-1] == " ") or ord(res[i]) == 10:
            pass
        else:
            res2 += res[i]
    #remove emojis/special characters
    wordList = makeList(res2)
    for i in range(len(wordList)):
        if not wordList[i].isalnum():
            newWord = ""
            for k in range(len(wordList[i])):
                if wordList[i][k].isalnum():
                    newWord = newWord + wordList[i][k]
            wordList[i] = newWord
    return wordList

def makeList(inputString):
    '''splits a string on spaces into a list of words'''
    return list(inputString.split(" "))

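# Illustrative example of textCleaner's behavior (input chosen for illustration):
#   textCleaner("Hello,  World!") -> ['hello', 'world']
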
count = {}  # {word: frequency}
def counter(url):
    '''Stores the frequency of every word in the main post and its comments in the dictionary count'''
    comments = scrapeComments(url)
    tries = 1
    maxtries = 5
    # retry the comment scrape a few times if it comes back empty
    while comments == [] and tries <= maxtries:
        comments = scrapeComments(url)
        tries = tries + 1
    allWords = scrapePost(url) + comments
    for i in range(len(allWords)):
        if allWords[i] in count:  # if this word has already been encountered, add one to its dictionary value
            count[allWords[i]] = count[allWords[i]] + 1
        else:  # if this is the first time this word has been encountered, create a dictionary item with the word as key and a value of one
            count[allWords[i]] = 1

def countAllPages(urlList):
    '''Iterates counter on all URLs in urlList'''
    for i in range(len(urlList)):
        counter(urlList[i])

def filterDict():
    '''filters the dictionary to only include desired keywords (not yet implemented)'''
    pass

def exportCSV(wordCounts):
    '''exports the word-frequency dictionary as a CSV'''
    with open('wordFrequency.csv', 'w', newline='', encoding='utf-8') as csvfile:
        header_key = ['word', 'freq']
        new_val = csv.DictWriter(csvfile, fieldnames=header_key)
        new_val.writeheader()
        for new_k in wordCounts:
            new_val.writerow({'word': new_k, 'freq': wordCounts[new_k]})

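# Expected shape of wordFrequency.csv (the words and counts below are illustrative):
#   word,freq
#   battery,42
#   laptop,17
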
def main():
    #creates list of search URLs
    searchURLS = generateURL(subs, words)
    #creates a list with up to numResults posts per search URL
    numResults = 10
    URLS = scrapeResults(searchURLS, numResults)
    countAllPages(URLS)
    #filterDict(count)
    exportCSV(count)
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()