-
Notifications
You must be signed in to change notification settings - Fork 1
/
search.py
202 lines (180 loc) · 7.2 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""
Takes a search query and asks a search engine (Google by default, or
DuckDuckGo) for the top results. Each of those result pages is then
searched for occurrences of every artist's name, and a running count is
kept of how many times each artist is mentioned across all of the pages.
"""
import json, urllib, sys, re, socket, csv, time, argparse, http.client
from urllib import request
from bs4 import BeautifulSoup
# ANSI escape sequences used to colour/style text printed to the terminal.
# Prefix a message with one of these and finish it with ENDC to reset.
HEADER = "\033[95m"     # bright magenta
OKBLUE = "\033[94m"     # bright blue
OKGREEN = "\033[92m"    # bright green
WARNING = "\033[93m"    # bright yellow
FAIL = "\033[91m"       # bright red
ENDC = "\033[0m"        # reset all attributes
BOLD = "\033[1m"        # bold
UNDERLINE = "\033[4m"   # underline
def formatSeconds(sec):
    """ Renders a duration in seconds as text, e.g. "1 hour 2 minutes 5 seconds"

    Hours are left out when the duration is under an hour, and minutes are
    left out as well when it is under a minute.

    Keyword arguments:
    sec -- The duration in seconds
    """
    # Split the duration into whole hours, minutes and left-over seconds
    totalMinutes, seconds = divmod(sec, 60)
    hours, minutes = divmod(totalMinutes, 60)

    # Collect the pieces from the largest unit down, skipping leading units
    # that would only ever read as zero
    pieces = []
    if hours >= 1.0:
        pieces.append("%d %s" % (hours, addS("hour", "hours", hours)))
    if hours >= 1.0 or minutes >= 1.0:
        pieces.append("%d %s" % (minutes, addS("minute", "minutes", minutes)))
    pieces.append("%d %s" % (seconds, addS("second", "seconds", seconds)))

    return " ".join(pieces)
def addS(sing, plur, aNum):
    """ Picks the singular or the plural form of a word for a given number
    Ex: 1 minute, 2 minutes, 0 minutes, etc.

    Keyword arguments:
    sing -- The singular version of the word
    plur -- The plural version of the word
    aNum -- The number that determines which version of word to use
    """
    # Anything in [1, 2) is displayed as "1" once truncated, so it is singular
    isSingular = 1.0 <= aNum < 2.0
    return sing if isSingular else plur
def getUrls(query, engine="google", startValue=0, verbose=False):
    """ Gets the result urls a search engine returns for a query

    Any failure while requesting or parsing the results is swallowed on
    purpose, so that one bad search cannot abort a long run; an empty list
    is returned in that case.

    Keyword arguments:
    query -- The search query as plain text (it is url-encoded here)
    engine -- Which search engine to use, "google" or "duckduckgo" (default: "google")
    startValue -- Used only for google. Determines what starting result google returns (default: 0)
    verbose -- Do you want verbose output or not? (default: False)

    Returns a list of url strings; empty on failure or for an unknown engine.
    """
    from urllib.parse import quote_plus

    # Bound before the try block so the failure message below can always be
    # built, even when the error happens before a url has been assembled.
    url = None
    try:
        # quote_plus still turns spaces into "+", and additionally escapes
        # characters ("&", "#", non-ASCII letters, ...) that would otherwise
        # corrupt the query string or make urlopen reject the url.
        encodedQuery = quote_plus(query)

        if engine == "google":
            # Google Search API (Deprecated, we must pray that it works)
            url = ("http://ajax.googleapis.com/ajax/services/search/web"
                   "?v=1.0&rsz=large&q=" + encodedQuery
                   + "&start=" + str(startValue))
            if verbose:
                print("Search query: ", url)
            # Pause before every request, presumably to avoid being throttled.
            # NOTE(review): treated as unconditional -- confirm it was not
            # meant to run only in verbose mode.
            time.sleep(5)
            # The with-block makes sure the connection is closed again
            with urllib.request.urlopen(url) as response:
                charset = response.info().get_param('charset') or 'utf-8'
                rJson = json.loads(response.read().decode(charset))
            return [result["unescapedUrl"]
                    for result in rJson["responseData"]["results"]]

        if engine == "duckduckgo":
            # DuckDuckGo has no JSON endpoint here, so scrape the html page
            url = "http://duckduckgo.com/html/?q=" + encodedQuery
            if verbose:
                print("Search query: ", url)
            with urllib.request.urlopen(url) as response:
                parsed = BeautifulSoup(response.read(), "html.parser")
            urls = []
            for block in parsed.findAll('div', {'class': re.compile('links_main*')}):
                # Skip result blocks without a usable link instead of letting
                # one odd block throw away every result of the page
                href = block.a.get('href') if block.a is not None else None
                if href is not None:
                    urls.append(href)
            return urls

        print("Invalid engine. Quitting...")
        return []
    except (KeyboardInterrupt, SystemExit):  # Let the program quit
        raise
    # But catch everything else so the program doesn't randomly die before
    # it has finished.
    except Exception:
        if verbose:
            failedTarget = url if url is not None else repr(query)
            print(FAIL + "Couldn't get: " + ENDC + failedTarget)
        return []
def getPageText(url, verbose=False, timeout=10):
    """ Gets the text from the page and returns it as a string

    Any failure while downloading or parsing the page is swallowed on
    purpose, so that one bad page cannot abort a long run; a single space
    is returned in that case.

    Keyword Arguments:
    url -- The url of the page
    verbose -- Do you want it verbose? (default: False)
    timeout -- How long should we wait to try and get the page (default: 10)
    """
    try:
        if verbose:
            # Keep long urls from wrapping the terminal line
            printurl = (url[:72] + '...') if len(url) > 75 else url
            print("Downloading... " + printurl)
        # The with-block makes sure the connection is closed again, even
        # when reading the body fails half way through
        with urllib.request.urlopen(url, timeout=timeout) as response:
            html = response.read()
        return getVisibleText(html)
    except (KeyboardInterrupt, SystemExit):  # Let the program quit
        raise
    # But catch everything else so the program doesn't randomly die before
    # it has finished.
    except Exception:
        if verbose:
            # str() so that a non-string url cannot make the handler itself fail
            print(FAIL + "Couldn't get: " + ENDC + str(url))
        return " "
def getVisibleText(readHTML):
    """ Strips the markup from a page's html and returns the text that is left

    Keyword Arguments:
    readHTML -- The html to search

    NOTE(review): every text node the parser finds is kept; nothing in here
    filters out nodes that a browser would not display -- confirm that is
    acceptable for the callers.
    """
    parsedPage = BeautifulSoup(readHTML, "html.parser")
    # Glue all text nodes together in document order
    return "".join(str(node) for node in parsedPage.findAll(text=True))
def printOccurrences(counter, artists):
    """ Nicely prints the artists and the corresponding number of occurrences

    Artists are grouped by how often they occurred, the most frequent group
    first; artists that never occurred are not listed. "No results" is
    printed when there is nothing to list (including an empty counter).

    Keyword Arguments:
    counter -- The counter array of occurrences
    artists -- The artists array corresponding to number of occurrences
    """
    # Group the artist indexes by their occurrence count in a single pass,
    # instead of rescanning the whole counter for every possible count
    indexesByCount = {}
    for index, count in enumerate(counter):
        if count > 0:
            indexesByCount.setdefault(count, []).append(index)

    if not indexesByCount:
        print("No results")
        return

    # Highest count first; within a group the original artist order is kept
    for count in sorted(indexesByCount, reverse=True):
        print("Occurrences: ", count)
        for index in indexesByCount[count]:
            print(" - Artist: ", artists[index])
def resultsToCsv(counter, artists, query, plantString, resultsFile, minOccurrences=3):
    """ Writes the results for one plant as a single row to a CSV file

    The row holds the plant, then the query combined with the top artist,
    then every artist that occurred more than minOccurrences times as
    "name (count)", most frequent first. When no artist passes that
    threshold (or the counter is empty) the row is the plant followed by
    "No results". Every field, including the last one, is followed by ", ".

    NOTE(review): fields are written as they are, without any CSV quoting,
    so a name containing a comma will spill into the next column.

    Keyword Arguments:
    counter -- The counter array of occurrences
    artists -- The artists array corresponding to number of occurrences
    query -- The query that was originally searched for the plant
    plantString -- The plant that was searched
    resultsFile -- An csv file that has already been opened
    minOccurrences -- Artists need more occurrences than this to be listed (default: 3)
    """
    # (count, index) of every artist above the threshold, highest count
    # first. The sort is stable, so artists with the same count keep their
    # original order.
    ranked = sorted(
        ((count, index) for index, count in enumerate(counter)
         if count > minOccurrences),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # Format the data
    results = [plantString]
    if not ranked:
        results.append("No results")
    else:
        # The follow-up query combines the original query with the top artist
        results.append(query + " " + artists[ranked[0][1]])
        for count, index in ranked:
            results.append(artists[index] + " (" + str(count) + ")")

    # Write data to file in a new row in csv format
    resultsFile.write("".join(field + ", " for field in results) + "\n")