linkfinder.py
'''This script looks through itsumi's logs and returns the last few URLs posted
and their respective titles encoded in JSON.
'''
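# Example invocation (the log path below is illustrative, not part of this repo):
#
#   python3 linkfinder.py /path/to/itsumi.log 3
#
# This prints a JSON array to stdout with the most recently posted link first,
# e.g. [{"url": "https://example.com/", "title": "Example Domain"}, ...].
# The log is expected to be CSV with the chat message text in the third column;
# links whose titles cannot be parsed get a null title.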
import sys
import csv
import re
import json
import requests
if len(sys.argv) != 3:
    print('ERROR: Wrong number of arguments.\nUsage: python3 linkfinder.py <PATH TO LOG FILE> <NUMBER OF LINKS TO GET>', file=sys.stderr)
    sys.exit(1)

try:
    if int(sys.argv[2]) < 1:
        raise ValueError
except ValueError:
    print('ERROR: "{0}" is not a valid number of links to fetch.'.format(sys.argv[2]), file=sys.stderr)
    sys.exit(1)
try:
    with open(sys.argv[1], newline='', encoding='iso-8859-1') as file:
        # Match http/https/ftp URLs anywhere in a message.
        url_regex = re.compile(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
        reader = csv.reader(file, delimiter=',', escapechar='\\', quoting=csv.QUOTE_MINIMAL)
        links = []
        for row in reader:
            link = url_regex.search(row[2])  # row[2] = chat message content
            if link:
                links.append(link.group(0))

    # Extract the title from the last X links.
    parsed_links = []
    count = 0
    for link in reversed(links):
        if count == int(sys.argv[2]):
            break
        # Request raw HTML data.
        try:
            html = requests.get(link, timeout=10).text
        except requests.RequestException:
            print('WARNING: Failed to load {0}'.format(link), file=sys.stderr)
            continue
        # Look for the title. A regex like re.search('<title>.+</title>', html)
        # breaks when the title contains \n or \t, so find the tags manually.
        left = html.find('<title>')
        right = html.find('</title>')
        if left == -1 or right == -1:
            link_title = None
            print('WARNING: Failed to extract title from {0}'.format(link), file=sys.stderr)
        else:
            link_title = html[left + 7:right].strip()
        parsed_links.append({
            'url': link,
            'title': link_title
        })
        count += 1

    # Encode into JSON and print to stdout.
    parsed_links.reverse()  # last link posted will be in position [0]
    print(json.dumps(parsed_links))
except FileNotFoundError as ex:
    print('ERROR: Failed to open {0}\nCaught exception: {1}'.format(sys.argv[1], str(ex)), file=sys.stderr)
    sys.exit(1)