-
Notifications
You must be signed in to change notification settings - Fork 0
/
flathunter.py
executable file
·96 lines (73 loc) · 2.98 KB
/
flathunter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
import requests
import argparse
import json
import warnings
from os.path import join
import smtplib
import ssl
from collections import namedtuple
# Immutable bundle of the SMTP settings collected from the command line:
# server port and address, the sender ("from") address, and the login
# credentials (user / password).
SmtpConfig = namedtuple(
    'SmtpConfig',
    ['port', 'server', 'email', 'user', 'password'],
)
def main():
    """Command-line entry point: crawl the configured URLs and report changes.

    Parses the command line, loads the URL list, crawls every URL against the
    cache kept in ``db_dir`` and builds a report of the pages whose content
    differs from the cached copy. The report is e-mailed when ``--emails`` is
    given; otherwise it is printed to stdout.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('urls', help='file with urls to crawl')
    parser.add_argument('db_dir', help='directory to store cached data')
    # The two adjacent literals are concatenated, so the first one must end
    # with a space to keep the help text readable.
    parser.add_argument('--emails', help='send an email with changed urls to '
                        'these addresses (separate multiple addresses by '
                        'commas)')
    parser.add_argument('--smtp-port', default=465, type=int,
                        help='port of smtp server')
    parser.add_argument('--smtp-server', default='smtp.gmail.com',
                        help='smtp server address')
    parser.add_argument('--smtp-password', help='smtp server password')
    parser.add_argument('--smtp-user', help='smtp server user')
    parser.add_argument('--from-email', help='from email address')
    args = parser.parse_args()

    urls = load_urls(args.urls)

    # Tolerate spaces around the commas and stray/trailing commas.
    emails = []
    if args.emails:
        emails = [addr.strip() for addr in args.emails.split(',')
                  if addr.strip()]

    # Fall back to the SMTP login name when no sender address is given.
    from_email = args.from_email if args.from_email else args.smtp_user
    smtp_config = SmtpConfig(port=args.smtp_port, server=args.smtp_server,
                             email=from_email, user=args.smtp_user,
                             password=args.smtp_password)

    changed = crawl_urls(urls, args.db_dir)
    report = generate_report(changed)
    if emails:
        send_report(report, emails, smtp_config)
    else:
        print(report)
def load_urls(urls_file):
    """Read the URLs to crawl from a text file containing one URL per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that they do not turn into empty URLs.

    Args:
        urls_file: Path of the text file to read.

    Returns:
        list of str: The non-empty, stripped lines in file order.
    """
    with open(urls_file, 'r') as urls_in:
        stripped = (line.strip() for line in urls_in)
        return [url for url in stripped if url]
def crawl_urls(urls, db_dir, timeout=30):
    """Fetch every URL and collect those whose content differs from the cache.

    The cache is loaded from ``db_dir``, updated with the new content of
    every changed page, and written back before returning.

    Args:
        urls: Iterable of URL strings to fetch.
        db_dir: Directory that holds the cache file.
        timeout: Seconds to wait for each request before giving up. Without
            a timeout a stalled server would block the crawl indefinitely.

    Returns:
        dict: Maps each changed URL to its newly fetched response text.
    """
    cache = load_cache(db_dir)
    changed = {}
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable or malformed URL must not abort the whole run
            # (and thereby lose the cache updates of the other URLs).
            warnings.warn('could not fetch {}: {}'.format(url, err))
            continue
        body = response.text
        if body != cache.get(url, ''):
            cache[url] = body
            changed[url] = body
    write_cache(cache, db_dir)
    return changed
def load_cache(db_dir):
    """Load the cache dictionary from ``flathunter_cache.json`` in ``db_dir``.

    If the cache file does not exist, a warning is issued and an empty
    dictionary is returned.
    """
    cache_path = join(db_dir, 'flathunter_cache.json')
    try:
        cache_file = open(cache_path, 'r')
    except FileNotFoundError:
        warnings.warn('cache file could not be found. generating new one')
        return {}
    with cache_file:
        return json.load(cache_file)
def write_cache(cache, db_dir):
    """Serialize ``cache`` as JSON to ``flathunter_cache.json`` in ``db_dir``.

    An existing cache file is overwritten.
    """
    target_path = join(db_dir, 'flathunter_cache.json')
    with open(target_path, 'w') as out:
        json.dump(cache, fp=out)
def generate_report(changed):
    """Build the plain-text report of all changed URLs.

    Each entry consists of the URL on one line followed by the page content
    and a blank line.

    Args:
        changed: Mapping of URL to the newly fetched page content.

    Returns:
        str: The concatenated report; an empty string when nothing changed.
    """
    # Return the text instead of printing it: the caller decides whether the
    # report goes to stdout or into an e-mail.
    return ''.join(
        '{}\n{}\n\n'.format(url, content) for url, content in changed.items())
def send_report(report, emails, smtp_config):
    """E-mail the report to every recipient over an SSL-wrapped SMTP session.

    Args:
        report: Text of the report; used as the message body.
        emails: Iterable of recipient addresses. Each recipient gets an
            individual message.
        smtp_config: ``SmtpConfig`` providing ``server``, ``port``, ``user``,
            ``password`` and the sender address ``email``.
    """
    # Imported locally because the file's import block does not provide it.
    from email.message import EmailMessage

    context = ssl.create_default_context()
    # ``context`` must be passed by keyword: the third positional parameter
    # of SMTP_SSL is ``local_hostname``, not the SSL context.
    with smtplib.SMTP_SSL(smtp_config.server, smtp_config.port,
                          context=context) as server:
        server.login(smtp_config.user, smtp_config.password)
        for recipient in emails:
            # A proper MIME message carries the headers and encodes
            # non-ASCII page content; sendmail() with a plain str would try
            # to encode it as ASCII and fail.
            message = EmailMessage()
            message['Subject'] = 'flathunter: changed urls'
            message['From'] = smtp_config.email
            message['To'] = recipient
            message.set_content(report)
            server.send_message(message, from_addr=smtp_config.email,
                                to_addrs=[recipient])
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()