forked from tfiers/kul-machine-learning-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurlStreamHandler.py
109 lines (95 loc) · 3.29 KB
/
urlStreamHandler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python3
# encoding: utf-8
"""
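urlStreamHandler.py

HTTP server that records the URL events it receives as JSON POST requests
(sent by the GreaseMonkey script) in a CSV log file and answers each one
with a list of guesses obtained from ``url_predictor``.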
"""
import sys
import argparse
import json
import http.server
import socketserver
import datetime
import atexit
import signal
import url_predictor
# Each run writes to its own CSV log, named after the start time
# (YYYYmmddHHMMSS).
date = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
filename = "urls_{}.csv".format(date)
# buffering=1 selects line buffering in text mode: every record reaches the
# file as soon as its line is written, so events are not lost when the
# process is killed before the exit hook gets a chance to close the file.
logfile = open(filename, "w", buffering=1)
print('Writing to {}'.format(filename))
# atexit.register returns the function it is given, so the decorator form
# both registers the hook and keeps the name ``at_exit`` bound to it.
@atexit.register
def at_exit():
    """Announce and close the module-level log file at interpreter exit."""
    print("Closing logfile")
    logfile.close()
def do_exit(sig, frame):
    """Signal handler: print a shutdown notice and exit with status 0.

    Args:
        sig: Signal number passed in by the ``signal`` module (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        SystemExit: Always, with exit code 0.
    """
    print("\nShutting down")
    raise SystemExit(0)


# CTRL-C (SIGINT) ends the program through do_exit.
signal.signal(signal.SIGINT, do_exit)
class MyRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Request handler that logs posted URL events and replies with guesses."""

    def do_POST(self):
        """Record one posted event and answer with guesses for its url.

        The GreaseMonkey script sends json data containing the url,
        timestamp, and action (plus extra fields depending on the action).
        All POST requests are captured irrespective of the path.

        A request whose body cannot be read or parsed, or that lacks a
        required field, is answered with a 400 JSON error and is neither
        logged nor passed to the model. Otherwise one row is appended to the
        module-level ``logfile`` and the reply is
        ``{"success": true, "guesses": ...}``.
        """
        try:
            event = self._parse_event(self._read_json())
        except (KeyError, TypeError, ValueError) as err:
            # KeyError: missing field; TypeError: missing Content-Length or a
            # body that is not a JSON object; ValueError: bad length, bad
            # UTF-8 or invalid JSON.
            self._send_json(400, {
                'success': False,
                'error': '{}: {}'.format(type(err).__name__, err),
            })
            return

        ts, action_str, url, target = event
        if target is None:
            print('{:<15}: {}'.format(action_str, url))
            target = ''
        else:
            print('{:<15}: {} -> {}'.format(action_str, url, target))
        print(self._csv_row(ts, action_str, url, target), file=logfile)

        # Call our model for a list of guesses.
        guesses = url_predictor.get_guesses(url)
        print(guesses)
        self._send_json(200, {'success': True, 'guesses': guesses})

    def _read_json(self):
        """Read ``Content-Length`` bytes from the request and decode as JSON."""
        length = int(self.headers['Content-Length'])
        if length < 0:
            # A negative size would make read() wait for end-of-stream.
            raise ValueError('negative Content-Length')
        content = self.rfile.read(length)
        return json.loads(content.decode(encoding='UTF-8'))

    @staticmethod
    def _parse_event(data):
        """Return ``(ts, action_str, url, target)`` for a decoded event.

        ``target`` is ``None`` when the event carries no target. A ``load``
        action is reported as ``'load'`` when ``data['top']`` is truthy and
        as ``'bg'`` otherwise; any ``target`` on a ``load`` is ignored.
        """
        url = data['url']
        ts = data['ts']
        action = data['action']
        if action == 'load':
            return ts, ('load' if data['top'] else 'bg'), url, None
        if 'target' in data:
            return ts, action, url, data['target']
        return ts, action, url, None

    @staticmethod
    def _csv_row(*fields):
        """Format fields as one ``"a", "b", ...`` log row.

        Values are converted with ``str`` and embedded double quotes are
        doubled, so a quote inside a url cannot end its field early.
        """
        return ', '.join(
            '"{}"'.format(str(field).replace('"', '""')) for field in fields
        )

    def _send_json(self, status, payload):
        """Send ``payload`` as a JSON response with the given status code."""
        body = json.dumps(payload).encode("UTF-8")
        self.send_response(status)
        self.send_header("Content-type", "application/json")
        self.send_header("Content-length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
def start_from_csv(filenames):
    """Feed previously recorded url streams to the predictor.

    Args:
        filenames: Iterable of paths to csv files that contain a url stream
            as if it were coming from the GreaseMonkey script.
    """
    for csv_path in filenames:
        with open(csv_path, 'r') as stream:
            # The model is trained incrementally, one file at a time.
            url_predictor.learn_from(stream)
            print('Processing {}'.format(csv_path))
def main(argv=None):
    """Parse the command line, optionally pre-train, then serve requests.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        None. ``serve_forever`` only stops when an exception (such as
        ``SystemExit``) propagates out of it.
    """
    parser = argparse.ArgumentParser(description='Record and suggest urls')
    parser.add_argument('--verbose', '-v', action='count',
                        help='Verbose output')
    # type=int: without it a port given on the command line stays a string
    # and binding the server socket fails.
    parser.add_argument('--port', '-p', type=int, default=8000,
                        help='Server port')
    parser.add_argument('--csv', nargs='*',
                        help='CSV files with a url stream to start from')
    args = parser.parse_args(argv)

    if args.csv is not None:
        start_from_csv(args.csv)

    server = socketserver.TCPServer(("", args.port), MyRequestHandler)
    try:
        print("Serving at port {}".format(args.port))
        print("CTRL-C to exit")
        server.serve_forever()
    finally:
        # Release the listening socket however serving ends.
        server.server_close()
# Run the server only when executed as a script, not when imported.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)