-
Notifications
You must be signed in to change notification settings - Fork 1
/
unused_history_cleanup.py
324 lines (267 loc) · 11.1 KB
/
unused_history_cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/env python
"""
usnused_history_cleanup.py
Author: Simon Gladman 2018 - University of Melbourne.
License: MIT
A script to help administer storage by:
* Send users email if their history hasn't been updated for "threshold 1"
weeks, telling them it will automatically be deleted by
"threshold 2" (some further weeks).
* Mark deleted - histories that haven't been updated for "threshold 2"
weeks, haven't been published or shared, haven't been previously deleted.
It is intended that this script is used in a broader shell script that includes
the pg_cleanup.py script running actual history and associated dataset deletion
and purge.
It needs a config file in yaml format as follows:
---
server_name: Galaxy Australia
smtp_server: localhost
from_addr: [email protected]
response_addr: [email protected]
bcc_addr: ['[email protected]','[email protected]']
info_url: https://galaxy-au-training.github.io/tutorials/modules/galaxy-data/
pg_host: <db_server_dns>
pg_port: <pgsql_port_number>
pg_user: galaxy
pg_dbname: galaxy
pg_password: <password>
warn_weeks: 11
delete_weeks: 13
"""
from __future__ import print_function
import sys
import os.path
from os import access, R_OK
import argparse
import yaml
from collections import defaultdict
import psycopg2
import smtplib
from email.mime.text import MIMEText
VERSION = 0.1
PROGRAM_NAME = 'unused_history_cleanup.py'
DESCRIPTION = 'Looks for old histories, warns users of their upcoming deletion and marks previously warned histories as deleted.'
def transform_hists(hists):
"""
transform_hists
Takes a list of dictionaries returned from `get_old_hists` and returns a dictionary of unique users
which contains their username and email by id and a dictionary of lists of histories for each user id.
"""
users = defaultdict()
user_hists = defaultdict(list)
for h in hists:
user_hists[h['uid']].append((h['id'],h['name']))
users[h['uid']] = {'email': h['email'], 'uname': h['uname']}
return users, user_hists
def get_old_hists(conn, age, verbose):
"""
get_old_hists
Takes a psql connection, an age in weeks (int) and the verbosity level, runs a query on the connection
and returns a list of dictionaries of histories and their user details which haven't been updated for at least
that many weeks.
"""
SELECT_Q = """
SELECT
h.id, h.name, h.user_id, u.username, u.email
FROM
history h, galaxy_user u
WHERE
h.user_id = u.id AND
h.deleted = FALSE AND
h.published = FALSE AND
h.update_time < (now() - '%s weeks'::interval);
"""
if verbose:
print('Retrieving histories. Running following SQL to get old histories', file=sys.stderr)
print(SELECT_Q % age, file=sys.stderr)
curs = conn.cursor()
curs.execute(SELECT_Q, [age])
temp = curs.fetchall()
hists = []
for t in temp:
hists.append({'id': t[0], 'name': t[1], 'uid': t[2], 'uname': t[3], 'email': t[4]})
curs.close()
return hists
def subtract_lists(hists1, hists2):
"""
substracts the records in the second list from the records in the first list
"""
for h in hists2:
if h in hists1:
hists1.remove(h)
return hists1
def send_email_to_user(user, hists, warn, delete, server, smtp_server, from_addr, response_addr, bcc_addr, info_url, VERBOSE):
"""
Sends warning emails to users stating their histories are about to be deleted.
"""
MSG_TEXT = """Dear %s,
You are receiving this email as one or more of your histories on the %s server
have not been updated for %i weeks. They will be beyond the User Data Storage time limits soon.
If you do not use or update them within the next %i weeks, they will automatically be deleted
and purged from disk.
You should download any files you wish to keep from this history within the next
%i weeks. Instructions for doing so can be found at:
%s
The history(ies) in question are as follows:
""" % (user['uname'], server, warn, delete - warn, delete - warn, info_url)
MSG_TEXT += '\tHistory ID\tName\n'
for h in hists:
MSG_TEXT += '\t%s\t\t%s\n' % (h[0],h[1])
MSG_TEXT += """
You can contact %s if you have any queries.
Yours,
%s Admins.
""" % (response_addr, server)
email = user['email']
subject = "%s History Deletion Warning" % server
if VERBOSE:
print('Email sent:')
print("To: %s" % email)
print("From: %s" % from_addr)
print("Subject: %s" % subject)
print("----------------------")
print(MSG_TEXT)
mail_server = smtplib.SMTP('localhost')
msg = MIMEText(MSG_TEXT)
msg['To'] = email
msg['From'] = from_addr
msg['Subject'] = subject
msg['BCC'] = '.'.join(bcc_addr)
mail_server.sendmail(from_addr, [email] + bcc_addr, msg.as_string())
mail_server.quit()
return
def main():
VERBOSE = False
parser = argparse.ArgumentParser(description=DESCRIPTION)
parser.add_argument("-c", "--config", help="YAML Config file to use. Required parameter.", required=('--version' not in sys.argv))
parser.add_argument("-i", "--info_only", help="Just run the queries and report statistics without emailing users nor altering database.", action='store_true')
parser.add_argument("--actually_delete_things", help="DANGER: Will run the email query, email the users and update the database marking histories as deleted.", action='store_true')
parser.add_argument("-s", "--show_config", help="Display the config used and exit. Do not run any queries.", action='store_true')
parser.add_argument("--version", action='store_true', help='Show the version and exit.')
parser.add_argument("--verbose", action='store_true', help='Show extra output on STDERR.')
args = parser.parse_args()
if args.version:
print("%s, Version: %.1f" % (PROGRAM_NAME, VERSION))
return
if args.verbose:
VERBOSE = True
#Check the config file
if VERBOSE:
print('Reading config file: %s' % args.config, file=sys.stderr)
if not os.path.isfile(args.config) or not access(args.config, R_OK):
print('ERROR: Config file "%s" not found or not readable!' % args.config, file=sys.stderr)
parser.print_help()
exit(1)
conf = yaml.load(open(args.config,'r'))
#Connection stuff
pg_host = conf['pg_host']
pg_port = conf['pg_port']
pg_dbname = conf['pg_dbname']
pg_user = conf['pg_user']
pg_pass = conf['pg_password']
#Miscellaneous stuff
server_name = conf['server_name']
smtp_server = conf['smtp_server']
from_addr = conf['from_addr']
response_addr = conf['response_addr']
bcc_addr = conf['bcc_addr']
info_url = conf['info_url']
#Threshold stuff
warn_threshold = conf['warn_weeks']
delete_threshold = conf['delete_weeks']
#Show config only then exit
if args.show_config:
print('Database host name: %s' % pg_host)
print('Database port: %s' % pg_port)
print('Database name: %s' % pg_dbname)
print('Datbase user: %s' % pg_user)
print('Warning threshold: %s weeks' % warn_threshold)
print('Delete threshold: %s weeks' % delete_threshold)
return
#Ok, everything past here will require a connection...
#Create a connection object
conn = psycopg2.connect(host=pg_host, port=pg_port, user=pg_user, dbname=pg_dbname, password=pg_pass)
if VERBOSE:
print("Connection: %s" % conn, file=sys.stderr)
"""
Info Only: Depending on the verbosity, prints out numbers of or lists of
users and their histories to be:
* Warned
* Deleted
"""
if args.info_only:
warn_hists = get_old_hists(conn, warn_threshold, VERBOSE)
dont_warn_again = get_old_hists(conn, warn_threshold + 1, VERBOSE)
actually_warn_hists = subtract_lists(warn_hists, dont_warn_again)
delete_hists = get_old_hists(conn, delete_threshold, VERBOSE)
print('*******************************')
if VERBOSE:
print('The following users will get warnings (Histories are %i weeks old):' % warn_threshold)
else:
print('Number of users with %i week old histories to be warned and number histories to be warned about:' % warn_threshold)
user_warns, user_warn_hists = transform_hists(actually_warn_hists)
hist_count = 0
user_count = 0
for u in user_warns.keys():
if VERBOSE:
print('User: %s, %s' % (user_warns[u]['uname'], user_warns[u]['email']))
for h in user_warn_hists[u]:
if VERBOSE:
print('\tHistory: %s\t%s' % (h[0], h[1]))
hist_count += 1
user_count += 1
print('Users to be warned: %i' % user_count)
print('Histories to be warned about: %i' % hist_count)
print('*******************************')
if VERBOSE:
print('The following %i weeks old histories would be marked as deleted:' % delete_threshold)
for h in delete_hists:
print('History: %s\t%s' % (h['id'],h['name']))
else:
print('The number of %i weeks old histories to be marked as deleted: %i' % (delete_threshold, len(delete_hists)))
return
if args.actually_delete_things:
"""
This will actually alter the database and actually delete things! Be very careful!
"""
# First we need to get the deleteable histories.
delete_hists = get_old_hists(conn, delete_threshold, VERBOSE)
#Send this dictionary to the delete histories sub.
if len(delete_hists) > 0:
hist_ids = []
for h in delete_hists:
hist_ids.append(h['id'])
if VERBOSE:
print('These histories are to be marked as deleted.')
for h in hist_ids:
print("History: %s" % h)
UPDATE_Q = """
UPDATE
history
SET
deleted = TRUE
WHERE
id = ANY( %s );
"""
curr = conn.cursor()
if VERBOSE:
print('Running the following query to update the database')
print(curr.mogrify(UPDATE_Q, (hist_ids,)))
try:
curr.execute(UPDATE_Q, (hist_ids,))
conn.commit()
except psycopg2.Error as e:
print('Something went wrong with the commit. Rolling back')
print(e)
conn.rollback()
pass
# Now we need to email users who are getting warnings. :)
warn_hists = get_old_hists(conn, warn_threshold, VERBOSE)
dont_warn_again = get_old_hists(conn, warn_threshold + 1, VERBOSE)
actually_warn_hists = subtract_lists(warn_hists, dont_warn_again)
user_warns, user_warn_hists = transform_hists(actually_warn_hists)
conn.close()
for u in user_warns.keys():
send_email_to_user(user_warns[u], user_warn_hists[u], warn_threshold, delete_threshold, server_name, smtp_server, from_addr, response_addr, info_url, VERBOSE)
return
if __name__ == "__main__": main()