forked from CITS3200-Professioanl-Computing-Team-21/WA-Crime
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter.py
489 lines (435 loc) · 18.9 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
import csv, sqlite3, os, pandas
from operator import sub
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy import stats
def filter(name, zone, year, mq, offence):
# Make sure all inputs are lower case
year = year.lower()
if mq == 'All':
mq = mq.lower()
elif mq[0] == "-":
mq = mq[1:]
offence = offence.lower()
name = name.lower()
database = "data.db"
crime_file = "crime.csv"
localities_file = "localities.csv"
# Year_fn is financial year starting year. Do not need to store the
# following year - it is implied. Use of integer makes for easier
# comparison.
sql_create_projects_table = """ CREATE TABLE IF NOT EXISTS Crime (
suburb text NOT NULL,
crime text NOT NULL,
year_fn int NOT NULL,
jul int,
aug int,
sep int,
oct int,
nov int,
dec int,
jan int,
feb int,
mar int,
apr int,
may int,
jun int,
annual int
); """
sql_create_tasks_table = """CREATE TABLE IF NOT EXISTS Localities (
suburb text PRIMARY KEY NOT NULL,
station text NOT NULL,
district text NOT NULL,
region text NOT NULL
);"""
# Create a database connection
conn = create_connection(database)
# Create tables if they don't exist, create cursor
if conn is not None:
c = conn.cursor()
# Create data table
filled = create_table(c, sql_create_projects_table, "Crime")
# Only fill tables if no data in them - avoids redundantly doing
# it each time.
if not filled:
fill_table(c, crime_file, "Crime")
conn.commit()
# Create localities table
filled = create_table(c, sql_create_tasks_table, "Localities")
if not filled:
fill_table(c, localities_file, "Localities")
conn.commit()
else:
print("Error! cannot create the database connection.")
return
# Build query in stages
query0 = ""
query1 = ""
mrange = ""
y1, y2 = 0, 0
# Construct month range for query. If 'all', defaults to jul-jun
if str(mq).find('all') == -1:
mrange = str(distribution(mq))
else:
# We assume no more updates to the calendar system
mq = "jul-jun"
mrange = str(distribution(mq))
# Construct year range for query. If 'all', defaults to max year range
if str(year).find('all') == -1:
y1, y2 = yrange(year)
else:
# In-built Y3K bug
y1, y2 = "0", "3000"
# If year order came in the wrong way
if y1 > y2:
y2, y1 = y1, y2
# Determines subordinate zone type for selection and grouping
zones = ['suburb', 'station', 'district', 'region']
sub_zone = ""
# When name is 'all', we return all of that zone type, therefore no subzones. When name is not all, we return only the named zone and it's subzones.
# If zone is suburb and named, we return the station that named suburb belongs to.
if name != 'all':
if zone == 'suburb':
sub_zone = 'suburb'
zone = 'station'
get_station = "SELECT station FROM Localities WHERE suburb = '" + name + "'"
c.execute(get_station)
name = c.fetchone()[0]
else:
sub_zone = zones[zones.index(zone) - 1]
else:
sub_zone = zone
# Sum across every month for 'total' column, used for unfiltered data
allmonth = "SUM(jul) + SUM(aug) + SUM(sep) + SUM(oct) + SUM(nov) + SUM(dec) + SUM(jan) + SUM(feb) + SUM(mar) + SUM(apr) + SUM(may) + SUM(jun)"
# Default unfiltered data query
query0 += "CREATE TEMPORARY TABLE unfiltered AS SELECT Localities." + sub_zone + ", year_fn, SUM(jul) as jul, SUM(aug) as aug, SUM(sep) as sep, SUM(oct) as oct, SUM(nov) as nov, SUM(dec) as dec, SUM(jan) as jan, SUM(feb) as feb, SUM(mar) as mar, SUM(apr) as apr, SUM(may) as may, SUM(jun) as jun, " + allmonth + " as total FROM (Crime LEFT OUTER JOIN Localities ON Crime.suburb = Localities.suburb)"
# Construction of unfiltered data query
if name != 'all' or offence != 'all' or year != 'all':
query0 += " WHERE"
# includeand to check if query has multiple conditions, so that "AND" can be appended to include them
includeand = False
if name != 'all':
query0 += " Localities." + zone + " = '" + name.lower() + "'"
includeand = True
if offence != 'all':
if includeand:
query0 += " AND"
query0 += " Crime = '" + offence.lower() + "'"
includeand = True
if year != 'all':
if includeand:
query0 += " AND"
query0 += " year_fn >= " + y1 + " AND year_fn < " + y2
query0 += " GROUP BY Localities." + sub_zone + ", year_fn"
# Run query to make temp unfiltered table, then get unfiltered data.
c.execute(query0)
c.execute("SELECT * FROM unfiltered")
j = 0
unfiltered = []
for i in c.fetchall():
unfiltered.append(list(i))
j += 1
# Convert to pandas frame
unfiltered = convert(unfiltered, ["name", "year_fn", "jul", "aug", "sep", "oct", "nov", "dec", "jan", "feb", "mar", "apr", "may", "jun", "total"])
# Following is to apply final filters to data
if mrange != allmonth:
query1 += "SELECT " + sub_zone + ", SUM(" + mrange + ") AS total FROM unfiltered"
else:
query1 += "SELECT " + sub_zone + ", total FROM unfiltered"
query1 += " GROUP BY " + sub_zone + " ORDER BY " + sub_zone
c.execute(query1)
j = 0
filtered = []
for i in c.fetchall():
filtered.append(list(i))
j += 1
filtered = convert(filtered, ["name", "sum"])
# Calculate data anomalies
anomalies = statistics(filtered, unfiltered, mrange)
# Build data object that holds textual anomaly information, and graph information
data = statobject(anomalies, offence, unfiltered, mrange)
# Convert to pandas frame to be used in UI
if anomalies != []:
anomalies = convert(anomalies, ["name", "year", "mean", "std_e", "observation", "p-value"])
# Returns the filtered data to make heatmap, anomaly information, and data object holding textual anomaly information and graph
return filtered, anomalies, data
# Function for calculating anomalies
def statistics(filtered, unfiltered, mrange):
name_list = unfiltered['name'].drop_duplicates().sort_values().tolist()
year_list = unfiltered['year_fn'].drop_duplicates().sort_values().tolist()
temp1 = unfiltered[unfiltered.columns[[0,1]]]
if '+' not in mrange:
temp2 = unfiltered[mrange]
elif '+' in mrange:
temp2 = unfiltered[mrange[1:-1].split('+')]
dataframe = pandas.concat([temp1, temp2], axis=1)
temp = []
anomalies = []
# Anomalies are calculated across years when multiple year range is given
if len(year_list) > 1:
for a in range(len(name_list)):
values = dataframe.loc[dataframe['name'] == name_list[a]]
extracted_values = values[values.columns[2:]].values.tolist()
plot_values = []
for i in extracted_values:
plot_values += i
months = []
for i in range(len(year_list)):
months += values[values.columns[2:]].columns.tolist()
count = 0
for i in year_list:
for j in range(len(values[values.columns[2:]].columns.tolist())):
months[count] += str(i)[2:]
count += 1
total = []
for i in range(len(extracted_values)):
extracted_values[i] = [k for k in extracted_values[i] if k != '']
for k in extracted_values:
total.append(sum(k))
mean = np.mean(total)
# If total less than or equal to 1, then no std_d can be calculated, so skip
if len(total) <= 1:
continue
std_d = stats.tstd(total)
# If no std_d, can't determine p value, so skip
if std_d == 0.0:
continue
# Where k is an array that represents the sample
count = 0
for k in extracted_values:
z_value = (sum(k)-mean)/(std_d)
p_value = stats.norm.sf(abs(z_value))
temp.append([name_list[a], year_list[count], mean, std_d, sum(k), p_value])
count += 1
for i in temp:
if i[-1] < 0.05:
anomalies.append(i)
# Singular years calculate anomalies based on month range
elif len(year_list) <= 1:
for a in range(len(name_list)):
z_value = []
p_value = []
values = dataframe.loc[dataframe['name'] == name_list[a]]
extracted_values = values[values.columns[2:]].values.tolist()
month_list = values[values.columns[2:]].columns.tolist()
plot_values = []
for i in extracted_values:
plot_values += i
num = len(plot_values)
if num <= 1:
continue
mean = np.mean(plot_values)
std_d = stats.tstd(plot_values)
if std_d == 0.0:
continue
for i in plot_values:
z_value.append((i-mean)/(std_d))
for i in z_value:
p_value.append(stats.norm.sf(abs(i)))
count = 0
for i, k in enumerate(p_value):
if k < 0.05:
anomalies.append([name_list[a], month_list[count], mean, std_d, extracted_values[0][i], k])
count += 1
return anomalies
#Collects the months whose data are to be summed, scans from m0 to m1 collecting months in between.
def distribution(mq):
months = ["jul", "aug", "sep", "oct", "nov", "dec", "jan", "feb", "mar", "apr", "may", "jun"]
query = ""
if mq != 'all':
mrange = "("
mq = str(mq).split("-")
# Check if it is a range, meaning split by - was successful
if len(mq) > 1:
if mq[0][0] == 'q':
# Calculate starting month
mq[0] = months[(int(mq[0][1])-1)*3]
mq[1] = months[(int(mq[1][1]))*3-1]
m0 = mq[0]
m1 = mq[1]
# If input query had months out of order, fix
if months.index(m0) > months.index(m1):
temp = m1
m1 = m0
m0 = temp
mi = months.index(m0)
curr = m0
while curr != m1:
mrange += curr + "+"
mi = (mi + 1) % 12
curr = months[mi]
# Closes the range with bracket
mrange += curr + ')'
return mrange
else:
# Assuming no months will ever start with Q
if mq[0][0] == 'q':
# Calculate starting month
start = months[(int(mq[0][1])-1)*3]
mi = months.index(start)
end = months[int(mq[0][1])*3-1]
while start != end:
mrange += start + "+"
mi = (mi + 1) % 12
start = months[mi]
mrange += start + ')'
return mrange
else:
return mq[0]
def yrange(year):
y = year.split('-')
# Formatting required to determine the ending year
return y[0], str(int(int(y[-2])/100)) + y[-1]
# Create database connection
def create_connection(db_file):
conn = None
try:
conn = sqlite3.connect(db_file)
return conn
except:
print("Connect to database error")
return conn
def create_table(c, create_table_sql, table):
try:
# Only create table if it doesn't exist (data can't be accessed)
try:
# LIMIT 1 just to make the selection quick if table does exist.
query = "SELECT * FROM " + table + " LIMIT 1"
c.execute(query)
if len(c.fetchall()) > 0:
return True
return False
except:
c.execute(create_table_sql)
return False
except:
print("Create table error")
return None
def fill_table(c, file_name, table):
try:
with open(file_name, 'r') as f:
reader = csv.reader(f)
columns = next(reader)
query = "INSERT INTO " + table + " VALUES({})".format(','.join('?' * len(columns)))
for data in reader:
# Also converts everything to lower case
data = [e.lower() for e in data]
# Turn year_fn into an int that is just the beginning year.
if table == "Crime":
data[2] = int(data[2].split("-")[0])
c.execute(query, data)
c.commit()
except:
print("Error filling table data")
# Converts list to pandas data frame
def convert(list, cols):
df = pandas.DataFrame(list, columns = cols)
return df
# Data object to hold information returned to application
class statobject:
def __init__(self, anomalydata, offence, unfiltered, mrange):
# Text to be displayed in panel 1 summarised anomalies.
self.anomalies_text = ""
self.unfiltered = unfiltered
self.mrange = mrange
if anomalydata != []:
self.anomalies_text = text(anomalydata, offence)
def getGraph(self):
graph(self.unfiltered, self.mrange)
def getText(self):
return self.anomalies_text
# Function to draw time series or bar charts for query
def graph(unfiltered, mrange):
year_list = unfiltered['year_fn'].drop_duplicates().sort_values().tolist()
temp1 = unfiltered[unfiltered.columns[[0,1]]]
if '+' not in mrange:
temp2 = unfiltered[mrange]
elif '+' in mrange:
temp2 = unfiltered[mrange[1:-1].split('+')]
dataframe = pandas.concat([temp1, temp2], axis=1)
sorted = unfiltered.sort_values(by=['total'], ascending=False)
name_list = sorted['name'].drop_duplicates().tolist()
bar_names = unfiltered['name'].drop_duplicates().tolist()
# If more than 5 years, we plot years along time series
if len(year_list) > 5:
limit = 5
if len(name_list) < limit:
limit = len(name_list)
for i in range(limit):
values = unfiltered.loc[unfiltered['name'] == name_list[i]]
data_years = values[values.columns[1]].values.tolist()
if len(data_years) != len(year_list):
# i -= 1
continue
extracted_values = values[values.columns[-1:]].values.tolist()
plot_values = []
for i in extracted_values:
plot_values += i
plt.plot(data_years, plot_values, marker='o')
plt.legend(name_list)
plt.xticks(rotation=45)
plt.xlabel('Years/Months Distribution')
plt.ylabel('Crime Counts')
# Less than or equal to 5 years we plot months over those years
elif len(year_list) <= 5:
# If a singular month and only one year featured, plot bar graph instead
if '+' not in mrange and len(year_list) <= 1:
temp = dataframe.groupby('name')[mrange].sum()
plt.bar(bar_names, list(temp))
plt.xticks(rotation=45)
plt.xlabel('Locations')
plt.ylabel('Crime Counts')
plt.title(mrange + str(year_list)[3:-1])
else:
limit = 5
if len(name_list) < limit:
limit = len(name_list)
for i in range(limit):
values = dataframe.loc[dataframe['name'] == name_list[i]]
extracted_values = values[values.columns[2:]].values.tolist()
data_years = values[values.columns[1]].values.tolist()
if len(data_years) != len(year_list):
continue
plot_values = []
for i in extracted_values:
plot_values += i
months = []
for i in range(len(data_years)):
months += values[values.columns[2:]].columns.tolist()
count = 0
for i in data_years:
for j in range(len(values[values.columns[2:]].columns.tolist())):
months[count] += str(i)[2:]
count += 1
plt.plot(months, plot_values, marker='o')
plt.legend(name_list)
plt.xticks(rotation=45)
plt.xlabel('Years/Months Distribution')
plt.ylabel('Crime Counts')
plt.show()
# Generates textual information for anomalies
def text(anomalydata, offence):
if offence == "all":
offence = "crime"
text = ""
text += "ORDERED BY LOCATION:\n\n"
for i in range(len(anomalydata)):
period, name, change = str(anomalydata[i][1]), anomalydata[i][0], "higher" if anomalydata[i][4] > anomalydata[i][2] else "lower"
percent = abs(anomalydata[i][4]-anomalydata[i][2])/anomalydata[i][2]
text += name + " (" + period + ")" " had " + str(round(percent*100, 2)) + "% " + change + " " + offence + " than average\n\n"
anomalydata.sort(key=year, reverse=True)
text += "\n\nORDERED BY DATE:\n\n"
for i in range(len(anomalydata)):
period, name, change = str(anomalydata[i][1]), anomalydata[i][0], "higher" if anomalydata[i][4] > anomalydata[i][2] else "lower"
percent = abs(anomalydata[i][4]-anomalydata[i][2])/anomalydata[i][2]
text += name + " (" + period + ")" " had " + str(round(percent*100, 2)) + "% " + change + " " + offence + " than average\n\n"
return text
# Sorting method for dates
def year(anom):
if type(anom[1]) is int:
return anom[1]
else:
months = ["jul", "aug", "sep", "oct", "nov", "dec", "jan", "feb", "mar", "apr", "may", "jun"]
return months.index(anom[1])
filter("regional wa region", "region", "all", "all", "all")