This repository has been archived by the owner on May 11, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
determine.py
124 lines (109 loc) · 4.33 KB
/
determine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from psycopg2.extensions import AsIs
import psycopg2.extras
import numpy as np
from sklearn import svm
# Connect to Postgres
connection = psycopg2.connect(dbname='ocr-classify', user='john', host='localhost', port='5432')
#cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cursor = connection.cursor()
cursor.execute("""
SELECT
a.area_id,
ad.doc_id,
page_no,
labels.name AS label,
has_words::int,
line_intersect::int,
small_text::int,
small_leading::int,
is_line::int,
is_top_or_bottom::int,
mostly_blank::int,
very_separated_words::int,
little_word_coverage::int,
normal_word_separation::int,
normal_word_coverage::int,
best_caption::int,
good_caption::int,
ok_caption::int,
overlap::int,
offset_words::int,
proportion_alpha,
area::float / (select max(area) from areas) as area,
n_gaps::float / (select max(n_gaps) from areas) as n_gaps,
n_lines::float / (select max(n_lines) from areas) as n_lines,
page_no::float / (select max(page_no) from areas z join area_docs x on z.area_id=x.area_id where x.doc_id = ad.doc_id)
FROM areas a
JOIN area_docs ad ON a.area_id = ad.area_id
JOIN area_labels al ON al.area_id = a.area_id
JOIN labels ON labels.label_id = al.label_id
WHERE a.area_id NOT IN (
527,1252,56,153,1454,1072,1154,1514,207,1269,89,1684,102,66,560,972,858,280,1425,1071,759,787,1451,1172,1576,552,831,926,1262,901,1233,1415,1695,464,316,1397,746,1663,1366,492,967,1627,1617,205,1643,876,1630,441,1651,1657
)
""")
data = cursor.fetchall()
# Omit area_id, doc_id, page_no, and label_name
train = [ list(d[4:]) for d in data ]
label = np.array([ d[3] for d in data ])
index = [ d[0:3] for d in data ]
# gamma - influence of a single training example. low = far, high = close
# C - low = less freedom, high = more freedom
clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf')
clf.fit(train, label)
cursor.execute("""
SELECT
a.area_id,
ad.doc_id,
page_no,
labels.name AS label,
has_words::int,
line_intersect::int,
small_text::int,
small_leading::int,
is_line::int,
is_top_or_bottom::int,
mostly_blank::int,
very_separated_words::int,
little_word_coverage::int,
normal_word_separation::int,
normal_word_coverage::int,
best_caption::int,
good_caption::int,
ok_caption::int,
overlap::int,
offset_words::int,
proportion_alpha,
area::float / (select max(area) from areas) as area,
n_gaps::float / (select max(n_gaps) from areas) as n_gaps,
n_lines::float / (select max(n_lines) from areas) as n_lines,
page_no::float / (select max(page_no) from areas z join area_docs x on z.area_id=x.area_id where x.doc_id = ad.doc_id)
FROM areas a
JOIN area_docs ad ON a.area_id = ad.area_id
JOIN area_labels al ON al.area_id = a.area_id
JOIN labels ON labels.label_id = al.label_id
WHERE a.area_id IN (
527,1252,56,153,1454,1072,1154,1514,207,1269,89,1684,102,66,560,972,858,280,1425,1071,759,787,1451,1172,1576,552,831,926,1262,901,1233,1415,1695,464,316,1397,746,1663,1366,492,967,1627,1617,205,1643,876,1630,441,1651,1657
)
""")
data = cursor.fetchall()
groups = {}
for cat in clf.classes_:
groups[cat] = {
'p': [],
'label': []
}
for area in data:
classification_p = clf.predict_proba([list(area[4:])])
classification_label = clf.predict([list(area[4:])])
named_p = zip(clf.classes_, classification_p[0])
groups[area[3]]['label'].append( 1 if classification_label[0] == area[3] else 0 )
for each in named_p:
if each[0] == area[3]:
groups[area[3]]['p'].append(each[1])
for cat in groups:
print(cat)
print(' + p (avg) ', (sum(groups[cat]['p']) / len(groups[cat]['p'])))
print(' + correct ', (sum(groups[cat]['label']) / float(len(groups[cat]['label']))) * 100, '%')
print('TOTAL')
print(' + p (avg) ', sum([ val for val in groups[cat]['p'] for cat in groups ]) / len([ val for val in groups[cat]['p'] for cat in groups ]))
print(' + correct ', sum([ val for val in groups[cat]['label'] for cat in groups ]) / float(60) * 100, '%')