-
Notifications
You must be signed in to change notification settings - Fork 1
/
function_en.py
343 lines (263 loc) · 14.6 KB
/
function_en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# -*- coding: utf-8 -*-
#@START PROCESS
#### Pvh Developer
#### Created 8-8-2016; works well with .doc files
############################################################################################################################
import textract
import re
import prettytable
# Step 1: locate every file to extract from in the English CV folder.
print('=========== Buoc 1 : Tim kiem toan bo tep tin can trich xuat trong floder =======')
import glob, os
path = os.getcwd()
# Swap the "/vi" part of the working directory for "/en/".  os.path.join(..., "")
# guarantees exactly one trailing separator, because later code builds file
# paths by plain concatenation (_dir + filename); without it a working
# directory that does not end in "/vi" produced paths like ".../enfile.doc".
_dir = os.path.join(path.replace("/vi", "/en/"), "")
#_dir = ""
print(_dir)
os.chdir(_dir)
# lists[1] maps index -> .txt file name, lists[2] maps index -> .doc file name,
# lists[3] is reserved for .pdf file names (currently disabled).
lists = [{},{},{},{},{}]
i,j,k,t=0,0,0,0
for txt_name in glob.glob("*.txt"):
    lists[1][i] = str(txt_name)
    i += 1
#print "txt: %s" %str((lists[1]))
# i now equals len(lists[1])
for doc_name in glob.glob("*.doc"):
    lists[2][j] = str(doc_name)
    j += 1
#print "doc : %s " %(str(lists[2]))
# j now equals len(lists[2])
print(' *** pdf is disabled to test')
# Disabled PDF scan, kept for reference:
# for file in glob.glob("*.pdf"):
#     lists[3][k]=str(file)
#     k +=1
#print "pdf : %s" %(str(lists[3]))
# k would equal len(lists[3])
############################################################################################################################
# Step 2: turn every .doc file found in step 1 into text.
print('================================ Buoc 2 : format du lieu =======')
############################################################################################################################
_i=0
import textract


def convert_to_text_to_process():
    """Replace each .doc file name in lists[2] with that file's extracted text.

    Every entry of lists[2] is read through textract.process() using the
    path _dir + <file name>, decoded as UTF-8 and stored back under the
    same index.
    """
    doc_entries = lists[2]
    for slot in range(len(doc_entries)):  # .doc
        raw = textract.process(_dir + doc_entries[slot])
        doc_entries[slot] = raw.decode('utf-8')  #.lower()
    # Disabled PDF pass, kept for reference:
    # _j=i
    # for _j in range(_i,len(lists[3])+_i,1) : # .pdf
    #     lists[2][_j] = (textract.process(_dir+lists[3][_j-_i])).decode('utf-8').lower()
    #     _j+=1
    #     print _j


convert_to_text_to_process()
# Aliases used by the rest of the script: converted .doc texts and .txt names.
_text_lists_convert2 = lists[2]
_text_lists_convert1 = lists[1]
############################################################################################################################
# Key Tag
#
# Each TAG should have its own extraction method:
# Profile: find the keyword, then take that line
# Skill: take the whole paragraph
# Education: take each sentence that belongs to that paragraph
# Experience: take each sentence, then split the sentences by looking for time keywords
#
############################################################################################################################
# print '=============Buoc 3 : Nhap tu khoa va Tag ======='
# Short (one-line) keywords looked for inside each section of a CV.
# NOTE(review): "adress" and "coleage" look like misspellings of "address" and
# "college"; they are matched against the CV text and written as CSV headers,
# so they are left unchanged here -- confirm before correcting.
dict_keys_en_Tag_short_profile = [
    "full name", "date of birth", "status", "gender", "nationality",
    "street", "adress", "country", "email", "e-mail", "phone", "mobile",
    "fax", "relocation",
]
# Objective (what the candidate wants); if not needed, the whole paragraph can be taken.
dict_keys_en_Tag_short_Objective = ["Objective"]
dict_keys_en_Tag_short_Education = ["university", "street", "school", "coleage"]
dict_keys_en_Tag_short_Experience = ["Experience"]
dict_keys_en_Tag_short_Skill = ["Skills"]
dict_keys_en_Tag_short_Interest = ["Interest"]
# Later these will go into List_tag[{},{}...]
# Section headings ("long tags"); "Total" is the slot for the total line count.
dict_keys_en_Tag_long = [
    "Profile", "Objective", "Education", "Experience", "Skills", "Interest",
    "Total",
]
# Part :[ 1[Profile]*m, 2[Objective], 3[Education], 4[Experience],5[Skill],6[Interest],7[...vv]]
# TAG[n] is the short-tag list of section n; the seventh entry has no short
# tags and stays an empty list.
TAG = [
    dict_keys_en_Tag_short_profile,    # TAG[0] = profile = array of short tags
    dict_keys_en_Tag_short_Objective,
    dict_keys_en_Tag_short_Education,
    dict_keys_en_Tag_short_Experience,
    dict_keys_en_Tag_short_Skill,
    dict_keys_en_Tag_short_Interest,
    [],
]
#dict_keys_vi = ["Ho Ten":0,"":0]
############################################################################################################################
# First find the main keywords and partition the text into small sections arranged as a skeleton; then, within each small section, look for the one-line tags (short tags) and store them JSON-style as key-value pairs
############################################################################################################################
# One row per converted CV, one column per long tag (section heading).
dict_keys_en_long_tag = [
    [None] * len(dict_keys_en_Tag_long)
    for _row in range(len(_text_lists_convert2))
]


def find_partion():
    """Record where each section heading occurs in every converted CV.

    For each CV, dict_keys_en_long_tag[cv][c] receives the 1-based number
    of the last line containing dict_keys_en_Tag_long[c]; headings that
    never occur keep None.  Column 6 ("Total") is then overwritten with the
    CV's total line count, which is also printed.
    """
    for doc_index in range(len(_text_lists_convert2)):
        # Walk through each file converted from .doc in turn.
        line_no = 0
        numbered = enumerate(_text_lists_convert2[doc_index].splitlines(), 1)
        for line_no, line in numbered:
            for column, heading in enumerate(dict_keys_en_Tag_long):
                if heading in line:
                    dict_keys_en_long_tag[doc_index][column] = line_no
        print(" Total line CV : %s" % str(line_no))
        dict_keys_en_long_tag[doc_index][6] = line_no
    #print ' ==> \n'
    #print " Partion of CV "


find_partion()
#print 'Test Pdf -----------------------------'
def test_pdf(texts=None):
    """Locate the section headings in texts converted from PDF files.

    The original body read the module-level name _text_lists_convert3,
    which this file never defines, so any call raised NameError.  The texts
    can now be passed in; when omitted, _text_lists_convert3 is used if it
    exists and nothing is scanned otherwise.

    Args:
        texts: indexable collection (keys/indices 0..n-1) of text strings,
            or None to fall back to the module-level _text_lists_convert3.

    For each text, dict_keys_en_long_tag[i][c] receives the 1-based number
    of the last line containing dict_keys_en_Tag_long[c], and the total
    line count is printed.
    NOTE(review): dict_keys_en_long_tag is sized from the .doc texts, so
    passing more PDF texts than .doc texts would index past its end.
    """
    if texts is None:
        texts = globals().get("_text_lists_convert3", [])
    for doc_index in range(len(texts)):
        # Walk through each converted file in turn.
        line_no = 0
        for line_no, line in enumerate(texts[doc_index].splitlines(), 1):
            for column, heading in enumerate(dict_keys_en_Tag_long):
                if heading in line:
                    dict_keys_en_long_tag[doc_index][column] = line_no
        print(" Total line CV : %s" % str(line_no))
        print(' ==> \n ')
#test_pdf()
#dict_keys_en_long_tag.sort() # Phan chia thanh tung doan nho , sap xep theo thu tu
from prettytable import PrettyTable
# Show, for every converted CV, the line number at which each section was found.
t = PrettyTable(dict_keys_en_Tag_long)
for partition_row in dict_keys_en_long_tag[:len(_text_lists_convert2)]:
    t.add_row(partition_row)
print(t)
############################################################################################################################
# Step 4-*
# Find the smaller keywords that belong to each part, scanning each small region:
# ***
############################################################################################################################
# print '\n \t \t Quet lan luot cac mini tag trong long tag va luu vao result 1'
# result_N holds the extracted values of section N: one row per converted CV,
# one None-initialised cell per short tag in TAG[N-1].
(result_1, result_2, result_3,
 result_4, result_5, result_6) = [
    [[None] * len(TAG[section]) for _row in range(len(_text_lists_convert2))]
    for section in range(6)
]
# Create 7 separate result tables, or one table holding all 7?
# To scan all the regions in turn, add an outermost for loop and a variable k, dic..[i][k]? NO! =(
def find_date(result_x, x):
    """Fill result_x with the lines of section x that contain a "/".

    For every converted CV, the lower-cased lines between the positions
    recorded for section x-1 and section x in dict_keys_en_long_tag are
    scanned.  Each line containing "/" is stripped, has every short tag of
    TAG[x-1] removed in turn, is UTF-8 encoded and stored in the matching
    column of result_x (later lines overwrite earlier ones).
    """
    keywords = TAG[x - 1]
    for doc_index in range(len(_text_lists_convert2)):  # scan each file; doc_index is the file index
        bounds = dict_keys_en_long_tag[doc_index]
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        for line in all_lines[bounds[x - 1]:bounds[x]]:
            if "/" not in line:
                continue
            stripped = line.strip()
            for column, keyword in enumerate(keywords):
                result_x[doc_index][column] = stripped.replace(keyword, "").encode("utf-8")
    print(" finish find : \n")
#find_date(result_2,2)
def find_tag_education():
    """Store, per education keyword, the matching line of the Education section.

    For every converted CV the lower-cased lines between the recorded
    Education and Experience positions are scanned; a line containing
    TAG[2][c] is stripped, reduced to ASCII and stored in result_3[cv][c].
    NOTE(review): a later matching line replaces an earlier one, so only
    the last match per keyword is kept -- confirm that is intended.
    """
    for doc_index in range(len(_text_lists_convert2)):
        bounds = dict_keys_en_long_tag[doc_index]
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        for line in all_lines[bounds[2]:bounds[3]]:
            for column, keyword in enumerate(TAG[2]):
                if keyword in line:
                    result_3[doc_index][column] = line.strip().encode("ascii", "ignore")
def find_tag_profile():
    """Extract the one-line profile fields of every converted CV.

    The lower-cased lines from the recorded Profile position up to, but not
    including, the Objective heading line are scanned.  When a line
    contains TAG[0][c], the keyword and any ":" are removed and the
    stripped, UTF-8 encoded remainder is stored in result_1[cv][c].

    If the Objective heading was not found its position is None; the
    section then runs to the end of the text instead of raising TypeError
    on ``None - 1`` (the other extractors already slice with None bounds).
    """
    for doc_index in range(len(_text_lists_convert2)):  # scan each file; doc_index is the file index
        bounds = dict_keys_en_long_tag[doc_index]
        start, stop = bounds[0], bounds[1]
        if stop is not None:
            # Positions are 1-based line numbers: stopping one earlier
            # leaves the next section's heading line out of the slice.
            stop -= 1
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        for line in all_lines[start:stop]:
            for column, keyword in enumerate(TAG[0]):
                if keyword in line:
                    value = line.strip().replace(keyword, "").encode("utf-8")
                    result_1[doc_index][column] = value.replace(":", "").strip()
def find_tag_skill():
    """Store the whole Skills section of every converted CV in result_5[cv][0].

    The lower-cased lines from the recorded Skills position up to, but not
    including, the Interest heading line are joined with newlines and
    reduced to ASCII (non-ASCII characters are dropped).

    Fixes over the previous version:
    * ``str(line)`` was applied to unicode text before the lenient encode,
      which raises UnicodeEncodeError on any non-ASCII character under
      Python 2; the line is now encoded directly.
    * A missing Interest heading (position None) no longer raises
      TypeError on ``None - 1``; the section then runs to the end of the text.
    """
    for doc_index in range(len(_text_lists_convert2)):  # scan each file; doc_index is the file index
        bounds = dict_keys_en_long_tag[doc_index]
        start, stop = bounds[4], bounds[5]
        if stop is not None:
            # 1-based heading position: stop one line early to exclude it.
            stop -= 1
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        result_5[doc_index][0] = "".join(
            (line + "\n").encode("ascii", "ignore")
            for line in all_lines[start:stop]
        )
def find_tag_exprience():
    """Collect the dated lines of the Experience section into result_4[cv][0].

    The lower-cased lines from the recorded Experience position up to, but
    not including, the Skills heading line are scanned; every line that
    contains '19' or '20' (a year-like fragment) is stripped, reduced to
    ASCII and appended with a trailing newline.

    Fixes over the previous version:
    * The test ``'19' or '20' in line`` was always true (the non-empty
      string '19' is truthy), so every line was kept; both substrings are
      now actually checked against the line.
    * A missing Skills heading (position None) no longer raises TypeError
      on ``None - 1``; the section then runs to the end of the text.
    """
    for doc_index in range(len(_text_lists_convert2)):  # scan each file; doc_index is the file index
        bounds = dict_keys_en_long_tag[doc_index]
        start, stop = bounds[3], bounds[4]
        if stop is not None:
            # 1-based heading position: stop one line early to exclude it.
            stop -= 1
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        result_4[doc_index][0] = "".join(
            (line.strip() + "\n").encode("ascii", "ignore")
            for line in all_lines[start:stop]
            if '19' in line or '20' in line
        )
def find_tag_interest():
    """Store the Interest section of every converted CV in result_6[cv][0].

    The lower-cased lines between the recorded Interest position and the
    recorded total line count are each stripped, reduced to ASCII and
    concatenated, one per line.
    """
    for doc_index in range(len(_text_lists_convert2)):  # scan each file; doc_index is the file index
        bounds = dict_keys_en_long_tag[doc_index]
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        result_6[doc_index][0] = "".join(
            (line.strip() + "\n").encode("ascii", "ignore")
            for line in all_lines[bounds[5]:bounds[6]]
        )
def find_objective():
    """Store the Objective section of every converted CV in result_2[cv][0].

    The lower-cased lines between the recorded Objective and Education
    positions are each stripped, reduced to ASCII and concatenated, one
    per line.
    """
    for doc_index in range(len(_text_lists_convert2)):
        bounds = dict_keys_en_long_tag[doc_index]
        all_lines = _text_lists_convert2[doc_index].lower().splitlines()
        result_2[doc_index][0] = "".join(
            (line.strip() + "\n").encode("ascii", "ignore")
            for line in all_lines[bounds[1]:bounds[2]]
        )
# Run every section extractor once, in the original order; each one fills
# its own result_N table.
for extract_section in (
    find_tag_interest,
    find_tag_exprience,
    find_tag_education,
    find_tag_profile,
    find_tag_skill,
    find_objective,
):
    extract_section()
#return result_x
# For the profile, searching by tag is reasonable; for education and experience it is better to search by date (/); for skills, split by line or paragraph
# TAG PROFILE
def show_data(result, k):
    """Print and return a table of one section's extracted values.

    Args:
        result: rows to display, indexed 0..n-1 (one per converted CV).
        k: 1-based section number; TAG[k-1] supplies the column headers.

    Returns:
        The PrettyTable that was printed.
    """
    table = PrettyTable(TAG[k - 1])
    for doc_index in range(len(_text_lists_convert2)):
        table.add_row(result[doc_index])
    print(table)
    return table
# Print the table of every section, 1 (profile) through 6 (interest).
# (Original note on the first call: "temporarily commented out because it is
# too messy" -- the call is nevertheless active.)
for section_no, section_rows in enumerate(
        (result_1, result_2, result_3, result_4, result_5, result_6), 1):
    show_data(section_rows, section_no)
def write_file_txt(result_x, x):
    """Append the printed table of section x to text_profile.txt.

    The table is built (and printed) by show_data(result_x, x); its string
    form is appended to the file, which the with-block closes.
    """
    rendered = show_data(result_x, x).get_string()
    with open('text_profile.txt', 'a') as out:
        out.write(rendered)
#write_file_txt(result_1,1)
import csv
def write_excel(result_x, x, out_file):
    """Write one section's values to a CSV file.

    Args:
        result_x: iterable of rows (one per converted CV).
        x: 1-based section number; TAG[x-1] is written as the header row.
        out_file: path of the CSV file; it is created or overwritten.

    The file is opened in a with-block so the handle is closed even when a
    row fails to serialise (the previous version leaked it in that case).
    """
    with open(out_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(TAG[x - 1])
        for values in result_x:
            writer.writerow(values)
# Write each section to its own CSV file; write_excel opens the file with
# mode 'w', so a missing file is created automatically.
# NOTE(review): section 3 is built from the education keywords but goes to
# 'out_employee.csv', while section 4 (experience) goes to
# 'out_education.csv' -- the two names look swapped; confirm before renaming.
for section_no, section_rows, csv_name in (
    (1, result_1, 'out_persional.csv'),
    (2, result_2, 'out_Objective.csv'),
    (3, result_3, 'out_employee.csv'),
    (4, result_4, 'out_education.csv'),
    (5, result_5, 'out_skill.csv'),
    (6, result_6, 'out_interest.csv'),
):
    write_excel(section_rows, section_no, csv_name)
print('\n===Write excels Success ==================\n')
############################################################################################################################
# Buoc 5
# The important point is that the data must be processed: converting that str data into a dictionary variable for processing, and then into JSON, is necessary
############################################################################################################################
#
############################################################################################################################
# return Json type: Buoc 6
############################################################################################################################
# Banner printed before the combined output is written.
print('\t \n ')
print('\t\t\t\t\t\t\t\write ouput \t \n')
# Disabled earlier attempt at a combined table, kept for reference:
# arr = TAG[0]+TAG[1]+TAG[2]+TAG[3]+TAG[4]+TAG[5]
# print arr
# m = PrettyTable(arr)
# for i in range(0,len(_text_lists_convert2),1):
#     m.add_row(result_1[i]+result_2[i]+result_3[i]+result_4[i]+result_5[i] + result_6[i])
# print m
# teext = m.get_string()
# with open ('text_tbl.txt', 'a') as _file: # 'a' appends; use 'w' to overwrite
#     _file.write(teext)
def write_excel_all(out_file):
    """Write every section of every CV into one combined CSV file.

    Args:
        out_file: path of the CSV file; it is created or overwritten.

    The header row is the concatenation of TAG[0]..TAG[5].  Each following
    row is one CV: its rows from result_1..result_6 joined side by side, so
    the values line up with the header.

    The previous version concatenated the six result tables themselves
    (marked "wrong!" by its author), which stacked them vertically as rows
    of differing widths instead of producing one row per CV.  The file is
    also opened in a with-block so it is closed if a write fails.
    """
    header = TAG[0] + TAG[1] + TAG[2] + TAG[3] + TAG[4] + TAG[5]
    per_cv = zip(result_1, result_2, result_3, result_4, result_5, result_6)
    with open(out_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for profile, objective, education, experience, skill, interest in per_cv:
            writer.writerow(
                profile + objective + education + experience + skill + interest
            )


write_excel_all('out_all_table.csv')
print("END OF PROGRAME")
############################################################################################################################
#@END OF PROCESS -- too far-fetched
################ Building Successful
## pvh