-
Notifications
You must be signed in to change notification settings - Fork 0
/
__init__.py
163 lines (153 loc) · 4.66 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from cudatext import *
from .chardet.universaldetector import UniversalDetector
import codecs
import cudatext_cmd
import os
import shutil
import time
import hashlib
# Encodings for which init_encoding_vars() and convert_from_utf8() do nothing.
SKIP_ENCODINGS = ('ASCII', 'UTF-8', 'UTF-16LE', 'UTF-16BE')

# Encoding -> wider encoding that convert_to_utf8() retries with after a
# strict decode of the file fails.
SUPERSETS = {
    'GB2312': 'GBK',
    'GBK': 'GB18030',
    'BIG5': 'CP950',  # CP950 is common in Taiwan
    'CP950': 'BIG5-HKSCS',  # HK official Big5 variant
    'EUC-KR': 'CP949',  # CP949 is a superset of euc-kr!
}

# Detected encoding name -> code-page token that get_menu() places in the
# middle of the `cmd_Encoding_<token>_Reload` command name.
CODE_PAGES = {
    'GB2312': 'cp936',
    'GBK': 'cp936',
    'BIG5': 'cp950',
    'EUC-KR': 'cp949',
}
# Scratch directory, located under the path returned by
# app_path(APP_DIR_DATA), where convert_from_utf8() writes the re-encoded
# file before moving it over the original.
TMP_DIR = os.path.join(app_path(APP_DIR_DATA), 'c2u_tmp')
# makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair: it has no
# window between the check and the creation, and it also creates a missing
# parent directory instead of failing.
os.makedirs(TMP_DIR, exist_ok=True)
def get_temp_name(name):
    """Return the hex MD5 digest of *name* encoded as UTF-8.

    Returns None when *name* is empty or None.
    """
    if name:
        digest = hashlib.md5(name.encode('UTF-8'))
        return digest.hexdigest()
    return None
def detect(view, file_name, cnt):
    """Detect the encoding of *file_name* and hand the result to check_encoding().

    The file is read line by line in binary mode and fed to a
    UniversalDetector until the detector reports it is done or *cnt* lines
    have been consumed.

    Parameters:
        view: editor object, passed through unchanged to check_encoding().
        file_name: path of the file to inspect.
        cnt: maximum number of lines to feed.  The loop stops when the
            counter reaches exactly zero, so a value of zero or less never
            triggers the limit and the whole file may be read.

    Returns None.  Nothing happens when *file_name* is empty, does not exist
    or names an empty file.
    """
    if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        return
    msg_status('Detecting encoding, please wait...')
    detector = UniversalDetector()
    # The context manager closes the file even when feeding the detector
    # raises; the previous explicit close() was skipped in that case.
    with open(file_name, 'rb') as fp:
        for line in fp:
            # cut MS-Windows CR code
            detector.feed(line.replace(b'\r', b''))
            cnt -= 1
            if detector.done or cnt == 0:
                break
    detector.close()
    result = detector.result
    encoding = result['encoding']
    if encoding:
        # Upper-case so the name matches the keys of the module's tables.
        encoding = encoding.upper()
    check_encoding(view, encoding, result['confidence'])
def check_encoding(view, encoding, confidence):
    """Report a detection result and switch the view's encoding if warranted.

    The outcome is shown in the status bar and printed.  init_encoding_vars()
    is called only when the view currently reports 'ANSI' or '?' as its
    encoding and the detection is usable: an encoding was found, its
    confidence is not below 0.95, and it differs from the view's encoding.

    Parameters:
        view: editor object providing get_prop().
        encoding: detected encoding name, or None/empty when detection failed.
        confidence: detector confidence; only read when *encoding* is set.
    """
    current = view.get_prop(PROP_ENC)
    if encoding:
        summary = 'Detected {0} vs {1} with {2:.0%} confidence'.format(encoding, current, confidence)
    else:
        summary = 'Encoding can not be detected'
    msg_status(summary)
    print(summary)

    usable = bool(encoding) and not confidence < 0.95 and encoding != current
    # CudaText can't detect the encoding
    editor_undecided = current in ('ANSI', '?')
    if editor_undecided and usable:
        init_encoding_vars(view, encoding)
def get_menu(encoding):
    """Look up the reload command for *encoding* in cudatext_cmd.

    The attribute name is 'cmd_Encoding_<cp>_Reload', where <cp> is the
    CODE_PAGES entry for *encoding* or, failing that, the lower-cased
    encoding name.  Returns the attribute's value, or None when cudatext_cmd
    has no such attribute.
    """
    lowered = encoding.lower()
    code_page = CODE_PAGES.get(encoding, lowered)
    attr_name = ''.join(('cmd_Encoding_', code_page, '_Reload'))
    return getattr(cudatext_cmd, attr_name, None)
def init_encoding_vars(view, encoding, run_convert=True, stamp=None, detect_on_fail=False):
    """Apply *encoding* to *view*, via a reload command or a manual conversion.

    Parameters:
        view: editor object providing cmd().
        encoding: encoding name to apply; nothing happens when it is empty
            or listed in SKIP_ENCODINGS.
        run_convert: when true and no reload command exists for the
            encoding, fall back to convert_to_utf8().
        stamp: value forwarded to convert_to_utf8(); when None, the current
            time formatted as a string is used.
        detect_on_fail: forwarded to convert_to_utf8().

    Returns None.
    """
    if not encoding or encoding in SKIP_ENCODINGS:
        return
    menu = get_menu(encoding)
    if menu:
        # use reload menu item
        view.cmd(menu)
        return
    if run_convert:
        # Identity test: `== None` would invoke a custom __eq__ on *stamp*.
        if stamp is None:
            stamp = '{0}'.format(time.time())
        convert_to_utf8(view, encoding, stamp, detect_on_fail)
def convert_to_utf8(view, encoding=None, stamp=None, detect_on_fail=False):
    """Decode the view's file from *encoding* and load the text into the view.

    The file is first decoded strictly.  On failure:

    * LookupError (unknown codec): the codec machinery is reloaded and the
      user is asked to reopen the file, or told the codec is unavailable.
    * UnicodeDecodeError: detection is re-run when *detect_on_fail* is set;
      otherwise the SUPERSETS entry for the encoding is tried through
      init_encoding_vars(); otherwise the file is decoded again with
      undecodable bytes dropped.

    On success, line endings are normalised to '\\n', the text replaces the
    view's contents and the view's PROP_ENC is set to *encoding*.

    Parameters:
        view: editor object providing get_filename(), set_text_all() and
            set_prop().
        encoding: source encoding; nothing happens when it is empty.
        stamp: only forwarded to init_encoding_vars() on the superset retry.
        detect_on_fail: re-run detect() instead of retrying when strict
            decoding fails.

    Returns None.
    """
    if not encoding:
        return
    file_name = view.get_filename()
    if not (file_name and os.path.exists(file_name)):
        return
    try:
        # try fast decode
        with codecs.open(file_name, 'rb', encoding, errors='strict') as fp:
            contents = fp.read()
    except LookupError:
        try:
            # reload codecs
            # _multibytecodec is imported only as an availability probe; it
            # raises ImportError when multibyte codec support is missing.
            import _multibytecodec  # noqa: F401
            import encodings
            # importlib.reload replaces imp.reload: the imp module was
            # removed in Python 3.12, which made this recovery path always
            # end in the "not supported" message.
            import importlib
            importlib.reload(encodings)
            importlib.reload(codecs)
            codecs.getencoder(encoding)
            msg_status("Please reopen this file")
        except (ImportError, LookupError):
            msg_box("Codecs for {0} is not supported".format(encoding), MB_OK)
        return
    except UnicodeDecodeError:
        if detect_on_fail:
            detect(view, file_name, 100)
            return
        superset = SUPERSETS.get(encoding)
        if superset:
            print('Try encoding {0} instead of {1}.'.format(superset, encoding))
            init_encoding_vars(view, superset, True, stamp)
            return
        # No wider encoding left to try: decode anyway and drop the bytes
        # that cannot be decoded.
        with codecs.open(file_name, 'rb', encoding, errors='ignore') as fp:
            contents = fp.read()
    contents = contents.replace('\r\n', '\n').replace('\r', '\n')
    view.set_text_all(contents)
    view.set_prop(PROP_ENC, encoding)
    msg_status('{0} -> UTF8'.format(encoding))
def convert_from_utf8(file_name, encoding):
    """Re-encode the UTF-8 file *file_name* on disk into *encoding*.

    The converted bytes are written to a temporary file inside TMP_DIR,
    given the original file's mode bits, and then moved over the original.

    Parameters:
        file_name: path of the file, expected to contain UTF-8 text.
        encoding: target encoding name.

    Returns None.  Nothing is done when *file_name* or *encoding* is empty,
    when *encoding* is in SKIP_ENCODINGS, or when get_menu() finds a reload
    command for it.  If the encoding is unknown or the text cannot be
    represented in it, a message box is shown and the file is left as UTF-8.
    """
    # An empty path or encoding cannot be converted; without this guard the
    # calls below fail with AttributeError/TypeError.
    if not file_name or not encoding:
        return
    if encoding in SKIP_ENCODINGS or get_menu(encoding):
        return
    msg = "Converting {0} back to {1}".format(file_name, encoding)
    msg_status(msg)
    print(msg)
    try:
        # `with` closes the file on every path.  The earlier try/finally
        # referenced `fp` before assignment when open() itself failed,
        # replacing the real error with UnboundLocalError.
        with open(file_name, 'rb') as fp:
            # Reading through EncodedFile decodes the file's bytes as UTF-8
            # and returns them encoded in *encoding*.
            contents = codecs.EncodedFile(fp, encoding, 'UTF-8').read()
    except (LookupError, UnicodeEncodeError) as e:
        msg_box(
            'Can not convert file encoding of {0} to {1}, it was saved as UTF-8 instead:\n\n{2}'.format(
                os.path.basename(file_name), encoding, e),
            MB_OK)
        return
    # write content to temporary file
    tmp_name = os.path.join(TMP_DIR, get_temp_name(file_name))
    with open(tmp_name, 'wb') as fp:
        fp.write(contents)
    # os.rename has "Invalid cross-device link" issue
    os.chmod(tmp_name, os.stat(file_name).st_mode)
    shutil.move(tmp_name, file_name)
    msg_status('UTF8 -> {0}'.format(encoding))
class Command:
    """Handlers for the `on_open` and `on_save` events; each receives the editor object."""

    def on_open(self, ed_self):
        """Run encoding detection on the editor's file, feeding at most 100 lines."""
        opened_path = ed_self.get_filename()
        detect(ed_self, opened_path, 100)

    def on_save(self, ed_self):
        """Convert the editor's file from UTF-8 back to the editor's PROP_ENC encoding."""
        saved_path = ed_self.get_filename()
        target_encoding = ed_self.get_prop(PROP_ENC)
        convert_from_utf8(saved_path, target_encoding)