Skip to content

Commit 4ff9ce4

Browse files
authored
Merge pull request #20 from Besedo/manage-all-jsons
Refacto and clean escaped elements
2 parents c7ce302 + 2cbb061 commit 4ff9ce4

File tree

1 file changed

+66
-47
lines changed

1 file changed

+66
-47
lines changed

json_to_csv/json_to_csv.py

Lines changed: 66 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -195,57 +195,76 @@ def read_jsons_chunks(file_object, chunk_size=10000):
195195
"""Lazy function to read a json by chunk.
196196
Default chunk size: 10k"""
197197

198+
# Parse the next real chunk_size lines
199+
chunk = file_object.read(1000000)
200+
data = []
201+
i = 0
202+
nb_bracket = 0
203+
nb_quotes = 0
204+
example = ""
205+
count_escape_char = 0
198206
while True:
199-
# Parse the next real chunk_size lines
200-
data = []
201-
for i in range(chunk_size):
202-
nb_bracket = 0
203-
nb_quotes = 0
204-
example = ""
205-
c_bef = ""
206-
c_2bef = ""
207-
while True:
208-
# Read one character
209-
c = file_object.read(1)
210-
# If we are at the end of the file
211-
if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
212-
break
213-
# If we are in between 2 json examples or at the end or at the beginning
214-
if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
207+
# Read character by character
208+
for k, c in enumerate(chunk):
209+
# Check quoting
210+
if c == '"':
211+
# Check only when '"' is a delimiter of field or value in json
212+
if count_escape_char % 2 == 0:
213+
nb_quotes += 1
214+
# Check beginning of brackets
215+
elif c == '{' and nb_quotes % 2 == 0:
216+
# Check only when '{' is a delimiter of field or value in json
217+
if count_escape_char % 2 == 0:
218+
nb_bracket += 1
219+
# Check ending of brackets
220+
elif c == '}' and nb_quotes % 2 == 0:
221+
# Check only when '}' is a delimiter of field or value in json
222+
if count_escape_char % 2 == 0:
223+
nb_bracket -= 1
224+
# This means we finished reading one json
225+
if nb_bracket == 0 and nb_quotes % 2 == 0:
226+
example += c
227+
data.append(json.loads(example))
228+
i += 1
229+
# When chunk_size jsons obtained, dump those
230+
if i % chunk_size == 0:
231+
yield(data)
232+
data = []
233+
234+
# Initialize those
235+
example = ""
236+
c_bef = ""
237+
c_2bef = ""
215238
continue
216-
# Check beginning of brackets
217-
if c == '{' and nb_quotes % 2 == 0:
218-
# Check only when '{' is a delimiter of field or value in json
219-
if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
220-
nb_bracket += 1
221-
# Check quoting
222-
elif c == '"':
223-
# Check only when '"' is a delimiter of field or value in json
224-
if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
225-
nb_quotes += 1
226-
# Check ending of brackets
227-
elif c == '}' and nb_quotes % 2 == 0:
228-
# Check only when '}' is a delimiter of field or value in json
229-
if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
230-
nb_bracket -= 1
231-
# This means we finished reading one json
232-
if nb_bracket == 0 and nb_quotes % 2 == 0:
233-
example += c
234-
break
235-
# Append character to the json example
236-
example += c
237-
# Set previous characters
238-
c_2bef = c_bef
239-
c_bef = c
240-
# If EOF obtained or end of jsonarray send what's left of the data
241-
if example == "" or example == "]":
242-
yield(data)
243-
return
239+
# If we are in between 2 json examples or at the beginning
240+
elif c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
241+
continue
242+
# If we are at the end of the file
243+
if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
244+
# If EOF obtained or end of jsonarray send what's left of the data
245+
if example == "" or example == "]":
246+
yield(data)
247+
return
248+
if c == "\\":
249+
count_escape_char += 1
244250
else:
245-
data.append(json.loads(example))
246-
if not data:
251+
count_escape_char = 0
252+
# Append character to the json example
253+
example += c
254+
255+
# Set previous characters
256+
c_2bef = c_bef
257+
c_bef = c
258+
# If at the end of the chunk, read new chunk
259+
if k == len(chunk) - 1:
260+
chunk = file_object.read(1000000)
261+
# Keep what's left of the chunk
262+
elif len(chunk) != 0:
263+
chunk = chunk[k:]
264+
# If the chunk is empty, that means that we read the whole file
265+
else:
247266
break
248-
yield data
267+
249268

250269

251270
def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json):

0 commit comments

Comments
 (0)