@@ -195,57 +195,76 @@ def read_jsons_chunks(file_object, chunk_size=10000):
195
195
"""Lazy function to read a json by chunk.
196
196
Default chunk size: 10k"""
197
197
198
+ # Parse the next real chunk_size lines
199
+ chunk = file_object .read (1000000 )
200
+ data = []
201
+ i = 0
202
+ nb_bracket = 0
203
+ nb_quotes = 0
204
+ example = ""
205
+ count_escape_char = 0
198
206
while True :
199
- # Parse the next real chunk_size lines
200
- data = []
201
- for i in range (chunk_size ):
202
- nb_bracket = 0
203
- nb_quotes = 0
204
- example = ""
205
- c_bef = ""
206
- c_2bef = ""
207
- while True :
208
- # Read one character
209
- c = file_object .read (1 )
210
- # If we are at the end of the file
211
- if c in [']' , '' ] and nb_bracket == 0 and nb_quotes % 2 == 0 :
212
- break
213
- # If we are in between 2 json examples or a the end or at the beginning
214
- if c in ['[' , ',' , '\n ' ] and nb_bracket == 0 and nb_quotes % 2 == 0 :
207
+ # Read character by character
208
+ for k , c in enumerate (chunk ):
209
+ # Check quoting
210
+ if c == '"' :
211
+ # Check only when '"' is a delimiter of field or value in json
212
+ if count_escape_char % 2 == 0 :
213
+ nb_quotes += 1
214
+ # Check beginning of brackets
215
+ elif c == '{' and nb_quotes % 2 == 0 :
216
+ # Check only when '{' is a delimiter of field or value in json
217
+ if count_escape_char % 2 == 0 :
218
+ nb_bracket += 1
219
+ # Check ending of brackets
220
+ elif c == '}' and nb_quotes % 2 == 0 :
221
+ # Check only when '}' is a delimiter of field or value in json
222
+ if count_escape_char % 2 == 0 :
223
+ nb_bracket -= 1
224
+ # This means we have finished reading one json object
225
+ if nb_bracket == 0 and nb_quotes % 2 == 0 :
226
+ example += c
227
+ data .append (json .loads (example ))
228
+ i += 1
229
+ # When chunk_size jsons obtained, dump those
230
+ if i % chunk_size == 0 :
231
+ yield (data )
232
+ data = []
233
+
234
+ # Initialize those
235
+ example = ""
236
+ c_bef = ""
237
+ c_2bef = ""
215
238
continue
216
- # Check beginning of brackets
217
- if c == '{' and nb_quotes % 2 == 0 :
218
- # Check only when '{' is a delimiter of field or value in json
219
- if c_bef != '\\ ' or c_bef == '\\ ' and c_2bef == '\\ ' :
220
- nb_bracket += 1
221
- # Check quoting
222
- elif c == '"' :
223
- # Check only when '"' is a delimiter of field or value in json
224
- if c_bef != '\\ ' or c_bef == '\\ ' and c_2bef == '\\ ' :
225
- nb_quotes += 1
226
- # Check ending of brackets
227
- elif c == '}' and nb_quotes % 2 == 0 :
228
- # Check only when '"' is a delimiter of field or value in json
229
- if c_bef != '\\ ' or c_bef == '\\ ' and c_2bef == '\\ ' :
230
- nb_bracket -= 1
231
- # This means we finished to read one json
232
- if nb_bracket == 0 and nb_quotes % 2 == 0 :
233
- example += c
234
- break
235
- # Append character to the json example
236
- example += c
237
- # Set previous characters
238
- c_2bef = c_bef
239
- c_bef = c
240
- # If EOF obtained or end of jsonarray send what's left of the data
241
- if example == "" or example == "]" :
242
- yield (data )
243
- return
239
+ # If we are in between 2 json examples or at the beginning
240
+ elif c in ['[' , ',' , '\n ' ] and nb_bracket == 0 and nb_quotes % 2 == 0 :
241
+ continue
242
+ # If we are at the end of the file
243
+ if c in [']' , '' ] and nb_bracket == 0 and nb_quotes % 2 == 0 :
244
+ # If EOF obtained or end of jsonarray send what's left of the data
245
+ if example == "" or example == "]" :
246
+ yield (data )
247
+ return
248
+ if c == "\\ " :
249
+ count_escape_char += 1
244
250
else :
245
- data .append (json .loads (example ))
246
- if not data :
251
+ count_escape_char = 0
252
+ # Append character to the json example
253
+ example += c
254
+
255
+ # Set previous characters
256
+ c_2bef = c_bef
257
+ c_bef = c
258
+ # If at the end of the chunk, read new chunk
259
+ if k == len (chunk ) - 1 :
260
+ chunk = file_object .read (1000000 )
261
+ # Keep what's left of the chunk
262
+ elif len (chunk ) != 0 :
263
+ chunk = chunk [k :]
264
+ # if k == 0 that means that we read the whole file
265
+ else :
247
266
break
248
- yield data
267
+
249
268
250
269
251
270
def get_columns (list_data_paths , sep , logger , int_to_float , remove_null , is_json ):
0 commit comments