From 37305ce5e6ef1b91f6058eff5b128064f76fbe45 Mon Sep 17 00:00:00 2001
From: Simon Bray <32272674+simonbray@users.noreply.github.com>
Date: Thu, 23 Nov 2023 20:48:52 +0100
Subject: [PATCH] Fix two bugs with `split_file_to_collection` (#1358)

* fix two bugs

* allow chunksize to be greater than the number of records in the input file without failing

* allow an empty file to be split without an error

* bump version number
---
 .../split_file_to_collection/split_file_to_collection.py  | 3 ++-
 .../split_file_to_collection/split_file_to_collection.xml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/text_processing/split_file_to_collection/split_file_to_collection.py b/tools/text_processing/split_file_to_collection/split_file_to_collection.py
index 675e15451d..6580fecf2b 100755
--- a/tools/text_processing/split_file_to_collection/split_file_to_collection.py
+++ b/tools/text_processing/split_file_to_collection/split_file_to_collection.py
@@ -224,6 +224,7 @@ def split_by_record(args, in_file, out_dir, top, ftype):
         for i in range(top):
             f.readline()
         n_records = 0
+        last_line_matched = False
         for line in f:
             if (num == 0 and re.match(sep, line) is not None) or (
                 num > 0 and n_records % num == 0
@@ -241,7 +242,7 @@
     if chunksize == 0:  # i.e. no chunking
         n_per_file = n_records // numnew
     else:
-        numnew = n_records // chunksize
+        numnew = max(n_records // chunksize, 1)  # should not be less than 1
        n_per_file = chunksize

 # make new files
diff --git a/tools/text_processing/split_file_to_collection/split_file_to_collection.xml b/tools/text_processing/split_file_to_collection/split_file_to_collection.xml
index 56c4248d99..354df3f590 100644
--- a/tools/text_processing/split_file_to_collection/split_file_to_collection.xml
+++ b/tools/text_processing/split_file_to_collection/split_file_to_collection.xml
@@ -1,4 +1,4 @@
-
+
     to dataset collection