diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a81fe8ca7e..9a7fecfd76 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -146,7 +146,11 @@ def split_sentences(self, file_name): def process_json_file(self, file_name): input_file_name, output_prefix = file_name print("Opening", input_file_name) - fin = open(input_file_name, 'r', encoding='utf-8') + # support for gzip files + if input_file_name.endswith(".gz"): + fin = gzip.open(input_file_name, 'rt') + else: + fin = open(input_file_name, 'r', encoding='utf-8') startup_start = time.time() encoder = Encoder(self.args)