-
Notifications
You must be signed in to change notification settings - Fork 0
/
detector.py
43 lines (36 loc) · 1.49 KB
/
detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import glob
import json
import os
# Number of nanoseconds in one second (kept as a float).
NANOSECONDS_PER_SECOND = 1_000_000_000.0
def apply_filter(text: str, filters: list) -> bool:
    """
    Tell whether ``text`` contains at least one of the given filters.

    The comparison is case-insensitive: both ``text`` and every filter are
    case-folded before the substring test, so a filter written with
    uppercase letters matches as well.

    text : string to inspect
    filters : list of strings

    Returns True as soon as one filter is found inside ``text``, and False
    otherwise (including when ``filters`` is empty).
    """
    # Fold the text once instead of once per filter.
    folded_text = text.casefold()
    return any(term.casefold() in folded_text for term in filters)
def filter_json(data_path: str = "input", filters: list = None):
    """
    Filter the word-level entries of every .json file found in a folder.

    For each ``*.json`` file directly inside ``data_path`` the transcription
    is printed, and every word accepted by ``apply_filter`` is written,
    together with its start/end timestamps, to a file with the same name
    inside the ``output`` folder (relative to the current working directory).
    The ``output`` folder is created when it does not exist yet.

    data_path : relative path for the folder that contains the input .json files
    filters : list of strings; ``None`` (the default) means no filters

    Each input file is read as an object holding a "text" entry and a
    "segments" list whose items carry a "words" list of entries with
    "text", "start" and "end" keys.
    """
    if filters is None:
        filters = []

    # Get every .json file from data_path folder. os.path.join picks the
    # separator of the running platform instead of a hard-coded backslash.
    json_files = glob.glob(os.path.join(data_path, "*.json"), recursive=False)
    if json_files:
        os.makedirs("output", exist_ok=True)

    for file_path in json_files:
        # File name without folder and extension, reused for the output file
        json_file_name = os.path.splitext(os.path.basename(file_path))[0]

        # Reading data; the input is fully parsed before the output file is
        # touched, so a malformed input never leaves a truncated output behind.
        with open(file_path, encoding='utf-8') as json_file:
            data = json.load(json_file)

        transcript = data["text"]
        print(f"[{json_file_name}] Transcription: {transcript}\n")

        # Saving only the words that contain any filter
        output_text = [
            {'text': word['text'], 'timestamp': [word['start'], word['end']]}
            for segment in data["segments"]
            for word in segment["words"]
            if apply_filter(word['text'], filters)
        ]

        # Making .json output
        output_path = os.path.join("output", f"{json_file_name}.json")
        with open(output_path, 'w', encoding='utf-8') as json_output:
            json_output.write(json.dumps(output_text, indent=4, ensure_ascii=False))