gpt3_parse_daybooks.py
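
"""Parse Susan B. Anthony daybook transcriptions into structured JSON with GPT-3.

For each page-level JSON transcription file, this script prompts
text-davinci-003 to structure the diary entries, patches common defects in the
returned JSON, and writes the parsed result back into the file under
data['gpt']['daybook-json'].
"""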
import glob
import json
import os
import re
import sys

import openai
from transformers import GPT2TokenizerFast

import util

# the GPT-2 tokenizer is used to estimate prompt length when sizing max_tokens below
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """Count the number of tokens in a string."""
    return len(tokenizer.encode(text))


openai.api_key = os.getenv("OPENAI_API_KEY")
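
# usage: python gpt3_parse_daybooks.py <dir-substring>|all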
if len(sys.argv[1:]) != 1:
    print("Pass what dir to work on, or 'all'")
    sys.exit(1)
work_on = sys.argv[1]

for file in glob.glob('daybook-and-diaries-1856-1906-daybook-1*/*.json'):
    print(file)
    with open(file) as f:
        data = json.load(f)
    dir_name = file.split('/')[-2]
    file_id = int(file.split('/')[-1].replace('.json', ''))
    data['id'] = file_id
    data['dir'] = dir_name
    if work_on != 'all' and work_on not in dir_name:
        print('skipping', dir_name)
        continue
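    # skip pages without a transcription, and pages already parsed on a
    # previous run, so the script can be re-run to pick up failures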
    if 'full_text' not in data:
        continue
    if 'gpt' not in data:
        data['gpt'] = {}
    if 'daybook-json' in data['gpt']:
        continue
    print("Working on:", file)
    full_text = util.clean_up_transcribed_text(data['full_text'])
    if len(full_text) < 50:
        continue
prompt = f"Using only the text below. Structure the following multiple diary text entries by Susan B Anthony into a valid JSON array of dictionaries extracting the date, the date again in the format yyyy-mm-dd, the city or state it was written in, other geographical locations mentioned that day, people mentioned that day, and the complete full text of the entry and a one sentence summary of the text, using the JSON keys date, dateFormated, cityOrState, geographical, people, and fullText, summaryText:\n\n---\n{full_text}\n---\n"
    print('----PROMPT----')
    print(prompt)
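    # budget the completion: the prompt plus the completion must fit the
    # model's 4096-token context window, measured with count_tokens() above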
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.0,
        max_tokens=4096 - count_tokens(prompt),
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
print("response['choices'][0]")
print(response['choices'][0])
text_response=response['choices'][0]['text']
print('---------text_response before=========')
print(text_response)
if response['choices'][0]['finish_reason'] == "length":
print('daybook-length-too-long')
data['gpt']['error'] = 'daybook-length-too-long'
continue
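    # --- repair the model's JSON before parsing ---
    # text-davinci-003 often returns almost-valid JSON: stray escapes, comment
    # markers like "// etc.", empty values, and unescaped quotes inside
    # fullText; the replacements below patch the common cases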
    if text_response.find('[') == -1:
        data['gpt']['error'] = 'daybook-json'
    else:
        # trim off any extra text the model emitted before the JSON array
        if text_response.find('[') > 0:
            text_response = text_response[text_response.find('['):].strip()
        # strip escape artifacts and stray non-JSON wrappers
        text_response = text_response.replace('\\\\', "")
        text_response = text_response.replace('\\"', "'")
        text_response = text_response.replace('("', "(")
        text_response = text_response.replace('")', ")")
        text_response = text_response.replace('"-', "'")
        text_response = text_response.replace('""', '')
        text_response = text_response.replace('`', '')
        text_response = text_response.replace('JSON:', '')
        text_response = text_response.replace('[JSON]', '')
        text_response = text_response.replace('// etc.', '')
        text_response = text_response.replace('// and so on...', '')
        text_response = text_response.replace('// ...', '')
        # fill in null where the model left a value empty
        text_response = text_response.replace('"cityOrState": ,', '"cityOrState": null,')
        text_response = text_response.replace('"dateFormatted": ,', '"dateFormatted": null,')
        text_response = text_response.replace('"dateFormated": ,', '"dateFormated": null,')
        text_response = text_response.replace('"geographical": ,', '"geographical": null,')
        text_response = text_response.replace('"date": ,', '"date": null,')
        text_response = text_response.replace('"people": ,', '"people": null,')
        text_response = text_response.replace('"fullText": ,', '"fullText": null,')
        text_response = text_response.replace('"summaryText":\n', '"summaryText": null\n')
        text_response = text_response.replace('"summaryText": \n', '"summaryText": null\n')
        text_response = text_response.strip()
        # remove any trailing "}," left before the closing bracket
        text_response = re.sub(r'},\n\s*\n\]', '}]', text_response)
        # find the fullText values
        fulltext_searches = re.finditer(r'"fullText":(.*)', text_response, re.IGNORECASE)
        fulltext_searches_findall = re.findall(r'"fullText":(.*)', text_response, re.IGNORECASE)
        print('-------text_response after replace cleanup')
        print(text_response)
        print('------')
        print("fulltext:")
        print("fulltext_searches", list(fulltext_searches))
        print('fulltext_searches_findall', fulltext_searches_findall)
        if len(fulltext_searches_findall) == 0:
            print("fulltext_search failed, try on next run")
            continue
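        # re-quote each fullText value: backslashes and embedded double quotes
        # are stripped so the value parses as a single JSON string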
        for fulltext_search in fulltext_searches_findall:
            print('fulltext_search', fulltext_search)
            if fulltext_search.strip() == 'null,':
                continue
            replace_with = fulltext_search
            replace_with = replace_with.replace("\\", '')
            replace_with = replace_with.replace('"', '')
            replace_with = replace_with.replace('\\ ', '')
            replace_with = f'"{replace_with}",'
            print("Replacing with:", replace_with)
            text_response = text_response.replace(fulltext_search, replace_with)
        print('------text_response post fulltext regex')
        print(text_response)
        # # pull out the geographical key and parse it; if that fails, set it to
        # # empty, since it is too complicated to work out why a JSON array is malformed
        # geo_search = re.search(r'"geographical":(.*)', text_response, re.IGNORECASE)
        # geo_text = geo_search.group(1).strip()
        # if geo_text[-1] == ',':
        #     geo_text = geo_text[0:-1].strip()
        # try:
        #     json.loads(geo_text)
        # except:
        #     print(text_response)
        #     print("geographical parse failed, setting it to empty")
        #     text_response = text_response.replace(geo_search.group(1), '[],')
        print('---------')
        print(text_response)
        print('---------')
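        # if parsing still fails here, the exception stops the script; nothing
        # was saved for this file, so it is retried on the next run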
        jsonResponse = json.loads(text_response)
        data['gpt']['daybook-json'] = jsonResponse
    with open(file, 'w') as f:
        json.dump(data, f, indent=2)