-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpt3_build_embeddings.py
127 lines (85 loc) · 3.45 KB
/
gpt3_build_embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import openai
import pickle
import glob
import json
import util
from transformers import GPT2TokenizerFast
# Shared GPT-2 tokenizer, loaded once and reused for all token counting below.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return the number of GPT-2 tokens that *text* encodes to."""
    encoded = tokenizer.encode(text)
    return len(encoded)


# OpenAI model name passed to every Embedding.create call in this script.
EMBEDDING_MODEL = "text-embedding-ada-002"
# --- Daybook entries ---------------------------------------------------------
# Each daybook JSON file may carry GPT-3.5-extracted entries under
# data['gpt']['gpt3.5-daybook-json']; embed each entry that is long enough.
# Glob is materialized once instead of being walked twice.
daybook_files = list(glob.glob('daybook-and-diaries-1856-1906-daybook-1*/*.json'))
total_daybook = len(daybook_files)
done_counter = 0
for file in daybook_files:
    done_counter += 1
    print(done_counter, '/', total_daybook)
    with open(file) as fh:  # with-block closes the handle (was leaked before)
        data = json.load(fh)
    # for the daybooks we use the extracted text which doesn't have the date
    if 'gpt' in data and 'gpt3.5-daybook-json' in data['gpt']:
        for entry in data['gpt']['gpt3.5-daybook-json']:
            if 'embedding' in entry:
                # Already embedded on a previous run — skip to keep reruns cheap.
                print('skip entry')
                continue
            if entry['fullText'] is None:
                continue
            # Only embed entries with enough content to be meaningful.
            if count_tokens(entry['fullText']) > 40:
                print(file)
                text = entry['fullText'].replace('\n', ' ')
                text = util.clean_up_transcribed_text(text)
                print(text)
                result = openai.Embedding.create(
                    model=EMBEDDING_MODEL,
                    input=text
                )
                entry['embedding'] = result["data"][0]["embedding"]
    # Persist after each file so progress survives an interruption.
    with open(file, 'w') as fh:
        json.dump(data, fh, indent=2)
# --- Speeches and other writings --------------------------------------------
# Each file is a list of text blocks; embed every non-empty block.
writing_files = list(glob.glob('anthony-speeches-and-other-writings-resources/*.json'))
total_writtings = len(writing_files)
done_counter = 0
for file in writing_files:
    done_counter += 1
    print(done_counter, '/', total_writtings)
    print(file)
    with open(file) as fh:  # with-block closes the handle (was leaked before)
        data = json.load(fh)
    for block in data:
        if 'embedding' in block:
            # Consistent with the daybook loop: don't re-bill the API on reruns.
            continue
        if len(block['text']) > 0:
            text = block['text']
            text = text.replace('\n', ' ')
            text = util.clean_up_transcribed_text(text)
            result = openai.Embedding.create(
                model=EMBEDDING_MODEL,
                input=text
            )
            block['embedding'] = result["data"][0]["embedding"]
    # Persist after each file so progress survives an interruption.
    with open(file, 'w') as fh:
        json.dump(data, fh, indent=2)
# --- Correspondence ----------------------------------------------------------
# Correspondence files get ONE embedding per document: all item full_texts are
# concatenated and embedded together.
correspondence_files = list(glob.glob('anthony-correspondence-resources/*.json'))
total_writtings = len(correspondence_files)
done_counter = 0
for file in correspondence_files:
    done_counter += 1
    print(done_counter, '/', total_writtings)
    print(file)
    with open(file) as fh:  # with-block closes the handle (was leaked before)
        data = json.load(fh)
    # Collect cleaned texts, then join once (the old += loop was quadratic).
    # ''.join of ' ' + part reproduces the original leading-space concatenation.
    parts = []
    for item in data['items']:
        if 'full_text' in item:
            text = item['full_text'].replace('\n', ' ')
            text = util.clean_up_transcribed_text(text)
            parts.append(text)
    all_text = ''.join(' ' + part for part in parts)
    # Count tokens once (was computed twice); purely informational logging.
    token_count = count_tokens(all_text)
    if token_count <= 1000:
        print("<1000")
    else:
        print("bigger", token_count)
    result = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=all_text
    )
    data['embedding'] = result["data"][0]["embedding"]
    # Persist after each file so progress survives an interruption.
    with open(file, 'w') as fh:
        json.dump(data, fh, indent=2)