chunkChecker.py
"""
Utility to check a chunk list for token counts and potential issues
TODO:
- turn this into a widget to use in the GUI
- in the GUI, show prior and following chunk to allow assessment
"""
import json
import tokensToUTF
from GPT2.encoder import get_encoder
inPath = ""
inFileName = "chapter 1"
tokensPerChunk = 65  # maximum number of tokens a chunk should have
lowTokenBoundary = 20  # chunks at or below this count are flagged as very short
inJSONfilePath = f"{inPath}{inFileName}_{tokensPerChunk}tkChunks.json"

# read the chunk list (a JSON array of text chunks) from the prepared file
with open(inJSONfilePath, "r", encoding="utf-8") as inFile:
    chunkList = json.loads(inFile.read())

fixEncodes = tokensToUTF.getFixEncodes()  # token-to-UTF fix map; not used by the checks below
encoder = get_encoder()  # GPT-2 BPE encoder used to count tokens per chunk
# flag chunks that exceed the token limit or are suspiciously short
for chunk in chunkList:
    chunkTokens = encoder.encode(chunk)
    if len(chunkTokens) > tokensPerChunk:
        print(f"'{chunk}'\nhas {len(chunkTokens)} tokens, which is {len(chunkTokens) - tokensPerChunk} too many!\n")
    if len(chunkTokens) <= lowTokenBoundary:
        print(f"'{chunk}'\nhas {len(chunkTokens)} tokens, which is very few!\n")
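
# Example input (a sketch, not part of the original tool): the loop above passes
# each element of chunkList straight to encoder.encode(), so the input file is
# presumably a flat JSON array of strings. The helper below is hypothetical and
# is only defined, never called; it writes a minimal file in that assumed format,
# following the same "<name>_<tokensPerChunk>tkChunks.json" naming pattern used
# for inJSONfilePath above.
def writeSampleChunkFile(path=f"{inPath}sample_{tokensPerChunk}tkChunks.json"):
    sampleChunks = [
        "Too short.",  # few tokens, should trigger the low-token warning
        "A chunk of ordinary length that stays comfortably under the configured "
        "limit of tokens per chunk and should raise no warning at all.",
    ]
    with open(path, "w", encoding="utf-8") as sampleFile:
        json.dump(sampleChunks, sampleFile, ensure_ascii=False, indent=2)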