@@ -156,21 +156,37 @@ private void CacheSpecialTokensEncoding(IReadOnlyDictionary<string, int>? specia
156
156
internal static async ValueTask < ( Dictionary < ReadOnlyMemory < byte > , int > , Dictionary < StringSpanOrdinalKey , ( int Id , string Token ) > , Dictionary < int , ReadOnlyMemory < byte > > ) > LoadTiktokenBpeAsync (
157
157
Stream vocabStream , bool useAsync , CancellationToken cancellationToken = default )
158
158
{
159
- var encoder = new Dictionary < ReadOnlyMemory < byte > , int > ( ReadOnlyMemoryByteComparer . Instance ) ;
160
- var vocab = new Dictionary < StringSpanOrdinalKey , ( int Id , string Token ) > ( ) ;
161
- var decoder = new Dictionary < int , ReadOnlyMemory < byte > > ( ) ;
159
+ Dictionary < ReadOnlyMemory < byte > , int > encoder ;
160
+ Dictionary < StringSpanOrdinalKey , ( int Id , string Token ) > vocab ;
161
+ Dictionary < int , ReadOnlyMemory < byte > > decoder ;
162
162
163
163
try
164
164
{
165
165
// Don't dispose the reader as it will dispose the underlying stream vocabStream. The caller is responsible for disposing the stream.
166
166
StreamReader reader = new StreamReader ( vocabStream ) ;
167
- string ? line ;
168
- do
167
+ string ? line = useAsync ? await Helpers . ReadLineAsync ( reader , cancellationToken ) . ConfigureAwait ( false ) : reader . ReadLine ( ) ;
168
+
169
+ const string capacity = "Capacity: " ;
170
+ int suggestedCapacity = 0 ; // default capacity
171
+ if ( line is not null && line . StartsWith ( capacity , StringComparison . Ordinal ) )
169
172
{
170
- line = useAsync ?
171
- await Helpers . ReadLineAsync ( reader , cancellationToken ) . ConfigureAwait ( false ) :
172
- reader . ReadLine ( ) ;
173
- } while ( line is not null && line . Length == 0 ) ;
173
+ if ( ! Helpers . TryParseInt32 ( line , capacity . Length , out suggestedCapacity ) )
174
+ {
175
+ throw new FormatException ( $ "Invalid format in the BPE vocab file stream") ;
176
+ }
177
+
178
+ line = useAsync ? await Helpers . ReadLineAsync ( reader , cancellationToken ) . ConfigureAwait ( false ) : reader . ReadLine ( ) ;
179
+ }
180
+
181
+ encoder = new Dictionary < ReadOnlyMemory < byte > , int > ( suggestedCapacity , ReadOnlyMemoryByteComparer . Instance ) ;
182
+ vocab = new Dictionary < StringSpanOrdinalKey , ( int Id , string Token ) > ( suggestedCapacity ) ;
183
+ decoder = new Dictionary < int , ReadOnlyMemory < byte > > ( suggestedCapacity ) ;
184
+
185
+ // skip empty lines
186
+ while ( line is not null && line . Length == 0 )
187
+ {
188
+ line = useAsync ? await Helpers . ReadLineAsync ( reader , cancellationToken ) . ConfigureAwait ( false ) : reader . ReadLine ( ) ;
189
+ }
174
190
175
191
if ( line is not null && line . IndexOf ( ' ' ) < 0 )
176
192
{
0 commit comments