// ngram.c
// == STEP 1: Include necessary standard libraries ==
#include <math.h> // For mathematical functions like log and exp
#include <stdio.h> // For input/output operations
#include <stdlib.h> // For memory allocation and program control
#include <string.h> // For string manipulation functions
#include <stdint.h> // For fixed-width integer types
#include <assert.h> // For the assert macro used in debugging
// ----------------------------------------------------------------------------------
// == STEP 2: utility functions ==
/**
* Computes integer exponentiation.
*
* @param base The base number
* @param exp The exponent
* @return size_t The result of base raised to the power of exp
*/
size_t powi(int base, int exp)
{
size_t result = 1;
for (int i = 0; i < exp; i++)
{
result *= base;
}
return result;
}
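// Worked example (illustrative): with this file's 27-token vocabulary, a
// 4-gram model needs powi(27, 4) = 531441 count slots:
//
//     size_t slots = powi(27, 4); // 729 * 729 = 531441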
/**
* Safely opens a file and checks for errors.
*
* @param path The path to the file to be opened
* @param mode The mode in which to open the file (e.g., "r" for read)
* @param file The name of the source file calling this function (__FILE__)
* @param line The line number where this function is called (__LINE__)
* @return FILE* A pointer to the opened file
*/
FILE *fopen_check(const char *path, const char *mode, const char *file, int line)
{
FILE *fp = fopen(path, mode);
if (fp == NULL)
{
// If file opening fails, print an error message and exit the program
fprintf(stderr, "Error: Failed to open file '%s' at %s:%d\n", path, file, line);
exit(EXIT_FAILURE);
}
return fp;
}
// Macro to automatically pass __FILE__ and __LINE__ to fopen_check
#define fopenCheck(path, mode) fopen_check(path, mode, __FILE__, __LINE__)
/**
* Safely allocates memory and checks for errors.
*
* @param size The number of bytes to allocate
* @param file The name of the source file calling this function (__FILE__)
* @param line The line number where this function is called (__LINE__)
* @return void* A pointer to the allocated memory
*/
void *malloc_check(size_t size, const char *file, int line)
{
void *ptr = malloc(size);
if (ptr == NULL)
{
// If memory allocation fails, print an error message and exit the program
fprintf(stderr, "Error: Memory allocation failed at %s:%d\n", file, line);
exit(EXIT_FAILURE);
}
return ptr;
}
// Macro to automatically pass __FILE__ and __LINE__ to malloc_check
#define mallocCheck(size) malloc_check(size, __FILE__, __LINE__)
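// Usage sketch (mirrors how main() uses these helpers below; the exact size
// and path are just examples):
//
//     FILE *fp = fopenCheck("data/train.txt", "r");
//     float *probs = (float *)mallocCheck(27 * sizeof(float));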
// ----------------------------------------------------------------------------------
// == STEP 3: tokenizer: convert strings <---> 1D integer sequences ==
// 26 lowercase letters + 1 end-of-text token
// Define the number of tokens in our vocabulary
#define NUM_TOKENS 27
// Define the end-of-text token
#define EOT_TOKEN 0
/**
* Encodes a character to its corresponding token ID.
*
* @param c The character to encode
* @return int The token ID of the character
*/
int tokenizer_encode(const char c)
{
// characters a-z are encoded as 1-26, and '\n' is encoded as 0
assert(c == '\n' || ('a' <= c && c <= 'z'));
int token = (c == '\n') ? EOT_TOKEN : (c - 'a' + 1);
return token;
}
/**
* Decodes a token ID back to its corresponding character.
*
* @param token The token ID to decode
* @return char The character corresponding to the token ID
*/
char tokenizer_decode(const int token)
{
// token 0 is decoded as '\n', and tokens 1-26 are decoded as a-z
assert(token >= 0 && token < NUM_TOKENS);
char c = (token == EOT_TOKEN) ? '\n' : 'a' + (token - 1);
return c;
}
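// Sanity examples (illustrative): tokenizer_encode('a') == 1,
// tokenizer_encode('z') == 26, tokenizer_encode('\n') == EOT_TOKEN == 0, and
// tokenizer_decode inverts tokenizer_encode for every valid character.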
// ----------------------------------------------------------------------------------
// == STEP 4: tape: stores a fixed window of tokens, functions like a finite queue ==
/**
* Structure representing a fixed-size buffer of tokens.
*/
typedef struct
{
int n; // Current number of elements in the buffer
int length; // Maximum length of the buffer
int *buffer; // Array to store the tokens
} Tape;
/**
* Initializes a Tape structure.
*
* @param tape Pointer to the Tape structure
* @param length Maximum length of the tape
*/
void tape_init(Tape *tape, const int length)
{
// we will allow a buffer of length 0, useful for the Unigram model
assert(length >= 0);
tape->length = length;
tape->n = 0; // counts the number of elements in the buffer up to max
tape->buffer = NULL;
if (length > 0)
{
tape->buffer = (int *)mallocCheck(length * sizeof(int));
}
}
/**
* Sets all elements in the tape to a given value.
*
* @param tape Pointer to the Tape structure
* @param val Value to set all elements to
*/
void tape_set(Tape *tape, const int val)
{
for (int i = 0; i < tape->length; i++)
{
tape->buffer[i] = val;
}
}
/**
* Updates the tape with a new token.
*
* @param tape Pointer to the Tape structure
* @param token New token to add to the tape
* @return int 1 if the tape is full, 0 otherwise
*/
int tape_update(Tape *tape, const int token)
{
// returns 1 if the tape is ready/full, 0 otherwise
if (tape->length == 0)
{
return 1; // unigram tape is always ready
}
// Shift all elements to the left by one
for (int i = 0; i < tape->length - 1; i++)
{
tape->buffer[i] = tape->buffer[i + 1];
}
// Add the new token to the end
tape->buffer[tape->length - 1] = token;
// Keep track of when we've filled the tape
if (tape->n < tape->length)
{
tape->n++;
}
return (tape->n == tape->length);
}
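// Worked example (illustrative): for a tape of length 3, pushing tokens
// 1, 2, 3 leaves the buffer as {1, 2, 3} and tape_update returns 1 (full);
// pushing 4 slides the window to {2, 3, 4}. A length-0 tape always reports
// ready, which is exactly what the unigram model needs.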
/**
* Frees the memory allocated for the tape.
*
* @param tape Pointer to the Tape structure
*/
void tape_free(Tape *tape)
{
free(tape->buffer);
}
// ----------------------------------------------------------------------------------
// == STEP 5: n-gram modelling ==
/**
* Structure representing the N-gram model.
*/
typedef struct
{
// hyperparameters
int seq_len; // Length of the sequence (n in n-gram)
int vocab_size; // Size of the vocabulary
float smoothing; // Smoothing factor for probability calculation
// parameters
size_t num_counts; // Total number of count entries (size_t because int would only handle up to 2^31-1 ~= 2 billion counts)
uint32_t *counts; // Array to store counts
// internal buffer for ravel_index
int *ravel_buffer; // Buffer for index calculations
} NgramModel;
/**
* Initializes the N-gram model.
*
* @param model Pointer to the NgramModel structure
* @param vocab_size Size of the vocabulary
* @param seq_len Length of the sequence (n in n-gram)
* @param smoothing Smoothing factor for probability calculation
*/
void ngram_init(NgramModel *model, const int vocab_size, const int seq_len, const float smoothing)
{
// sanity check and store the hyperparameters
assert(vocab_size > 0);
assert(seq_len >= 1 && seq_len <= 6); // sanity check max ngram size we'll handle
model->vocab_size = vocab_size;
model->seq_len = seq_len;
model->smoothing = smoothing;
// allocate and init memory for counts (np.zeros in numpy)
// Calculate total number of possible n-grams
model->num_counts = powi(vocab_size, seq_len);
// Allocate memory for counts array
model->counts = (uint32_t *)mallocCheck(model->num_counts * sizeof(uint32_t));
// Initialize all counts to zero
for (size_t i = 0; i < model->num_counts; i++)
{
model->counts[i] = 0;
}
// allocate the buffer we will use for ravel_index
model->ravel_buffer = (int *)mallocCheck(seq_len * sizeof(int));
}
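// Footprint note: with the defaults used in main() below (27 tokens, 4-gram),
// this allocates 27^4 = 531441 uint32 counts, roughly 2 MB.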
/**
* Converts a multi-dimensional index to a 1D index.
*
* @param index Array of indices
* @param n Length of the index array
* @param dim Dimension size
* @return size_t The calculated 1D index
*/
size_t ravel_index(const int *index, const int n, const int dim)
{
// convert an n-dimensional index into a 1D index (ravel_multi_index in numpy)
// each index[i] is in the range [0, dim)
size_t index1d = 0;
size_t multiplier = 1;
for (int i = n - 1; i >= 0; i--)
{
int ix = index[i];
assert(ix >= 0 && ix < dim);
index1d += multiplier * ix;
multiplier *= dim;
}
return index1d;
}
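// Worked example (illustrative): with n = 3 and dim = 27, the index {1, 2, 3}
// flattens to 1*27*27 + 2*27 + 3 = 729 + 54 + 3 = 786, matching numpy's
// ravel_multi_index in row-major order.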
/**
* Frees the memory allocated for the N-gram model.
*
* @param model Pointer to the NgramModel structure
*/
void ngram_free(NgramModel *model)
{
free(model->counts);
free(model->ravel_buffer);
}
// ----------------------------------------------------------------------------------
// == STEP 6: dataloader: iterates all windows of a given length in a text file ==
/**
* Structure representing a data loader for reading from a file.
*/
typedef struct
{
FILE *file; // File pointer
int seq_len; // Length of sequences to read
Tape tape; // Tape to store the current sequence
} DataLoader;
/**
* Initializes a DataLoader structure.
*
* @param dataloader Pointer to the DataLoader structure
* @param path Path to the input file
* @param seq_len Length of sequences to read
*/
void dataloader_init(DataLoader *dataloader, const char *path, const int seq_len)
{
dataloader->file = fopenCheck(path, "r");
dataloader->seq_len = seq_len;
tape_init(&dataloader->tape, seq_len);
}
/**
* Reads the next sequence from the file.
*
* @param dataloader Pointer to the DataLoader structure
* @return int 1 if a new sequence was read, 0 if end of file was reached
*/
int dataloader_next(DataLoader *dataloader)
{
// returns 1 if a new window was read, 0 if the end of the file was reached
int c;
while (1)
{
c = fgetc(dataloader->file);
if (c == EOF)
{
break;
}
int token = tokenizer_encode(c);
int ready = tape_update(&dataloader->tape, token);
if (ready)
{
return 1;
}
}
return 0;
}
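// Worked example (illustrative): with seq_len = 2, a file containing "abc"
// yields the windows "ab" then "bc" (token IDs {1, 2} then {2, 3}); each call
// advances the window by one character.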
/**
* Frees resources associated with the DataLoader.
*
* @param dataloader Pointer to the DataLoader structure
*/
void dataloader_free(DataLoader *dataloader)
{
fclose(dataloader->file);
tape_free(&dataloader->tape);
}
// ----------------------------------------------------------------------------------
// == STEP 7: core ngram modelling ==
/**
* Updates the model during training.
*
* @param model Pointer to the NgramModel structure
* @param tape Array of tokens representing the current n-gram
*/
void ngram_train(NgramModel *model, const int *tape)
{
// tape here is of length `seq_len`, and we want to update the counts
// Calculate the 1D index for this n-gram
size_t offset = ravel_index(tape, model->seq_len, model->vocab_size);
assert(offset < model->num_counts); // offset is unsigned, so only the upper bound needs checking
// Increment the count for this n-gram
model->counts[offset]++;
}
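// Note (illustrative): since ravel_index is row-major, training on the window
// (c_1, ..., c_n) increments the entry at row (c_1, ..., c_{n-1}), column c_n,
// which is exactly the row that ngram_inference reads back below.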
/**
* Performs inference with the trained model.
*
* @param model Pointer to the NgramModel structure
* @param tape Array of tokens representing the context
* @param probs Array to store the calculated probabilities
*/
void ngram_inference(NgramModel *model, const int *tape, float *probs)
{
// here, tape is of length `seq_len - 1`, and we want to predict the next token
// probs should be a pre-allocated buffer of size `vocab_size`
// Copy the context into the ravel buffer and set the last element to zero
for (int i = 0; i < model->seq_len - 1; i++)
{
model->ravel_buffer[i] = tape[i];
}
model->ravel_buffer[model->seq_len - 1] = 0;
// Find the offset into the counts array based on the context
size_t offset = ravel_index(model->ravel_buffer, model->seq_len, model->vocab_size);
assert(offset < model->num_counts);
// Seek to the row of counts for this context
uint32_t *counts_row = model->counts + offset;
// Calculate the sum of counts in the row
float row_sum = model->vocab_size * model->smoothing;
for (int i = 0; i < model->vocab_size; i++)
{
row_sum += counts_row[i];
}
if (row_sum == 0.0f)
{
// The entire row of counts is zero (only possible when smoothing == 0), so fall back to uniform probabilities
float uniform_prob = 1.0f / model->vocab_size;
for (int i = 0; i < model->vocab_size; i++)
{
probs[i] = uniform_prob;
}
}
else
{
// Normalize the smoothed counts into probabilities
float scale = 1.0f / row_sum;
for (int i = 0; i < model->vocab_size; i++)
{
float counts_i = counts_row[i] + model->smoothing;
probs[i] = scale * counts_i;
}
}
}
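// In formula form this is add-s smoothing (add-one, or "Laplace", when s = 1):
//
//     P(i | context) = (count[i] + s) / (sum_j count[j] + V * s)
//
// with V = vocab_size and s = smoothing. When s = 0 and the context was never
// seen during training, the row sum is zero and the code falls back to a
// uniform 1/V distribution.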
// ----------------------------------------------------------------------------------
// == STEP 8: random number generation ==
/**
* Generates a random 32-bit unsigned integer using the xorshift* algorithm.
*
* @param state Pointer to the 64-bit state of the random number generator
* @return uint32_t A random 32-bit unsigned integer
*/
uint32_t random_u32(uint64_t *state)
{
// xorshift* algorithm: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A
*state ^= *state >> 12;
*state ^= *state << 25;
*state ^= *state >> 27;
return (uint32_t)((*state * 0x2545F4914F6CDD1Dull) >> 32);
}
/**
* Generates a random float32 between 0 and 1.
*
* @param state Pointer to the 64-bit state of the random number generator
* @return float A random float between 0 and 1
*/
float random_f32(uint64_t *state)
{
return (random_u32(state) >> 8) / 16777216.0f;
}
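// Note: the >> 8 keeps the top 24 bits, and dividing by 2^24 (16777216) maps
// them to an evenly spaced float in [0, 1); 24 bits matches the precision of
// a float32 mantissa, so every such value is exactly representable.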
// ----------------------------------------------------------------------------------
// == STEP 9: sampling ==
/**
* Samples from a discrete probability distribution.
*
* @param probs Array of probabilities
* @param n Length of the probs array
* @param coinf A random float between 0 and 1
* @return int The index of the sampled element
*/
int sample_discrete(const float *probs, const int n, const float coinf)
{
assert(coinf >= 0.0f && coinf < 1.0f);
float cdf = 0.0f;
for (int i = 0; i < n; i++)
{
float probs_i = probs[i];
assert(probs_i >= 0.0f && probs_i <= 1.0f);
cdf += probs_i;
if (coinf < cdf)
{
return i;
}
}
return n - 1; // in case of rounding errors
}
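// Worked example (illustrative): for probs = {0.2, 0.5, 0.3} and coinf = 0.6,
// the running cdf is 0.2, then 0.7; 0.6 < 0.7 first holds at index 1, so
// token 1 is sampled. Returning n - 1 at the end guards against the cdf
// summing to slightly less than 1.0 due to float rounding.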
// ----------------------------------------------------------------------------------
// == STEP 10: error handling and cleanup ==
/**
* Prints usage information and exits the program.
*/
void error_usage(void)
{
fprintf(stderr, "Usage: ./ngram [options]\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -n <int> n-gram model arity (default 4)\n");
fprintf(stderr, " -s <float> smoothing factor (default 0.1)\n");
exit(EXIT_FAILURE);
}
/**
* Main function of the program.
*
* @param argc Number of command-line arguments
* @param argv Array of command-line argument strings
* @return int Exit status of the program
*/
int main(int argc, char *argv[])
{
// Default hyperparameters: the n-gram arity (1 = unigram, 2 = bigram, 3 = trigram, ...) and the smoothing factor
int seq_len = 4;
float smoothing = 0.1f;
// Parse command-line arguments (example usage: ./ngram -n 4 -s 0.1)
for (int i = 1; i < argc; i += 2)
{
// must have arg after flag
if (i + 1 >= argc)
{
error_usage();
}
// must start with dash
if (argv[i][0] != '-')
{
error_usage();
}
// must be -x (one dash, one letter)
if (!(strlen(argv[i]) == 2))
{
error_usage();
}
if (argv[i][1] == 'n')
{
seq_len = atoi(argv[i + 1]);
}
else if (argv[i][1] == 's')
{
smoothing = atof(argv[i + 1]);
}
else
{
error_usage();
}
}
// Initialize the n-gram model
NgramModel model;
ngram_init(&model, NUM_TOKENS, seq_len, smoothing);
// Train the model using the training data
DataLoader train_loader;
dataloader_init(&train_loader, "data/train.txt", seq_len);
while (dataloader_next(&train_loader))
{
ngram_train(&model, train_loader.tape.buffer);
}
dataloader_free(&train_loader);
// Allocate the probs buffer used during inference
float *probs = (float *)mallocCheck(NUM_TOKENS * sizeof(float));
// Sample from the model for 200 time steps
Tape sample_tape;
tape_init(&sample_tape, seq_len - 1);
tape_set(&sample_tape, EOT_TOKEN); // Initialize with EOT tokens
uint64_t rng = 1337; // Seed for random number generator
for (int i = 0; i < 200; i++)
{
ngram_inference(&model, sample_tape.buffer, probs);
float coinf = random_f32(&rng);
int token = sample_discrete(probs, NUM_TOKENS, coinf);
tape_update(&sample_tape, token);
char c = tokenizer_decode(token);
printf("%c", c);
}
printf("\n");
// Evaluate the model on the test data
DataLoader test_loader;
dataloader_init(&test_loader, "data/test.txt", seq_len);
float sum_loss = 0.0f;
int count = 0;
while (dataloader_next(&test_loader))
{
// note that ngram_inference will only use the first seq_len - 1 tokens in buffer
ngram_inference(&model, test_loader.tape.buffer, probs);
// and the last token in the tape buffer is the label
int target = test_loader.tape.buffer[seq_len - 1];
// negative log likelihood loss
sum_loss += -logf(probs[target]);
count++;
}
dataloader_free(&test_loader);
// Calculate and print test loss and perplexity
float mean_loss = sum_loss / count;
float test_perplexity = expf(mean_loss);
printf("test_loss %f, test_perplexity %f\n", mean_loss, test_perplexity);
// Clean up resources
ngram_free(&model);
free(probs);
tape_free(&sample_tape);
return EXIT_SUCCESS;
}
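// One plausible way to build and run (an assumption, not part of the original
// file; -lm links the math library for logf/expf, and data/train.txt and
// data/test.txt must exist):
//
//     cc -O2 -o ngram ngram.c -lm
//     ./ngram -n 4 -s 0.1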