/*
Compile and run:
clang -O3 -Wall -Wextra -Wpedantic -fsanitize=address -fsanitize=undefined -o ngram ngram.c && ./ngram
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
// ----------------------------------------------------------------------------
// utils
FILE *fopen_check(const char *path, const char *mode, const char *file, int line) {
FILE *fp = fopen(path, mode);
if (fp == NULL) {
fprintf(stderr, "Error: Failed to open file '%s' at %s:%d\n", path, file, line);
exit(EXIT_FAILURE);
}
return fp;
}
#define fopenCheck(path, mode) fopen_check(path, mode, __FILE__, __LINE__)
void *malloc_check(size_t size, const char *file, int line) {
void *ptr = malloc(size);
if (ptr == NULL) {
fprintf(stderr, "Error: Memory allocation failed at %s:%d\n", file, line);
exit(EXIT_FAILURE);
}
return ptr;
}
#define mallocCheck(size) malloc_check(size, __FILE__, __LINE__)
size_t powi(int base, int exp) {
// integer exponentiation utility
size_t result = 1;
for (int i = 0; i < exp; i++) {
result *= base;
}
return result;
}
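// e.g. the default 4-gram model over the 27-token vocabulary below needs
// powi(27, 4) == 531441 entries in its counts table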
// ----------------------------------------------------------------------------
// random number generation
uint32_t random_u32(uint64_t *state) {
// xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A
*state ^= *state >> 12;
*state ^= *state << 25;
*state ^= *state >> 27;
return (uint32_t)((*state * 0x2545F4914F6CDD1Dull) >> 32);
}
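// note: 0 is a fixed point of the xorshift transition, so the state must be
// seeded with a nonzero value (main() below uses 1337)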
float random_f32(uint64_t *state) {
// random float32 in [0,1)
return (random_u32(state) >> 8) / 16777216.0f;
}
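// 16777216 = 2^24; dropping the low 8 bits leaves a 24-bit integer, which a
// float's 24-bit significand represents exactly, so the largest possible
// result is 16777215/16777216, just below 1.0f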
// ----------------------------------------------------------------------------
// sampling
int sample_discrete(const float* probs, const int n, const float coinf) {
// sample from a discrete distribution
assert(coinf >= 0.0f && coinf < 1.0f);
float cdf = 0.0f;
for (int i = 0; i < n; i++) {
float probs_i = probs[i];
assert(probs_i >= 0.0f && probs_i <= 1.0f);
cdf += probs_i;
if (coinf < cdf) {
return i;
}
}
return n - 1; // in case of rounding errors
}
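// worked example: with probs = {0.2f, 0.5f, 0.3f} and coinf = 0.6f the
// running cdf is 0.2, then 0.7; the test coinf < cdf first passes at i = 1,
// so index 1 (holding 50% of the mass) is returned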
// ----------------------------------------------------------------------------
// tokenizer: convert strings <---> 1D integer sequences
// 26 lowercase letters + 1 end-of-text token
#define NUM_TOKENS 27
#define EOT_TOKEN 0
int tokenizer_encode(const char c) {
// characters a-z are encoded as 1-26, and '\n' is encoded as 0
assert(c == '\n' || ('a' <= c && c <= 'z'));
int token = (c == '\n') ? EOT_TOKEN : (c - 'a' + 1);
return token;
}
char tokenizer_decode(const int token) {
// token 0 is decoded as '\n', and tokens 1-26 are decoded as a-z
assert(token >= 0 && token < NUM_TOKENS);
char c = (token == EOT_TOKEN) ? '\n' : 'a' + (token - 1);
return c;
}
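// the two functions are inverses of each other, e.g.
// tokenizer_encode('c') == 3 and tokenizer_decode(3) == 'c', and
// tokenizer_encode('\n') == EOT_TOKEN == 0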
// ----------------------------------------------------------------------------
// ngram model
typedef struct {
// hyperparameters
int seq_len;
int vocab_size;
float smoothing;
// parameters
size_t num_counts; // size_t because int would only handle up to 2^31-1 ~= 2 billion counts
uint32_t* counts;
// internal buffer for ravel_index
int* ravel_buffer;
} NgramModel;
void ngram_init(NgramModel *model, const int vocab_size, const int seq_len, const float smoothing) {
// sanity check and store the hyperparameters
assert(vocab_size > 0);
assert(seq_len >= 1 && seq_len <= 6); // sanity check max ngram size we'll handle
model->vocab_size = vocab_size;
model->seq_len = seq_len;
model->smoothing = smoothing;
// allocate and init memory for counts (np.zeros in numpy)
model->num_counts = powi(vocab_size, seq_len);
model->counts = (uint32_t*)mallocCheck(model->num_counts * sizeof(uint32_t));
for (size_t i = 0; i < model->num_counts; i++) {
model->counts[i] = 0;
}
// allocate buffer we will use for ravel_index
model->ravel_buffer = (int*)mallocCheck(seq_len * sizeof(int));
}
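// sizing sketch: with the defaults (vocab_size = 27, seq_len = 4) the table
// holds 27^4 = 531441 uint32 counts, about 2.1 MB; every extra context token
// multiplies that by 27, so seq_len = 6 (the asserted maximum) already needs
// roughly 1.5 GB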
size_t ravel_index(const int* index, const int n, const int dim) {
// convert an n-dimensional index into a 1D index (ravel_multi_index in numpy)
// each index[i] is in the range [0, dim)
size_t index1d = 0;
size_t multiplier = 1;
for (int i = n - 1; i >= 0; i--) {
int ix = index[i];
assert(ix >= 0 && ix < dim);
index1d += multiplier * ix;
multiplier *= dim;
}
return index1d;
}
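// worked example with dim = 27: index = {1, 2, 3} ravels to
// 1*27*27 + 2*27 + 3 = 729 + 54 + 3 = 786; this is row-major (C) order,
// so the last axis varies fastest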
void ngram_train(NgramModel *model, const int* tape) {
// tape here is of length `seq_len`, and we want to update the counts
size_t offset = ravel_index(tape, model->seq_len, model->vocab_size);
    assert(offset < model->num_counts); // offset is a size_t, so it cannot be negative
model->counts[offset]++;
}
void ngram_inference(NgramModel *model, const int* tape, float* probs) {
// here, tape is of length `seq_len - 1`, and we want to predict the next token
// probs should be a pre-allocated buffer of size `vocab_size`
// copy the tape into the buffer and set the last element to zero
for (int i = 0; i < model->seq_len - 1; i++) {
model->ravel_buffer[i] = tape[i];
}
model->ravel_buffer[model->seq_len - 1] = 0;
// find the offset into the counts array based on the context
size_t offset = ravel_index(model->ravel_buffer, model->seq_len, model->vocab_size);
// seek to the row of counts for this context
uint32_t* counts_row = model->counts + offset;
    // calculate the sum of counts in the row, including the smoothing pseudo-counts
float row_sum = model->vocab_size * model->smoothing;
for (int i = 0; i < model->vocab_size; i++) {
row_sum += counts_row[i];
}
    if (row_sum == 0.0f) {
        // the entire row of counts is zero (possible only when smoothing == 0.0f
        // and this context never occurred in training), so use uniform probabilities
float uniform_prob = 1.0f / model->vocab_size;
for (int i = 0; i < model->vocab_size; i++) {
probs[i] = uniform_prob;
}
} else {
// normalize the row of counts into probabilities
float scale = 1.0f / row_sum;
for (int i = 0; i < model->vocab_size; i++) {
float counts_i = counts_row[i] + model->smoothing;
probs[i] = scale * counts_i;
}
}
}
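// the normalization above implements add-s ("Laplace") smoothing:
//   probs[i] = (counts[i] + s) / (sum_j counts[j] + V*s)
// with s = smoothing and V = vocab_size; any s > 0 gives every token nonzero
// probability, so the test loss below can never evaluate logf(0)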
void ngram_free(NgramModel *model) {
free(model->counts);
free(model->ravel_buffer);
}
// ----------------------------------------------------------------------------
// tape stores a fixed window of tokens, functions like a finite queue
typedef struct {
int n;
int length;
int* buffer;
} Tape;
void tape_init(Tape *tape, const int length) {
    // we will allow a buffer of length 0, useful for the unigram model
assert(length >= 0);
tape->length = length;
tape->n = 0; // counts the number of elements in the buffer up to max
tape->buffer = NULL;
    if (length > 0) {
        tape->buffer = (int*)mallocCheck(length * sizeof(int));
        // zero-init: tape_update shifts slots left before the tape is full,
        // so uninitialized memory would otherwise be (harmlessly) read
        for (int i = 0; i < length; i++) { tape->buffer[i] = 0; }
    }
}
void tape_set(Tape *tape, const int val) {
for (int i = 0; i < tape->length; i++) {
tape->buffer[i] = val;
}
}
int tape_update(Tape *tape, const int token) {
// returns 1 if the tape is ready/full, 0 otherwise
if (tape->length == 0) {
return 1; // unigram tape is always ready
}
// shift all elements to the left by one
for (int i = 0; i < tape->length - 1; i++) {
tape->buffer[i] = tape->buffer[i + 1];
}
// add the new token to the end (on the right)
tape->buffer[tape->length - 1] = token;
// keep track of when we've filled the tape
if (tape->n < tape->length) {
tape->n++;
}
return (tape->n == tape->length);
}
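// example trace with length = 3: if buffer is {1, 2, 3}, then
// tape_update(&tape, 5) leaves it as {2, 3, 5}; a ring buffer would make
// this O(1) instead of O(length), but for seq_len <= 6 the shift is cheap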
void tape_free(Tape *tape) {
free(tape->buffer);
}
// ----------------------------------------------------------------------------
// dataloader: iterates all windows of a given length in a text file
typedef struct {
FILE *file;
int seq_len;
Tape tape;
} DataLoader;
void dataloader_init(DataLoader *dataloader, const char *path, const int seq_len) {
dataloader->file = fopenCheck(path, "r");
dataloader->seq_len = seq_len;
tape_init(&dataloader->tape, seq_len);
}
int dataloader_next(DataLoader *dataloader) {
// returns 1 if a new window was read, 0 if the end of the file was reached
int c;
while (1) {
c = fgetc(dataloader->file);
if (c == EOF) {
break;
}
int token = tokenizer_encode(c);
int ready = tape_update(&dataloader->tape, token);
if (ready) {
return 1;
}
}
return 0;
}
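// e.g. with seq_len = 2, a file containing "ab\n" yields the windows
// (a,b) and (b,\n): the first call consumes 'a' (tape not yet full) and
// returns after 'b'; the second returns after '\n'; the third hits EOF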
void dataloader_free(DataLoader *dataloader) {
fclose(dataloader->file);
tape_free(&dataloader->tape);
}
// ----------------------------------------------------------------------------
void error_usage(void) {
fprintf(stderr, "Usage: ./ngram [options]\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -n <int> n-gram model arity (default 4)\n");
fprintf(stderr, " -s <float> smoothing factor (default 0.1)\n");
exit(EXIT_FAILURE);
}
int main(int argc, char *argv[]) {
// the arity of the n-gram model (1 = unigram, 2 = bigram, 3 = trigram, ...)
int seq_len = 4;
float smoothing = 0.1f;
// simple argparse, example usage: ./ngram -n 4 -s 0.1
for (int i = 1; i < argc; i+=2) {
if (i + 1 >= argc) { error_usage(); } // must have arg after flag
if (argv[i][0] != '-') { error_usage(); } // must start with dash
if (!(strlen(argv[i]) == 2)) { error_usage(); } // must be -x (one dash, one letter)
if (argv[i][1] == 'n') { seq_len = atoi(argv[i+1]); }
else if (argv[i][1] == 's') { smoothing = atof(argv[i+1]); }
else { error_usage(); }
}
// init the model
NgramModel model;
ngram_init(&model, NUM_TOKENS, seq_len, smoothing);
// train the model
DataLoader train_loader;
dataloader_init(&train_loader, "data/train.txt", seq_len);
while (dataloader_next(&train_loader)) {
ngram_train(&model, train_loader.tape.buffer);
}
dataloader_free(&train_loader);
// allocate probs buffer for inference
float* probs = (float*)mallocCheck(NUM_TOKENS * sizeof(float));
// sample from the model for 200 time steps
Tape sample_tape;
tape_init(&sample_tape, seq_len - 1);
tape_set(&sample_tape, EOT_TOKEN); // fill with EOT tokens to init
uint64_t rng = 1337;
for (int i = 0; i < 200; i++) {
ngram_inference(&model, sample_tape.buffer, probs);
float coinf = random_f32(&rng);
int token = sample_discrete(probs, NUM_TOKENS, coinf);
tape_update(&sample_tape, token);
char c = tokenizer_decode(token);
printf("%c", c);
}
printf("\n");
// evaluate the test split loss
DataLoader test_loader;
dataloader_init(&test_loader, "data/test.txt", seq_len);
float sum_loss = 0.0f;
int count = 0;
while (dataloader_next(&test_loader)) {
// note that ngram_inference will only use the first seq_len - 1 tokens in buffer
ngram_inference(&model, test_loader.tape.buffer, probs);
// and the last token in the tape buffer is the label
int target = test_loader.tape.buffer[seq_len - 1];
// negative log likelihood loss
sum_loss += -logf(probs[target]);
count++;
}
dataloader_free(&test_loader);
float mean_loss = sum_loss / count;
float test_perplexity = expf(mean_loss);
printf("test_loss %f, test_perplexity %f\n", mean_loss, test_perplexity);
    // cleanups
ngram_free(&model);
free(probs);
tape_free(&sample_tape);
return EXIT_SUCCESS;
}