Merge pull request #658 from epfml/656-improve-gpt-julien
Fix gpt-tfjs bugs, add tests and refactor code
JulienVig authored Apr 16, 2024
2 parents d1e2be5 + a80c403 commit 3b30992
Showing 27 changed files with 933 additions and 770 deletions.
2 changes: 1 addition & 1 deletion discojs/discojs-core/package.json
@@ -31,9 +31,9 @@
"ws": "8"
},
"devDependencies": {
"@tensorflow/tfjs-node": "4",
"@types/chai": "4",
"@types/mocha": "10",
"@types/msgpack-lite": "0.1",
"@types/simple-peer": "9",
"chai": "5",
"mocha": "10",
@@ -0,0 +1,97 @@
import { TEXT_PREPROCESSING } from './index.js'
import { expect } from 'chai'

import type { Task } from '../../../index.js'
import * as tf from '@tensorflow/tfjs'

describe('text preprocessing', function () {
const [tokenize, leftPadding] = TEXT_PREPROCESSING
// Use a function to create a different task object for each test (otherwise the tokenizer gets cached)
function initMockTask(): Task {
return {
id: 'mock-task-id',
displayInformation: {},
trainingInformation: {
modelID: 'model-id',
epochs: 1,
roundDuration: 1,
validationSplit: 0,
batchSize: 8,
scheme: 'local',
dataType: 'text',
tokenizer: 'Xenova/gpt2',
}}
}

const text = "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia written and maintained by a community \n of volunteers, known as Wikipedians. Founded by Jimmy Wales and Larry Sanger on January 15, 2001, Wikipedia is hosted by the Wikimedia Foundation, an American nonprofit organization that employs a staff of over 700 people.[7]"
const expectedTokens = [15496, 995, 11, 257, 47125, 352, 2242, 2231, 11, 705, 30, 860, 4304, 13, 15312, 318, 257, 1479, 2695, 2691, 45352, 3194, 290, 9456, 416, 257, 2055, 220, 198, 286, 11661, 11, 1900, 355, 11145, 46647, 1547, 13, 4062, 276, 416, 12963, 11769, 290, 13633, 311, 2564, 319, 3269, 1315, 11, 5878, 11, 15312, 318, 12007, 416, 262, 44877, 5693, 11, 281, 1605, 15346, 4009, 326, 24803, 257, 3085, 286, 625, 13037, 661, 3693, 22, 60]

it('can tokenize text', async () => {
const { tokens } = await tokenize.apply(Promise.resolve(text), initMockTask()) as { tokens: number[]}
expect(tokens).to.be.deep.equal(expectedTokens)
})

it('can truncate inputs when tokenizing', async () => {
const truncationTask = initMockTask()
truncationTask.trainingInformation.maxSequenceLength = 10
const { tokens } = await tokenize.apply(Promise.resolve(text), truncationTask) as { tokens: number[] }
const expectedLength = truncationTask.trainingInformation.maxSequenceLength + 1 // + 1 because tokenization includes an extra token used as the next-token prediction label
expect(tokens.length).to.be.equal(expectedLength)
expect(tokens).to.be.deep.equal(expectedTokens.slice(0, expectedLength))
})

it('can left pad tokens', async () => {
// Create a task where output token sequences should all have length 20
const paddingTask = initMockTask()
paddingTask.trainingInformation.maxSequenceLength = 20

// Create a token sequence of length 10
const tokens = { tokens: [0,1,2,3,4,5,6,7,8,9] }
const { xs, ys } = await leftPadding.apply(Promise.resolve(tokens), paddingTask) as { xs: tf.Tensor1D, ys: tf.Tensor2D }
const xsArray = await xs.array()
const ysArray = await ys.array()

// Output sequences should have shape (20) and (20, 50258), where 50258 is gpt2's vocab size (50257) plus one
expect(xsArray.length).to.be.equal(paddingTask.trainingInformation.maxSequenceLength)
expect(ysArray.length).to.be.equal(paddingTask.trainingInformation.maxSequenceLength)
expect(ysArray[0].length).to.be.equal(50258)

// xs should be left-padded with gpt2's padding token 50256 to reach length 20.
// We expect the last token of the input sequence (9) not to be included in xs since it has no next token to predict
const paddingToken = 50256
const expectedXs = Array.from({length:11}).map(_ => paddingToken).concat(tokens.tokens.slice(0,9))
expect(xsArray).to.be.deep.equal(expectedXs)

// ys should be a one hot encoding of the next token in xs
// if the input tokens are [0,1,2,3] then the labels are [1,2,3] which are then one-hot encoded
// So the sum of each row should be equal to 1
const expectedOneHot = Array.from({ length: 20 }).map(_ => 1)
expect(await ys.sum(-1).array()).to.be.deep.equal(expectedOneHot)

// In each row, the index of the 1 should be the token id
const expectedYs = Array.from({length:10}).map(_ => paddingToken).concat(tokens.tokens)
expect(await ys.argMax(-1).array()).to.be.deep.equal(expectedYs)
})

it('throws an error if no tokenizer is specified', async () => {
const invalidTask = initMockTask()
invalidTask.trainingInformation.tokenizer = undefined;
try {
await tokenize.apply(Promise.resolve("input text doesn't matter"), invalidTask)
} catch {
return
}
throw new Error("undefined tokenizer should have thrown an error")
})
it('throws an error if the tokenizer name is invalid', async () => {
const invalidTask = initMockTask()
invalidTask['trainingInformation']['tokenizer'] = 'invalid-tokenizer-name'
try {
await tokenize.apply(Promise.resolve("input text doesn't matter"), invalidTask)
} catch {
return
}
throw new Error("invalid tokenizer name should have thrown an error")
})

})
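For orientation, the two exported steps compose end to end. A minimal sketch, illustrative only (not code from the commit); the relative import paths and the reuse of the spec's mock task are assumptions:

import * as tf from '@tensorflow/tfjs'
import type { Task } from '../../../index.js'
import { TEXT_PREPROCESSING } from './index.js'

async function preprocess(text: string, task: Task): Promise<{ xs: tf.Tensor1D, ys: tf.Tensor2D }> {
  const [tokenize, leftPadding] = TEXT_PREPROCESSING
  // text -> { tokens } -> { xs, ys }: tokenization first, then fixed-length left padding
  const tokenized = tokenize.apply(Promise.resolve(text), task)
  return await leftPadding.apply(tokenized, task) as { xs: tf.Tensor1D, ys: tf.Tensor2D }
  // xs: shape [maxSequenceLength], token ids left-padded with 50256
  // ys: shape [maxSequenceLength, vocab + 1], one-hot labels of the next token
}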
@@ -14,43 +14,58 @@ export enum TextPreprocessing {
}

interface TokenizedEntry extends tf.TensorContainerObject {
xs: tf.Tensor1D
tokens: number []
}

/**
* LeftPadding pads all incoming inputs to be a fixed length, which should be specified
* in `task.trainingInformation.maxSequenceLength`.
*
* We are currently only implementing left padding for text generation
* https://huggingface.co/docs/transformers/en/llm_tutorial#wrong-padding-side
* The function can easily be extended to support right padding once the need arise
* The function can easily be extended to support right padding if needed
*
* Once Transformers.js supports left padding, it will be possible to pad inputs
* directly when tokenizing
* https://github.com/xenova/transformers.js/blob/8804c36591d11d8456788d1bb4b16489121b3be2/src/tokenizers.js#L2517
*/
const leftPadding: PreprocessingFunction = {
type: TextPreprocessing.LeftPadding,
apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
let { xs } = await x as TokenizedEntry
if (xs === undefined || !(xs instanceof tf.tensor) ||xs.rankType !== tf.Rank.R1) {
new Error("The leftPadding preprocessing expects a 1D tensor named 'xs' as input")
const { tokens } = await x as TokenizedEntry
if (tokens === undefined || !Array.isArray(tokens) || tokens.length === 0 || typeof tokens[0] !== 'number') {
throw new Error("The leftPadding preprocessing expects a non-empty 1D array of numbers")
}
const tokenizer = await models.getTaskTokenizer(task)


const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
// Should never happen because tokenization truncates inputs
if (xs.size > maxLength) {
xs = xs.slice([0], [maxLength])
} else if (xs.size < maxLength) {
const paddingToken = tokenizer.pad_token_id
xs = xs.pad([[Math.max(0, maxLength - xs.size), 0]], paddingToken)
}
// if xs.size == maxLength we can leave it as it is
return {
xs,
ys: tf.oneHot(xs, tokenizer.model.vocab.length + 1) // gpt-tfjs expects a one-hot encoded token label
}
return tf.tidy(() => {
// maxLength is the final length of xs
// Because ys contains the tokens in xs shifted by one (to predict the next token), we need
// to include one more token than maxSequenceLength so that the maxSequenceLength-th token also gets a next-token label
const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
const maxLengthPlusLabel = maxLength + 1

let fixedLengthTokens = tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs
if (fixedLengthTokens.size > maxLengthPlusLabel) { // Should never happen because tokenization truncates inputs
throw Error("There are more tokens than expected after tokenization and truncation")
} else if (fixedLengthTokens.size < maxLengthPlusLabel) { // Pad inputs to fixed length
const paddingToken = tokenizer.pad_token_id
fixedLengthTokens = fixedLengthTokens.pad([[Math.max(0, maxLengthPlusLabel - fixedLengthTokens.size), 0]], paddingToken)
}
// if tokens.size == maxLengthPlusLabel we can leave it as it is

// ys is a one-hot encoding of the next token (i.e. xs shifted by one)
const ys = tf.oneHot(fixedLengthTokens.slice([1]), tokenizer.model.vocab.length + 1)
// remove the extra token now that ys is created
const xs = fixedLengthTokens.slice([0], maxLength)
return { xs, ys }
})
}
}
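To make the shift-by-one concrete, a small worked example with made-up token ids (not part of the commit):

// Worked sketch (made-up ids): maxSequenceLength = 4, tokens = [5, 6, 7, 8, 9]
// maxLengthPlusLabel = 5, so the sequence already has the target length and no padding is added
// xs = [5, 6, 7, 8]                     <- fixedLengthTokens.slice([0], maxLength)
// ys = oneHot([6, 7, 8, 9], vocab + 1)  <- fixedLengthTokens.slice([1]); the label of xs[i] is xs[i+1]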

interface TokenizerOutput {
input_ids: number[]
}

/**
* Tokenizes and truncates input strings
*/
@@ -62,7 +77,10 @@ const tokenize: PreprocessingFunction = {
}
const xs = await x as string // tf.TextLineDataset yields strings
const tokenizer = await models.getTaskTokenizer(task)
const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
// Add one to also include the next-token label of the last token in the input sequence
// The inputs are truncated down to exactly maxSequenceLength in leftPadding
const maxLength = task.trainingInformation.maxSequenceLength ?? (tokenizer.model_max_length as number)
const maxLengthPlusLabel = maxLength + 1

const {input_ids: tokens} = tokenizer(xs, {
// Transformers.js currently only supports right padding while we need left for text generation
@@ -71,11 +89,9 @@
padding: false,
truncation: true,
return_tensor: false,
max_length: maxLength,
max_length: maxLengthPlusLabel,
}) as TokenizerOutput
return {
xs: tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs
}
return { tokens }
}
}
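For intuition, a sketch of what tokenize yields on a short input, with ids taken from the spec's expectedTokens above:

// Sketch: tokenizing "Hello world," with maxSequenceLength = 2
// returns maxLength + 1 = 3 ids so the last input token still has a label:
// { tokens: [15496, 995, 11] }   // ids as in the spec's expectedTokens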

26 changes: 17 additions & 9 deletions discojs/discojs-core/src/models/gpt/config.ts
@@ -7,16 +7,12 @@ type ModelType =
| 'gpt-micro'
| 'gpt-nano'

export interface ModelSize {
nLayer?: number
nHead?: number
nEmbd?: number
}

export interface GPTConfig {
lr: number
blockSize: number
vocabSize: number
modelType: ModelType
name?: string,
evaluate?: boolean
maxEvalBatches?: number
evaluateEvery?: number
@@ -30,13 +26,16 @@ export interface GPTConfig {
embdDrop?: number
tokEmb?: boolean
lmHead?: boolean
modelType: ModelType
nLayer?: number
nHead?: number
nEmbd?: number
}

export const DEFAULT_CONFIG: Required<GPTConfig> = {
name: 'transformer',
lr: 0.001,
weightDecay: 0,
maxIter: 10_000,
maxIter: 5,
verbose: 0,
modelType: 'gpt-nano',
evaluate: true,
@@ -50,7 +49,16 @@ export const DEFAULT_CONFIG: Required<GPTConfig> = {
residDrop: 0.2,
embdDrop: 0.2,
tokEmb: true,
lmHead: true
lmHead: true,
nLayer: 3,
nHead: 3,
nEmbd: 48,
}

export type ModelSize = {
nLayer: number
nHead: number
nEmbd: number
}

export function getModelSizes (modelType: ModelType): Required<ModelSize> {
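A minimal configuration sketch based on the exports above (the gpt-nano sizes match DEFAULT_CONFIG; the blockSize and vocabSize values are illustrative, not from the commit):

import { DEFAULT_CONFIG, getModelSizes, type GPTConfig } from './config.js'

// gpt-nano presumably resolves to { nLayer: 3, nHead: 3, nEmbd: 48 }, the DEFAULT_CONFIG values
const sizes = getModelSizes('gpt-nano')

const config: GPTConfig = {
  ...DEFAULT_CONFIG,
  modelType: 'gpt-nano',
  lr: 0.001,
  blockSize: 128,   // illustrative
  vocabSize: 50258, // gpt2 vocab size + 1, as used in the specs
  ...sizes,
}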
44 changes: 44 additions & 0 deletions discojs/discojs-core/src/models/gpt/gpt.spec.ts
@@ -0,0 +1,44 @@
import { expect } from 'chai'
import * as tf from '@tensorflow/tfjs-node'
import { AutoTokenizer } from '@xenova/transformers';
import { GPT } from './index.js'
import { type GPTConfig } from './config.js'

describe('gpt-tfjs', function() {
this.timeout(50_000)
const data = "Lorem ipsum dolor sit"

const config: GPTConfig = {
modelType: 'gpt-nano',
lr: 0.01,
maxIter: 10,
evaluateEvery: 10,
maxEvalBatches: 10,
blockSize: 8,
vocabSize: 50258
}

it('can overfit one sentence', async () => {
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const datasetSource = new tf.data.FileDataSource(Buffer.from(data))
const textDataset = new tf.data.TextLineDataset(datasetSource)
const tokenDataset = textDataset.map((text: string) => {
const { input_ids: tokens } = tokenizer(text, {
padding: true,
truncation: true,
return_tensor: false,
max_length: config.blockSize + 1,
}) as { input_ids: number[] }
const ys = tf.oneHot(tokens.slice(1), tokenizer.model.vocab.length + 1)
const xs = tf.tensor(tokens.slice(0, config.blockSize), undefined, 'int32')
return {xs, ys}
}).repeat().batch(64)

const model = new GPT(config)
const logGenerator = model.train(tokenDataset, undefined, 5) // 5 epochs
for await (const _ of logGenerator); // Await the end of training
const generation = await model.generate("Lorem ipsum dolor", tokenizer, 1)
console.log(generation)
expect(generation).equal(data) // Assert that the model completes 'Lorem ipsum dolor' with 'sit'
})
})
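For completeness, a standalone-generation sketch reusing the same GPT API the test exercises (a hedged illustration, not part of the commit; an untrained model would emit arbitrary text):

import { AutoTokenizer } from '@xenova/transformers'
import { GPT } from './index.js'

async function main(): Promise<void> {
  const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
  const model = new GPT({ modelType: 'gpt-nano', lr: 0.01, maxIter: 10, blockSize: 8, vocabSize: 50258 })
  // Same call shape as the test: prompt, tokenizer, number of new tokens to generate
  const completion = await model.generate('Lorem ipsum dolor', tokenizer, 1)
  console.log(completion) // arbitrary for an untrained model; "Lorem ipsum dolor sit" after the overfitting above
}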