discojs/model: expose batch generator

epfml · Jun 21, 2024 · c86b033 · c86b033
1 parent 76befd7
commit c86b033
Show file tree

Hide file tree

Showing 11 changed files with 341 additions and 123 deletions.
diff --git a/cli/src/benchmark_gpt.ts b/cli/src/benchmark_gpt.ts
@@ -3,6 +3,7 @@ import type { Task } from '@epfml/discojs'
 import { fetchTasks, data, models } from '@epfml/discojs'
 import { NodeTextLoader, loadModelFromDisk } from '@epfml/discojs-node'
 import { startServer } from 'server'
+import { get_return_value } from './utils.js';
 
 interface CLIArguments{
   modelType?: string; // 'gpt-nano', 'gpt-micro', 'gpt-mini', 'gpt2'
@@ -80,8 +81,9 @@ async function main(args: Required<CLIArguments>): Promise<void> {
     console.log(`\tmodel type ${modelType} \n\tbatch size ${batchSize} \n\tcontext length ${contextLength}`)
 
     let epochTime = performance.now()
-    const logGenerator = model.train(preprocessedDataset, undefined, epoch)
-    for await (const logs of logGenerator) {
+    const rounds = model.train(preprocessedDataset, undefined, epoch)
+    for (const round of rounds) {
+      const logs = await get_return_value(round)
       epochTime = (performance.now() - epochTime)
       const msPerToken = epochTime / (batchSize * contextLength * iterationsPerEpoch * epoch)
       console.log(`\t\tTraining time: ${msPerToken.toFixed(2)} ms/token <br> ${logs.peakMemory.toFixed(2)} GB`)

diff --git a/cli/src/utils.ts b/cli/src/utils.ts
@@ -0,0 +1,18 @@
+/**
+ * Exhausts an async iterator and resolves with its return value.
+ *
+ * Yielded values are discarded; only the value attached to the final
+ * `done` result is kept.
+ *
+ * @param iter iterator to drain
+ * @returns the value carried by the result whose `done` flag is set
+ */
+export async function get_return_value<T>(
+  iter: AsyncIterator<unknown, T>,
+): Promise<T> {
+  for (;;) {
+    const step = await iter.next();
+    // intermediate yields are skipped, only the completion result matters
+    if (step.done) return step.value;
+  }
+}
diff --git a/discojs/src/default_tasks/wikitext.ts b/discojs/src/default_tasks/wikitext.ts
@@ -20,7 +20,7 @@ export const wikitext: TaskProvider = {
         modelID: 'wikitext-103-raw-model',
         preprocessingFunctions: [data.TextPreprocessing.Tokenize, data.TextPreprocessing.LeftPadding],
         scheme: 'federated',
-        epochs: 5,
+        epochs: 10,
         // Unused by wikitext because data already comes split
         // But if set to 0 then the webapp doesn't display the validation metrics
         validationSplit: 0.1, 

diff --git a/discojs/src/models/gpt/index.ts b/discojs/src/models/gpt/index.ts
@@ -2,15 +2,17 @@
  * this code is taken from gpt-tfjs with modifications from @peacefulotter and @lukemovement
  **/
 
+import { List, Map } from 'immutable';
 import * as tf from '@tensorflow/tfjs'
 import { PreTrainedTokenizer } from '@xenova/transformers';
 
 import { WeightsContainer } from '../../index.js'
 import type { Dataset } from '../../dataset/index.js'
 
-import { Model } from '../model.js'
+import { BatchLogs, EpochLogs, Model } from "../index.js";
+import type { Prediction, Sample } from '../model.js'
+
 import { GPTForCausalLM } from './model.js'
-import type { EpochLogs, Prediction, Sample } from '../model.js'
 import type { GPTConfig } from './config.js'
 
 export type GPTSerialization = {
@@ -35,54 +37,108 @@ export class GPT extends Model {
    * @param epochs the number of passes of the training dataset
    * @param tracker
    */
-  override async *train(
+  override *train(
     trainingData: Dataset,
     validationData?: Dataset,
     epochs = 1,
-  ): AsyncGenerator<EpochLogs, void> {
-    this.model.compile()
+  ): Generator<AsyncGenerator<BatchLogs, EpochLogs>, void> {
+    this.model.compile();
+
+    for (let epoch = 0; epoch < epochs; epoch++)
+      yield async function* (this: GPT) {
+        let batchesLogs = List<BatchLogs>();
+
+        const batches = await trainingData.iterator(); // tf.LazyIterator isn't an AsyncGenerator
+        for (let batchNumber = 0; true; batchNumber++) {
+          const iteration = await batches.next();
+          if (iteration.done) break;
+          const batch = iteration.value;
+
+          const batchLogs = {
+            batch: batchNumber,
+            ...(await this.#runBatch(batch)),
+          };
+
+          yield batchLogs;
+          batchesLogs = batchesLogs.push(batchLogs);
+        }
+
+        const validation =
+          validationData && (await this.#evaluate(validationData));
+
+        return new EpochLogs(epoch, batchesLogs, validation);
+      }.bind(this)();
+  }
+
+  async #runBatch(
+    batch: tf.TensorContainer,
+  ): Promise<Omit<BatchLogs, "batch">> {
     let logs: tf.Logs | undefined;
-    const trainingArgs: tf.ModelFitDatasetArgs<tf.TensorContainer> = {
-      epochs: 1, // force fitDataset to do only one epoch because it is wrapped in a for loop
-      validationData,
-      callbacks: { onEpochEnd: (_, cur) => { logs = cur }},
+    await this.model.fitDataset(tf.data.array([batch]), {
+      epochs: 1,
+      callbacks: {
+        onEpochEnd: (_, cur) => {
+          logs = cur;
+        },
+      },
+    });
+    if (logs === undefined) throw new Error("batch didn't gave any logs");
+
+    const { loss, acc: accuracy } = logs;
+    if (loss === undefined || isNaN(loss))
+      throw new Error("training loss is undefined or NaN");
+
+    return {
+      accuracy,
+      loss,
+      memoryUsage: tf.memory().numBytes / 1024 / 1024 / 1024,
     };
-    for (let epoch = 0; epoch < epochs; epoch++) {
-      await this.model.fitDataset(trainingData, trainingArgs);
-      if (logs === undefined) {
-        throw new Error("Epoch didn't gave any logs");
-      }
-      const { loss, val_acc, val_loss, peakMemory } = logs;
-      if (loss === undefined || isNaN(loss)) {
-        throw new Error("Training loss is undefined or nan");
-      }
-      const structuredLogs: EpochLogs = {
-        epoch,
-        peakMemory,
-        training: {
-          loss: logs.loss,
-          accuracy: logs.acc
-        }
-      }
+  }
 
-      if (validationData !== undefined) {
-        if(val_loss === undefined || isNaN(val_loss) ||
-          val_acc === undefined || isNaN(val_acc)) {
-          throw new Error("Validation accuracy or loss is undefined or nan");
+  async #evaluate(
+    dataset: Dataset,
+  ): Promise<Record<"accuracy" | "loss", number>> {
+    const evaluation = await this.model.evaluateDataset(
+      dataset.map((t) => {
+        switch (t) {
+          case null:
+          case undefined:
+            throw new Error("nullish value in dataset");
+          default:
+            return t as Exclude<tf.TensorContainer, void>;
         }
-        structuredLogs.validation = { accuracy: logs.val_acc, loss: logs.val_loss}
-      }
-      yield structuredLogs
-    }
+      }),
+    );
+    const metricToValue = Map(
+      List(this.model.metricsNames).zip(
+        Array.isArray(evaluation)
+          ? List(await Promise.all(evaluation.map((t) => t.data())))
+          : List.of(await evaluation.data()),
+      ),
+    ).map((values) => {
+      if (values.length !== 1) throw new Error("more than one metric value");
+      return values[0];
+    });
+
+    const [accuracy, loss] = [
+      metricToValue.get("acc"),
+      metricToValue.get("loss"),
+    ];
+    if (accuracy === undefined || loss === undefined)
+      throw new Error("some needed metrics are missing");
+
+    return { accuracy, loss };
   }
 
-  override predict (input: Sample): Promise<Prediction> {
-    const ret = this.model.predict(input)
+  override predict(input: Sample): Promise<Prediction> {
+    const ret = this.model.predict(input);
     if (Array.isArray(ret)) {
-      throw new Error('prediction yield many Tensors but should have only returned one')
+      throw new Error(
+        "prediction yield many Tensors but should have only returned one",
+      );
     }
 
-    return Promise.resolve(ret)
+    return Promise.resolve(ret);
   }
 
   async generate(input: string, tokenizer: PreTrainedTokenizer, newTokens: number = 10): Promise<string> {

diff --git a/discojs/src/models/index.ts b/discojs/src/models/index.ts
@@ -1,4 +1,5 @@
-export { EpochLogs, Model } from './model.js'
+export { Model } from './model.js'
+export { BatchLogs, EpochLogs } from "./logs.js";
 
 export { GPT } from './gpt/index.js'
 export { GPTConfig } from './gpt/config.js'

diff --git a/discojs/src/models/logs.ts b/discojs/src/models/logs.ts
@@ -0,0 +1,56 @@
+import { List } from "immutable";
+
+/** Metrics recorded after training on a single batch. */
+export interface BatchLogs {
+  batch: number; // first batch is zero
+  accuracy: number;
+  loss: number;
+  memoryUsage: number; // GB
+}
+
+/**
+ * Logs of one training epoch: the per-batch logs plus the optional
+ * validation metrics.
+ */
+export class EpochLogs {
+  public readonly batches: List<BatchLogs>;
+
+  /**
+   * @param epoch index of the epoch, first epoch is zero
+   * @param batches logs of every batch of this epoch, copied into an immutable List
+   * @param validation validation metrics of this epoch, if any
+   */
+  constructor(
+    public readonly epoch: number, // first epoch is zero
+    batches: Iterable<BatchLogs>,
+    public readonly validation?: Record<"accuracy" | "loss", number>,
+  ) {
+    this.batches = List(batches);
+  }
+
+  /**
+   * Training metrics of the epoch, averaged over its batches.
+   *
+   * Returns zeros when no batch was recorded, to avoid dividing by zero.
+   */
+  get training(): Record<"accuracy" | "loss", number> {
+    const count = this.batches.size;
+    if (count === 0) return { loss: 0, accuracy: 0 };
+
+    const sum = this.batches.reduce(
+      (acc, batch) => ({
+        accuracy: acc.accuracy + batch.accuracy,
+        loss: acc.loss + batch.loss,
+      }),
+      { loss: 0, accuracy: 0 },
+    );
+
+    // a plain sum grows with the number of batches, report the mean instead
+    return { accuracy: sum.accuracy / count, loss: sum.loss / count };
+  }
+
+  /** Highest memory usage (GB) seen across the batches, zero if there are none. */
+  get peakMemory(): number {
+    return this.batches.map((batch) => batch.memoryUsage).max() ?? 0;
+  }
+}
diff --git a/discojs/src/models/model.ts b/discojs/src/models/model.ts
@@ -3,18 +3,7 @@ import type tf from "@tensorflow/tfjs";
 import type { WeightsContainer } from "../index.js";
 import type { Dataset } from "../dataset/index.js";
 
-export interface EpochLogs {
-  epoch: number; // first epoch is zero
-  training: {
-    loss: number,
-    accuracy?: number
-  };
-  validation?: {
-    loss: number,
-    accuracy: number
-  };
-  peakMemory: number;
-}
+import type { BatchLogs, EpochLogs } from "./logs.js";
 
 // TODO still bound to tfjs
 export type Prediction = tf.Tensor;
@@ -26,7 +15,7 @@ export type Sample = tf.Tensor;
  * Allow for various implementation of models (various train function, tensor-library, ...)
  **/
 // TODO make it typesafe: same shape of data/input/weights
-export abstract class Model implements Disposable{
+export abstract class Model implements Disposable {
   // TODO don't allow external access but upgrade train to return weights on every epoch
   /** Return training state */
   abstract get weights(): WeightsContainer;
@@ -46,13 +35,12 @@ export abstract class Model implements Disposable{
     trainingData: Dataset,
     validationData?: Dataset,
     epochs?: number,
-  ): AsyncGenerator<EpochLogs, void>;
+  ): Generator<AsyncGenerator<BatchLogs, EpochLogs>, void>;
 
   /** Predict likely values */
   // TODO extract in separated TrainedModel?
   abstract predict(input: Sample): Promise<Prediction>;
 
-
   /**
    * This method is automatically called to cleanup the memory occupied by the model
    * when leaving the definition scope if the instance has been defined with the `using` keyword.