From 9d82c17fa7bd4562b28fd1b256a31b23f2fdc776 Mon Sep 17 00:00:00 2001 From: lutangar Date: Mon, 29 Apr 2024 17:09:51 +0200 Subject: [PATCH] chore(test): clean a bit --- .gitattributes | 1 + packages/transcription/README.md | 44 +++++++++++++++++++ packages/transcription/src/model-factory.ts | 9 ---- .../transcription/src/transcription-engine.ts | 6 +-- .../transcription/src/transcription-model.ts | 43 ------------------ .../transcriber/transformers-transcriber.ts | 6 +-- 6 files changed, 50 insertions(+), 59 deletions(-) create mode 100644 .gitattributes delete mode 100644 packages/transcription/src/model-factory.ts diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000..b98b6380c83 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +packages/tests/fixtures/transcription/models linguist-generated=true diff --git a/packages/transcription/README.md b/packages/transcription/README.md index a9e4af2967e..101668f1873 100644 --- a/packages/transcription/README.md +++ b/packages/transcription/README.md @@ -46,3 +46,47 @@ pip install -U - missing .json file - binary name is awkard, package is name whisper-timestamped and binary name is whisper-tiomestamped > https://github.com/linto-ai/whisper-timestamped/issues?q=is:issue+author:lutangar + + +## About models +Convert Whisper transformer model from PyTorch to ggml format +The original Whisper PyTorch models provided by OpenAI are converted to ggml format in order to be able to load them in C/C++ + +In supervised machine learning, the artefact created after training that is used to make predictions on new data is called a model. +Models can be saved in a file that can potentially be compressed, so model files typically have a binary file format. +TensorFlow saves models as protocol buffer files, with a .pb file extension. +Keras saves models natively as .h5 files. +Scikit-Learn saves models as pickled python objects, with a .pkl file extension. 
+An older format for model serving based on XML, predictive model markup language (.pmml), is still usable on some frameworks, such as Scikit-Learn. + +Training File Formats : +- petastorm +- npy +- tfrecords + +Model Serving Serialization Formats +- pb +- mlmodel +onnx +pkl +older : h5 pmml + +Hugging Face fine-tuned models to ggml format +or Whisper transformer model ? + +ML models vs Transformer Model +Transcription Model + +Other model file formats that are used include SparkML models that can be saved in MLeap file format and served in real-time using an MLeap model server (files are packaged in .zip format). Apple developed the .mlmodel file format to store models embedded in iOS applications as part of its Core ML framework (which has superior support for ObjectiveC and Swift languages). Applications trained in TensorFlow, Scikit-Learn, and other frameworks need to convert their model files to the .mlmodel file format for use on iOS, with tools such as coremltools and the TensorFlow converter available to help with file format conversion. ONNX is an ML-framework-independent file format, supported by Microsoft, Facebook, and Amazon. In theory, any ML framework should be able to export its models in .onnx file format, so it offers great promise in unifying model serving across the different frameworks. However, as of late 2019, ONNX does not support all operations for the most popular ML frameworks (TensorFlow, PyTorch, Scikit-Learn), so ONNX is not yet practical for those frameworks. In PyTorch, the recommended way to serve models is to use Torch Script to trace and save a model as a .pt file and serve it from a C++ application. + + One final file format to mention here is YAML that is used to package models as part of the MLFlow framework for ML pipelines on Spark. MLFlow stores a YAML file that describes the files it packages for model serving, so that deployment tools can understand the model file format and know what files to deploy. 
+// ModelServingFileSerializationFormats + File formats: .pb, .onnx, .pkl, .mlmodel, .zip, .pmml, .pt +Inference: .pb files are served by TensorFlowServing Server; +.onnx files are served by Microsoft’s commercial model serving platform; +.pkl files are served for Scikit-Learn models, often on Flask servers; +.mlmodel files are served by iOS platforms; +.zip files are used to package up MLeap files that are served on the MLeap runtime; +.pt files are used to package PyTorch models that can be served inside C++ applications. +.'PyTorch' | 'GGML' | 'ONNX' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark +https://towardsdatascience.com/guide-to-file-formats-for-machine-learning-columnar-training-inferencing-and-the-feature-store-2e0c3d18d4f9 diff --git a/packages/transcription/src/model-factory.ts b/packages/transcription/src/model-factory.ts deleted file mode 100644 index fbdc5abed7c..00000000000 --- a/packages/transcription/src/model-factory.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { TranscriptionModel } from './transcription-model.js' - -export class ModelFactory { - createModelFromName (name: string): TranscriptionModel { - return { - name - } - } -} diff --git a/packages/transcription/src/transcription-engine.ts b/packages/transcription/src/transcription-engine.ts index 794bb3c0f22..88b6b4b7f27 100644 --- a/packages/transcription/src/transcription-engine.ts +++ b/packages/transcription/src/transcription-engine.ts @@ -2,7 +2,6 @@ import { ModelFormat } from './transcription-model.js' /** * The engine, or framework. - * */ export class TranscriptionEngine { name: string @@ -14,11 +13,10 @@ export class TranscriptionEngine { license?: string forgeURL?: string supportedModelFormats: ModelFormat[] + // There could be a default models. + // There could be a list of default models constructor (parameters: TranscriptionEngine) { Object.assign(this, parameters) } - - // There could be a default models. 
- // There could be a list of default models } diff --git a/packages/transcription/src/transcription-model.ts b/packages/transcription/src/transcription-model.ts index 3a9a02e3245..e9219d133c8 100644 --- a/packages/transcription/src/transcription-model.ts +++ b/packages/transcription/src/transcription-model.ts @@ -1,46 +1,3 @@ -// Convert Whisper transformer model from PyTorch to ggml format -// : e original Whisper PyTorch models provided by OpenAI a -// ggml format in order to be able to load them in C/C++ - -// In supervised machine learning, the artefact created after training that is used to make predictions on new data is called a model. -// models can be saved in a file that can potentially be compressed, so typically model files have a binary file format -// TensorFlow saves models as protocol buffer files, with a .pb file extension. -// Keras saves models natively as .h5 file. -// Scikit-Learn saves models as pickled python objects, with a .pkl file extension. -// An older format for model serving based on XML, predictive model markup language (.pmml), is still usable on some frameworks, such as Scikit-Learn. - -// Training File Formats : -// - petastorm -// - npy -// - tfrecords - -// Model Serving Serialization Formats -// - pb -// - mlmodel -// onnx -// pkl -// older : h5 pmml - -// Hugging Face fine-tuned models to ggml format -// or Whisper transformer model ? - -// ML models vs Transformer Model -// Transcription Model - -// Other model file formats that are used include SparkML models that can be saved in MLeap file format and served in real-time using a MLleap model server (files are packaged in .zip format). Apple developed the .mlmodel file format to store models embedded in iOS applications as part of its Core ML framework (which has superior support for ObjectiveC and Swift languages). 
Applications trained in TensorFlow, Scikit-Learn, and other frameworks need to convert their model files to the .mlmodel file format for use on iOS, with tools like, coremltools and Tensorflow converter being available to help file format conversion. ONNX is a ML framework independent file format, supported by Microsoft, Facebook, and Amazon. In theory, any ML framework should be able to export its models in .onnx file format, so it offers great promise in unifying model serving across the different frameworks. However, as of late 2019, ONNX does not support all operations for the most popular ML frameworks (TensorFlow, PyTorch, Scikit-Learn), so ONNX is not yet practical for those frameworks. In PyTorch, the recommended way to serve models is to use Torch Script to trace and save a model as a .pt file and serve it from a C++ application. -// -// One final file format to mention here is YAML that is used to package models as part of the MLFlow framework for ML pipelines on Spark. MLFlow stores a YAML file that describes the files it packages for model serving, so that deployment tools can understand the model file format and know what files to deploy. -// // ModelServingFileSerializationFormats -// File formats: .pb, .onnx, .pkl, .mlmodel, .zip, .pmml, .pt -// Inference: .pb files are served by TensorFlowServing Server; -// .onnx files are served by Microsoft’s commercial model serving platorm; -// .pkl files are served for Scikit-Learn models, often on Flask servers; -// .mlmodel files are served by iOS platforms; -// .zip files are used to package up MLeap files that are served on the MLeap runtime; -// .pt files are use to package PyTorch models that can be served inside C++ applications. 
-// .'PyTorch' | 'GGML' | 'ONNX' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark -// https://towardsdatascience.com/guide-to-file-formats-for-machine-learning-columnar-training-inferencing-and-the-feature-store-2e0c3d18d4f9 - export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark export abstract class TranscriptionModel { diff --git a/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts b/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts index de095735602..71a53f92318 100644 --- a/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts @@ -1,6 +1,6 @@ import { TranscriptionModel } from '../../transcription-model.js' +import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' import { AbstractTranscriber } from '../../abstract-transcriber.js' -import { TranscriptFile, TranscriptFormat } from '../../transcript/transcriptFile.js' import { $ } from 'execa' import { join } from 'path' @@ -34,10 +34,10 @@ export class TransformersTranscriber extends AbstractTranscriber { mediaFilePath ]}` - return { + return new TranscriptFile({ language, path: join(this.transcriptDirectory, `test.${format}`), format - } + }) } }