diff --git a/README.md b/README.md index 68d79e27b..d34545766 100644 --- a/README.md +++ b/README.md @@ -366,6 +366,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. 1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat. +1. **[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index aa971793e..898bc756f 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -81,6 +81,7 @@ 1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. 1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 1. **Moondream1** released in the repository [moondream](https://github.com/vikhyat/moondream) by vikhyat. +1. **[Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine)** (from Useful Sensors) released with the paper [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaicML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. diff --git a/src/configs.js b/src/configs.js index 8964c6506..13bc8aaa4 100644 --- a/src/configs.js +++ b/src/configs.js @@ -186,6 +186,7 @@ function getNormalizedConfig(config) { mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model'; break; case 'musicgen_decoder': + case 'moonshine': mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers'; mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads'; mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size'; diff --git a/src/models.js b/src/models.js index 1094f10c3..8c6ed59e2 100644 --- a/src/models.js +++ b/src/models.js @@ -3359,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { } ////////////////////////////////////////////////// + +////////////////////////////////////////////////// +// Moonshine models +export class MoonshinePreTrainedModel extends PreTrainedModel { + + requires_attention_mask = false; + main_input_name = 'input_values'; + forward_params = [ + 'input_values', + 'decoder_input_ids', + 'past_key_values', + ]; +}; + +/** + * MoonshineModel class for training Moonshine models without a language model head. + */ +export class MoonshineModel extends MoonshinePreTrainedModel { } + +export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { } +////////////////////////////////////////////////// + + ////////////////////////////////////////////////// /** * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks @@ -7013,6 +7036,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], + ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]], ]); const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ diff --git a/src/models/feature_extractors.js b/src/models/feature_extractors.js index 869c8191b..98aa61572 100644 --- a/src/models/feature_extractors.js +++ b/src/models/feature_extractors.js @@ -1,6 +1,7 @@ export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js'; export * from './clap/feature_extraction_clap.js'; +export * from './moonshine/feature_extraction_moonshine.js'; export * from './pyannote/feature_extraction_pyannote.js'; export * from './seamless_m4t/feature_extraction_seamless_m4t.js'; export * from './speecht5/feature_extraction_speecht5.js'; diff --git a/src/models/moonshine/feature_extraction_moonshine.js b/src/models/moonshine/feature_extraction_moonshine.js new file mode 100644 index 000000000..9f01ab342 --- /dev/null +++ b/src/models/moonshine/feature_extraction_moonshine.js @@ -0,0 +1,26 @@ +import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; +import { Tensor } from '../../utils/tensor.js'; + + +export class MoonshineFeatureExtractor extends FeatureExtractor { + /** + * Asynchronously extracts input values from a given audio using the provided configuration. + * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array. + * @returns {Promise<{ input_values: Tensor; }>} The extracted input values. + */ + async _call(audio) { + validate_audio_inputs(audio, 'MoonshineFeatureExtractor'); + + if (audio instanceof Float64Array) { + audio = new Float32Array(audio); + } + + const shape = [ + 1, /* batch_size */ + audio.length, /* num_samples */ + ]; + return { + input_values: new Tensor('float32', audio, shape), + }; + } +} diff --git a/src/models/moonshine/processing_moonshine.js b/src/models/moonshine/processing_moonshine.js new file mode 100644 index 000000000..e313976ec --- /dev/null +++ b/src/models/moonshine/processing_moonshine.js @@ -0,0 +1,20 @@ +import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js" +import { AutoTokenizer } from "../../tokenizers.js" +import { Processor } from "../../base/processing_utils.js" + +/** + * Represents a MoonshineProcessor that extracts features from an audio input. + */ +export class MoonshineProcessor extends Processor { + static tokenizer_class = AutoTokenizer + static feature_extractor_class = AutoFeatureExtractor + + /** + * Calls the feature_extractor function with the given audio input. + * @param {any} audio The audio input to extract features from. + * @returns {Promise} A Promise that resolves with the extracted features. + */ + async _call(audio) { + return await this.feature_extractor(audio); + } +} diff --git a/src/models/processors.js b/src/models/processors.js index d254ad118..9d08faf73 100644 --- a/src/models/processors.js +++ b/src/models/processors.js @@ -1,5 +1,6 @@ export * from './florence2/processing_florence2.js'; export * from './mgp_str/processing_mgp_str.js'; +export * from './moonshine/processing_moonshine.js'; export * from './idefics3/processing_idefics3.js'; export * from './janus/processing_janus.js'; export * from './jina_clip/processing_jina_clip.js'; diff --git a/src/pipelines.js b/src/pipelines.js index a61cb1dde..55faef8c0 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options case 'unispeech-sat': case 'hubert': return this._call_wav2vec2(audio, kwargs) + case 'moonshine': + return this._call_moonshine(audio, kwargs) default: throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`) } @@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options } return single ? toReturn[0] : toReturn; } + + /** + * @type {AutomaticSpeechRecognitionPipelineCallback} + * @private + */ + async _call_moonshine(audio, kwargs) { + const single = !Array.isArray(audio); + if (single) { + audio = [/** @type {AudioInput} */ (audio)]; + } + const sampling_rate = this.processor.feature_extractor.config.sampling_rate; + const preparedAudios = await prepareAudios(audio, sampling_rate); + const toReturn = []; + for (const aud of preparedAudios) { + const inputs = await this.processor(aud); + + // According to the [paper](https://arxiv.org/pdf/2410.15608): + // "We use greedy decoding, with a heuristic limit of 6 output tokens + // per second of audio to avoid repeated output sequences." + const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6; + const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs }); + + const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0]; + toReturn.push({ text }); + } + return single ? toReturn[0] : toReturn; + } + } /** diff --git a/tests/models/moonshine/test_feature_extraction_moonshine.js b/tests/models/moonshine/test_feature_extraction_moonshine.js new file mode 100644 index 000000000..df2c5332a --- /dev/null +++ b/tests/models/moonshine/test_feature_extraction_moonshine.js @@ -0,0 +1,30 @@ +import { AutoFeatureExtractor, MoonshineFeatureExtractor } from "../../../src/transformers.js"; + +import { load_cached_audio } from "../../asset_cache.js"; +import { MAX_FEATURE_EXTRACTOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + // MoonshineFeatureExtractor + describe("MoonshineFeatureExtractor", () => { + const model_id = "onnx-community/moonshine-tiny-ONNX"; + + /** @type {MoonshineFeatureExtractor} */ + let feature_extractor; + beforeAll(async () => { + feature_extractor = await AutoFeatureExtractor.from_pretrained(model_id); + }, MAX_FEATURE_EXTRACTOR_LOAD_TIME); + + it( + "default", + async () => { + const audio = await load_cached_audio("mlk"); + const { input_values } = await feature_extractor(audio); + expect(input_values.dims).toEqual([1, 208000]); + expect(input_values.mean().item()).toBeCloseTo(-1.5654930507480458e-7, 6); + expect(input_values.data[0]).toBeCloseTo(0.0067138671875, 6); + expect(input_values.data.at(-1)).toBeCloseTo(-0.013427734375, 6); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +};