Update llama-3.2-webgpu demo (GQA support) #11

Open · wants to merge 1 commit into main
16 changes: 8 additions & 8 deletions llama-3.2-webgpu/package-lock.json

(Generated file; diff not rendered by default.)

2 changes: 1 addition & 1 deletion llama-3.2-webgpu/package.json
@@ -10,7 +10,7 @@
"preview": "vite preview"
},
"dependencies": {
"@huggingface/transformers": "3.0.0",
"@huggingface/transformers": "3.0.1",
"dompurify": "^3.1.2",
"marked": "^12.0.2",
"react": "^18.3.1",
4 changes: 2 additions & 2 deletions llama-3.2-webgpu/src/App.jsx
@@ -10,7 +10,7 @@ const STICKY_SCROLL_THRESHOLD = 120;
const EXAMPLES = [
"Give me some tips to improve my time management skills.",
"What is the difference between AI and ML?",
"Write python code to compute the nth fibonacci number.",
"Write Python code to perform merge sort.",
];

function App() {
@@ -209,7 +209,7 @@ function App() {
<br />
You are about to load{" "}
<a
href="https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-q4f16"
href="https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-onnx-web-gqa"
target="_blank"
rel="noreferrer"
className="font-medium underline"
12 changes: 5 additions & 7 deletions llama-3.2-webgpu/src/worker.js
@@ -9,15 +9,14 @@ import {
* This class uses the Singleton pattern to enable lazy-loading of the pipeline
*/
class TextGenerationPipeline {
- static model_id = "onnx-community/Llama-3.2-1B-Instruct-q4f16";
+ static model_id = "onnx-community/Llama-3.2-1B-Instruct-onnx-web-gqa";

static async getInstance(progress_callback = null) {
this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
progress_callback,
});

this.model ??= AutoModelForCausalLM.from_pretrained(this.model_id, {
dtype: "q4f16",
device: "webgpu",
progress_callback,
});
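Two details in this hunk are easy to miss. First, the explicit dtype: "q4f16" override is dropped, presumably because the new onnx-web-gqa export already fixes its own quantization settings (an assumption; the diff itself doesn't say). Second, both from_pretrained calls are cached on the class with ??=, so only the first getInstance() call triggers a download. A minimal sketch of a call site, assuming getInstance() resolves to a [tokenizer, model] pair as in the rest of the demo:

```js
// Hypothetical call site; the real ones sit in the collapsed parts of worker.js.
// Later calls reuse the cached promises, so the model is fetched only once.
const [tokenizer, model] = await TextGenerationPipeline.getInstance((progress) => {
  self.postMessage(progress); // forward loading progress to the UI thread
});
```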
@@ -69,18 +68,17 @@ async function generate(messages) {

const { past_key_values, sequences } = await model.generate({
...inputs,
- // TODO: Add when model is fixed
- // past_key_values: past_key_values_cache,
+ past_key_values: past_key_values_cache,

// Sampling
- do_sample: false,
+ // do_sample: true,

max_new_tokens: 1024,
streamer,
stopping_criteria,
return_dict_in_generate: true,
});
- // past_key_values_cache = past_key_values;
+ past_key_values_cache = past_key_values;

const decoded = tokenizer.batch_decode(sequences, {
skip_special_tokens: true,
@@ -153,7 +151,7 @@ self.addEventListener("message", async (e) => {
break;

case "reset":
- // past_key_values_cache = null;
+ past_key_values_cache = null;
stopping_criteria.reset();
break;
}
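Taken together, the worker.js changes re-enable the cross-turn key/value cache that the old q4f16 export had to leave disabled (hence the removed TODO). A self-contained sketch of the pattern, under the same assumptions as the demo (a module worker with WebGPU and @huggingface/transformers 3.0.1; max_new_tokens and the chat() helper are illustrative):

```js
import {
  AutoTokenizer,
  AutoModelForCausalLM,
} from "@huggingface/transformers";

const model_id = "onnx-community/Llama-3.2-1B-Instruct-onnx-web-gqa";
const tokenizer = await AutoTokenizer.from_pretrained(model_id);
const model = await AutoModelForCausalLM.from_pretrained(model_id, {
  device: "webgpu",
});

let past_key_values_cache = null;

async function chat(messages) {
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
  });

  const { past_key_values, sequences } = await model.generate({
    ...inputs,
    // Reuse the attention state from earlier turns so only the new
    // tokens need to be prefilled.
    past_key_values: past_key_values_cache,
    max_new_tokens: 256,
    // Required for generate() to hand the updated cache back.
    return_dict_in_generate: true,
  });

  past_key_values_cache = past_key_values; // keep for the next turn
  return tokenizer.batch_decode(sequences, { skip_special_tokens: true });
}
```

Dropping the cache on a new conversation (the "reset" branch above) matters because the cached state only matches the previous prompt prefix; reusing it against a different history would corrupt the output.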