From 9bbbe47c82b508e6230aef2ad34f20ed4e266fc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Thu, 24 Oct 2024 12:06:21 +0000
Subject: [PATCH] Fix Phi 3.5 MoE tests

PR #2682 also fixed an issue in Phi MoE, but it changes the test outputs
a bit. Fix this.
---
 .../test_flash_phi35_moe.json                 |  58 ++---
 .../test_flash_phi35_moe_all_params.json      |  72 +++---
 .../test_flash_phi35_moe_load.json            | 232 +++++++++---------
 .../models/test_flash_phi35_moe.py            |   8 +-
 4 files changed, 185 insertions(+), 185 deletions(-)

diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
index 0d6dca31de8..cfabe3c65cd 100644
--- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
@@ -11,32 +11,32 @@
       },
       {
         "id": 338,
-        "logprob": -0.7133789,
+        "logprob": -0.6201172,
         "text": "is"
       },
       {
         "id": 16030,
-        "logprob": -13.9296875,
+        "logprob": -13.6484375,
         "text": "gradient"
       },
       {
         "id": 26815,
-        "logprob": -0.048919678,
+        "logprob": -0.003894806,
         "text": "descent"
       },
       {
         "id": 29973,
-        "logprob": -3.0078125,
+        "logprob": -2.6386719,
         "text": "?"
       },
       {
         "id": 13,
-        "logprob": -2.8105469,
+        "logprob": -6.46875,
         "text": "\n"
       },
       {
         "id": 13,
-        "logprob": -0.84521484,
+        "logprob": -6.6875,
         "text": "\n"
       }
     ],
@@ -44,66 +44,66 @@
     "tokens": [
       {
         "id": 25584,
-        "logprob": -0.017028809,
+        "logprob": -0.008979797,
         "special": false,
         "text": "Grad"
       },
       {
         "id": 993,
-        "logprob": -0.0027313232,
+        "logprob": -8.34465e-07,
         "special": false,
         "text": "ient"
       },
       {
         "id": 26815,
-        "logprob": -0.023254395,
+        "logprob": -0.0009407997,
         "special": false,
         "text": " descent"
       },
       {
         "id": 338,
-        "logprob": -2.0623207e-05,
+        "logprob": -0.0003838539,
         "special": false,
         "text": " is"
       },
       {
-        "id": 263,
-        "logprob": -0.5361328,
+        "id": 385,
+        "logprob": -0.24499512,
         "special": false,
-        "text": " a"
+        "text": " an"
       },
       {
-        "id": 937,
-        "logprob": -0.17578125,
+        "id": 13883,
+        "logprob": -0.010406494,
         "special": false,
-        "text": " first"
+        "text": " optimization"
       },
       {
-        "id": 29899,
-        "logprob": 0.0,
+        "id": 5687,
+        "logprob": -0.00024354458,
         "special": false,
-        "text": "-"
+        "text": " algorithm"
       },
       {
-        "id": 2098,
-        "logprob": -0.00011539459,
+        "id": 15574,
+        "logprob": -0.6582031,
         "special": false,
-        "text": "order"
+        "text": " commonly"
       },
       {
-        "id": 13883,
-        "logprob": -0.47436523,
+        "id": 1304,
+        "logprob": -0.00092840195,
         "special": false,
-        "text": " optimization"
+        "text": " used"
       },
       {
-        "id": 5687,
-        "logprob": -0.00027680397,
+        "id": 297,
+        "logprob": -0.19470215,
         "special": false,
-        "text": " algorithm"
+        "text": " in"
       }
     ],
     "top_tokens": null
   },
-  "generated_text": "Gradient descent is a first-order optimization algorithm"
+  "generated_text": "Gradient descent is an optimization algorithm commonly used in"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
index 38b803353d2..b524859fdc7 100644
--- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
@@ -5,95 +5,95 @@
     "generated_tokens": 10,
     "prefill": [
       {
-        "id": 16030,
+        "id": 338,
         "logprob": null,
+        "text": "is"
+      },
+      {
+        "id": 16030,
"logprob": -13.328125, "text": "gradient" }, { "id": 26815, - "logprob": -6.4960938, + "logprob": -0.24023438, "text": "descent" }, { "id": 29973, - "logprob": -5.1484375, + "logprob": -3.1386719, "text": "?" }, { "id": 13, - "logprob": -4.0351562, - "text": "\n" - }, - { - "id": 13, - "logprob": -5.2265625, + "logprob": -3.0878906, "text": "\n" } ], "seed": 0, "tokens": [ { - "id": 10994, - "logprob": -1.1542969, + "id": 25584, + "logprob": 0.0, "special": false, - "text": "Hello" + "text": "Grad" }, { - "id": 29991, + "id": 993, "logprob": 0.0, "special": false, - "text": "!" + "text": "ient" }, { - "id": 739, + "id": 2726, "logprob": 0.0, "special": false, - "text": " It" + "text": " Des" }, { - "id": 2444, - "logprob": -0.42260742, + "id": 1760, + "logprob": 0.0, "special": false, - "text": " seems" + "text": "cent" }, { - "id": 366, - "logprob": 0.0, + "id": 313, + "logprob": -0.12322998, "special": false, - "text": " you" + "text": " (" }, { - "id": 29915, + "id": 29954, "logprob": 0.0, "special": false, - "text": "'" + "text": "G" }, { - "id": 276, - "logprob": -0.9838867, + "id": 29928, + "logprob": 0.0, "special": false, - "text": "re" + "text": "D" }, { - "id": 3211, + "id": 29897, "logprob": 0.0, "special": false, - "text": " address" + "text": ")" }, { - "id": 292, - "logprob": 0.0, + "id": 338, + "logprob": -0.6040039, "special": false, - "text": "ing" + "text": " is" }, { - "id": 263, - "logprob": -0.15124512, + "id": 385, + "logprob": -0.1796875, "special": false, - "text": " a" + "text": " an" } ], "top_tokens": null }, - "generated_text": "What is gradient descent?\n\nHello! It seems you're addressing a" + "generated_text": "What is gradient descent?\nGradient Descent (GD) is an" } diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json index f1f81152c46..2c977d8b2be 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json @@ -12,32 +12,32 @@ }, { "id": 338, - "logprob": -0.7133789, + "logprob": -0.6201172, "text": "is" }, { "id": 16030, - "logprob": -13.9296875, + "logprob": -13.6484375, "text": "gradient" }, { "id": 26815, - "logprob": -0.048919678, + "logprob": -0.003894806, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6386719, "text": "?" 
}, { "id": 13, - "logprob": -2.8105469, + "logprob": -6.46875, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -45,68 +45,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.017028809, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0028476715, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023971558, + "logprob": -0.00097084045, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.23840332, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17602539, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.000116467476, + "id": 15574, + "logprob": -0.6582031, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.00092840195, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027871132, + "id": 297, + "logprob": -0.18933105, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -121,32 +121,32 @@ }, { "id": 338, - "logprob": -0.7128906, + "logprob": -0.6113281, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.6640625, "text": "gradient" }, { "id": 26815, - "logprob": -0.05053711, + "logprob": -0.003929138, "text": "descent" }, { "id": 29973, - "logprob": -3.0058594, + "logprob": -2.625, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.484375, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -154,68 +154,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.018859863, + "logprob": -0.009017944, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.002822876, + "logprob": -9.536743e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.00097084045, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17126465, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.0001155138, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.0009279251, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027036667, + "id": 297, + "logprob": -0.18933105, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -230,32 +230,32 @@ }, { "id": 338, - "logprob": -0.71484375, + "logprob": -0.609375, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.671875, "text": "gradient" }, { "id": 26815, - "logprob": -0.049346924, + "logprob": -0.0040016174, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6230469, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.453125, "text": "\n" }, { "id": 13, - "logprob": -0.86328125, + "logprob": -6.6875, "text": "\n" } ], @@ -263,68 +263,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.017196655, + "logprob": -0.008956909, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0028438568, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.026558e-05, + "logprob": -0.0003721714, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17602539, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011622906, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.48608398, + "id": 1304, + "logprob": -0.00092601776, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027894974, + "id": 297, + "logprob": -0.19177246, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -339,32 +339,32 @@ }, { "id": 338, - "logprob": -0.7192383, + "logprob": -0.609375, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.6640625, "text": "gradient" }, { "id": 26815, - "logprob": -0.050445557, + "logprob": -0.0038967133, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6347656, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.453125, "text": "\n" }, { "id": 13, - "logprob": -0.8276367, + "logprob": -6.6875, "text": "\n" } ], @@ -372,67 +372,67 @@ "tokens": [ { "id": 25584, - "logprob": -0.01727295, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0027542114, + "logprob": -9.536743e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.00038409233, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17126465, + "id": 13883, + "logprob": -0.010414124, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.00024354458, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011301041, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.48608398, + "id": 1304, + "logprob": -0.0009279251, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027894974, + "id": 297, + "logprob": -0.19470215, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" } ] diff --git a/integration-tests/models/test_flash_phi35_moe.py b/integration-tests/models/test_flash_phi35_moe.py index 2173740ad3c..d3043b028a8 100644 --- a/integration-tests/models/test_flash_phi35_moe.py +++ b/integration-tests/models/test_flash_phi35_moe.py @@ -25,7 +25,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): assert response.details.generated_tokens == 10 assert ( response.generated_text - == "Gradient descent is a first-order optimization algorithm" + == "Gradient descent is an optimization algorithm commonly used in" ) assert response == response_snapshot @@ -33,7 +33,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): @pytest.mark.asyncio async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot): response = await flash_phi35_moe.generate( - "What is gradient descent?\n\n", + "What is gradient descent?\n", max_new_tokens=10, repetition_penalty=1.2, return_full_text=True, @@ -51,7 +51,7 @@ async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot): assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is gradient descent?\n\nHello! It seems you're addressing a" + == "What is gradient descent?\nGradient Descent (GD) is an" ) assert response == response_snapshot @@ -66,7 +66,7 @@ async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_sna assert responses[0].details.generated_tokens == 10 assert ( responses[0].generated_text - == "Gradient descent is a first-order optimization algorithm" + == "Gradient descent is an optimization algorithm commonly used in" ) assert all( [r.generated_text == responses[0].generated_text for r in responses]