Benchmark >2048 token sequence prompts in batches
turboderp committed Jun 11, 2023
1 parent b65d774 · commit 896da5d
Showing 2 changed files with 13 additions and 2 deletions.
4 changes: 4 additions & 0 deletions model.py
@@ -705,6 +705,10 @@ def __init__(self, config):
                                  temp_zeros_float,
                                  temp_dq)
 
+        # Clear the cache
+
+        torch.cuda.empty_cache()
+
 
     def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False):
 
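For context on the model.py change: PyTorch's caching allocator keeps freed blocks reserved for reuse, so the temporary buffers released at the end of __init__ (such as temp_dq above) would otherwise stay counted against the device until the process exits. Below is a minimal standalone sketch, not part of the commit, showing that torch.cuda.empty_cache() shrinks reserved memory without touching live tensors:

# Sketch only: demonstrates the allocator behavior empty_cache() addresses.
import torch

if torch.cuda.is_available():
    scratch = torch.empty(256 * 1024 * 1024, dtype = torch.uint8, device = "cuda")
    del scratch  # the freed block stays cached by PyTorch's allocator

    print(f"reserved before: {torch.cuda.memory_reserved() / 2**20:.0f} MiB")
    torch.cuda.empty_cache()  # return cached blocks to the driver
    print(f"reserved after:  {torch.cuda.memory_reserved() / 2**20:.0f} MiB")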
11 changes: 9 additions & 2 deletions test_benchmark_inference.py
@@ -33,7 +33,14 @@ def begin():
 def next_logits(input_ids, last_id_only = True):
     global model, cache
 
-    return model.forward(input_ids, cache, last_id_only)
+    n_logits = None
+    a = 0
+    while a < input_ids.shape[-1]:
+        b = min(input_ids.shape[-1], a + 2048)
+        n_logits = model.forward(input_ids[:, a:b], cache, last_id_only)
+        a = b
+
+    return n_logits
 
 
 def tokenize(text):
@@ -121,7 +128,7 @@ def mem(name, total = False):
 
 # Warming up apparently makes a huge difference
 
-for i in range(1, 4):
+for i in range(1, 3):
     print(f" -- Warmup pass {i}...")
     begin()
     logits = timer("Warmup", lambda: next_logits(ids))
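For context on the next_logits change: the prompt is now consumed in 2048-token slices, with the model's cache carrying attention state between forward calls, so prompts longer than a single 2048-token pass can still be benchmarked, and only the final slice's logits are returned. Below is a minimal standalone sketch of the same chunking pattern; the helper chunked_forward and its dummy forward callback are hypothetical stand-ins, not exllama API, for model.forward(input_ids[:, a:b], cache, last_id_only):

# Sketch only: chunked_forward is a hypothetical stand-in, not exllama API.
def chunked_forward(total_len, forward, chunk = 2048):
    logits = None
    a = 0
    while a < total_len:
        b = min(total_len, a + chunk)  # slice end, capped at the prompt length
        logits = forward(a, b)         # stands in for model.forward on input_ids[:, a:b]
        a = b
    return logits

slices = []
chunked_forward(5000, lambda a, b: slices.append((a, b)) or (a, b))
print(slices)  # [(0, 2048), (2048, 4096), (4096, 5000)]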
