diff --git a/backends/client/src/lib.rs b/backends/client/src/lib.rs
index 7fd010ae7f6..cec820f6b89 100644
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@@ -83,7 +83,7 @@ impl ChunksToString for Vec {
                 data,
                 mimetype,
                 width,
-                height,
+                height: _,
                 frames: _,
             })) => {
                 // TODO: revisit if we should limit video support to v3 - to avoid sending very large base64 strings
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index a1c7a02a560..587ef5b4f0c 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -243,14 +243,16 @@ def batch_tokenized_inputs(
                         )
                         num_bytes = len(video_frame_buf)
                         bytes_per_frame = num_bytes // chunk.video.frames
-
+                        # iterate over with a stride the size of a frame
                         frames = []
                         for i in range(chunk.video.frames):
                             frame = video_frame_buf[
                                 i * bytes_per_frame : (i + 1) * bytes_per_frame
                             ]
-                            frame = frame.reshape(chunk.video.height, chunk.video.width, 3)
+                            frame = frame.reshape(
+                                chunk.video.height, chunk.video.width, 3
+                            )
                             frames.append(frame)
                         video_frame_buf = np.stack(frames)