perf: Python binding inference performance improvement #426

Open · wants to merge 10 commits into base: main
134 changes: 15 additions & 119 deletions python/test/test_api.py
@@ -1,4 +1,4 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -90,24 +90,18 @@ def test_create_request(self, server_options):
request = tritonserver.InferenceRequest(server.model("test"))


class TestAllocators:
class MockMemoryAllocator(tritonserver.MemoryAllocator):
def __init__(self):
pass

def allocate(self, *args, **kwargs):
raise Exception("foo")

class TestOutputMemory:
@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
def test_memory_fallback_to_cpu(self, server_options):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

assert server.ready()

allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

# The memory allocator is internal to the binding, and before GPU memory support
# is added, it will always fall back to CPU memory regardless of the memory
# preferred by the backend.
# TODO: Revisit this test when GPU memory support is added, i.e. the backend
# prefers GPU memory, but the system only has CPU memory.
server.load(
"test",
{
@@ -135,36 +129,10 @@ def test_memory_fallback_to_cpu(self, server_options):
fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
assert fp16_input[0][0] == fp16_output[0][0]

tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator

def test_memory_allocator_exception(self, server_options):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

assert server.ready()

server.load(
"test",
{
"config": json.dumps(
{
"backend": "python",
"parameters": {"decoupled": {"string_value": "False"}},
}
)
},
)

with pytest.raises(tritonserver.InternalError):
for response in server.model("test").infer(
inputs={
"string_input": tritonserver.Tensor.from_string_array([["hello"]])
},
output_memory_type="gpu",
output_memory_allocator=TestAllocators.MockMemoryAllocator(),
):
pass

def test_unsupported_memory_type(self, server_options):
# TODO: Revisit this test when GPU memory support is added, i.e. the request
# specifies that output should be in GPU memory, but the system only has CPU
# memory, in which case an exception should be raised during inference.
server = tritonserver.Server(server_options).start(wait_until_ready=True)

assert server.ready()
@@ -181,91 +149,15 @@ def test_unsupported_memory_type(self, server_options):
},
)

if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators:
allocator = tritonserver.default_memory_allocators[
tritonserver.MemoryType.GPU
]

del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
else:
allocator = None

with pytest.raises(tritonserver.InvalidArgumentError):
for response in server.model("test").infer(
inputs={
"string_input": tritonserver.Tensor.from_string_array([["hello"]])
},
output_memory_type="gpu",
output_memory_type="unsupported",
):
pass

if allocator is not None:
tritonserver.default_memory_allocators[
tritonserver.MemoryType.GPU
] = allocator

@pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
def test_allocate_on_cpu_and_reshape(self):
allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]

memory_buffer = allocator.allocate(
memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200
)

cpu_array = memory_buffer.owner

assert memory_buffer.size == 200

fp32_size = int(memory_buffer.size / 4)

tensor = tritonserver.Tensor(
tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
)

cpu_fp32_array = numpy.from_dlpack(tensor)
assert cpu_array.ctypes.data == cpu_fp32_array.ctypes.data
assert cpu_fp32_array.dtype == numpy.float32
assert cpu_fp32_array.nbytes == 200

@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
@pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
def test_allocate_on_gpu_and_reshape(self):
allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

memory_buffer = allocator.allocate(
memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200
)

gpu_array = memory_buffer.owner

gpu_array = cupy.empty([10, 20], dtype=cupy.uint8)
memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

assert memory_buffer.size == 200

fp32_size = int(memory_buffer.size / 4)

tensor = tritonserver.Tensor(
tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
)

gpu_fp32_array = cupy.from_dlpack(tensor)
assert (
gpu_array.__cuda_array_interface__["data"][0]
== gpu_fp32_array.__cuda_array_interface__["data"][0]
)

assert gpu_fp32_array.dtype == cupy.float32
assert gpu_fp32_array.nbytes == 200

torch_fp32_tensor = torch.from_dlpack(tensor)
assert torch_fp32_tensor.dtype == torch.float32
assert (
torch_fp32_tensor.data_ptr()
== gpu_array.__cuda_array_interface__["data"][0]
)
assert torch_fp32_tensor.nbytes == 200


class TestTensor:
@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
@@ -418,6 +310,10 @@ def test_ready(self, server_options):
server = tritonserver.Server(server_options).start()
assert server.ready()

@pytest.mark.xfail(
run=False,
reason="Some request/response object may not be released which may cause server stop to fail",
)
def test_stop(self, server_options):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

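For context, a minimal sketch of the inference path that the remaining test_memory_fallback_to_cpu test exercises, assembled from the calls visible in this diff (the "test" model name, the inline Python-backend config, the string_input tensor, and output_memory_type all come from the tests above). The tritonserver.Options construction and the model repository path are assumptions, since the server_options fixture is defined outside the hunks shown here.

import json

import tritonserver

# Assumption: server_options normally comes from a pytest fixture; a plain
# Options object pointing at a hypothetical model repository stands in here.
server_options = tritonserver.Options(model_repository="/workspace/models")

# Start an in-process server and block until it reports ready.
server = tritonserver.Server(server_options).start(wait_until_ready=True)
assert server.ready()

# Load the Python-backend "test" model with an inline config, as the tests do.
server.load(
    "test",
    {
        "config": json.dumps(
            {
                "backend": "python",
                "parameters": {"decoupled": {"string_value": "False"}},
            }
        )
    },
)

# Request GPU output; before GPU memory support is added, the binding falls
# back to CPU memory, which is what test_memory_fallback_to_cpu verifies.
for response in server.model("test").infer(
    inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])},
    output_memory_type="gpu",
):
    # Outputs are DLPack-compatible tensors, e.g. numpy.from_dlpack(response.outputs[...]).
    pass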