perf: Python binding inference performance improvement #426

Open · wants to merge 10 commits into base: main
134 changes: 15 additions & 119 deletions python/test/test_api.py
@@ -1,4 +1,4 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -90,24 +90,18 @@ def test_create_request(self, server_options):
request = tritonserver.InferenceRequest(server.model("test"))


class TestAllocators:
class MockMemoryAllocator(tritonserver.MemoryAllocator):
def __init__(self):
pass

def allocate(self, *args, **kwargs):
raise Exception("foo")

class TestOutputMemory:
@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
def test_memory_fallback_to_cpu(self, server_options):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

assert server.ready()

allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

# The memory allocator is internal to the binding, and before GPU memory support
# is added, it will always fall back to CPU memory regardless of the memory
# preferred by the backend.
# TODO: Revisit this test when GPU memory support is added, i.e. the backend
# prefers GPU memory, but the system only has CPU memory.
server.load(
"test",
{
@@ -135,36 +129,10 @@ def test_memory_fallback_to_cpu(self, server_options):
fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
assert fp16_input[0][0] == fp16_output[0][0]

tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator

def test_memory_allocator_exception(self, server_options):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

assert server.ready()

server.load(
"test",
{
"config": json.dumps(
{
"backend": "python",
"parameters": {"decoupled": {"string_value": "False"}},
}
)
},
)

with pytest.raises(tritonserver.InternalError):
for response in server.model("test").infer(
inputs={
"string_input": tritonserver.Tensor.from_string_array([["hello"]])
},
output_memory_type="gpu",
output_memory_allocator=TestAllocators.MockMemoryAllocator(),
):
pass

def test_unsupported_memory_type(self, server_options):
# TODO: Revisit this test when GPU memory support is added, i.e. the request
# specifies that output should be in GPU memory, but the system only has CPU
# memory, in which case an exception should be raised during inference.
server = tritonserver.Server(server_options).start(wait_until_ready=True)

assert server.ready()
@@ -181,91 +149,15 @@ def test_unsupported_memory_type(self, server_options):
},
)

if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators:
allocator = tritonserver.default_memory_allocators[
tritonserver.MemoryType.GPU
]

del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
else:
allocator = None

with pytest.raises(tritonserver.InvalidArgumentError):
for response in server.model("test").infer(
inputs={
"string_input": tritonserver.Tensor.from_string_array([["hello"]])
},
output_memory_type="gpu",
output_memory_type="unsupported",
):
pass

if allocator is not None:
tritonserver.default_memory_allocators[
tritonserver.MemoryType.GPU
] = allocator

@pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
def test_allocate_on_cpu_and_reshape(self):
allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]

memory_buffer = allocator.allocate(
memory_type=tritonserver.MemoryType.CPU, memory_type_id=0, size=200
)

cpu_array = memory_buffer.owner

assert memory_buffer.size == 200

fp32_size = int(memory_buffer.size / 4)

tensor = tritonserver.Tensor(
tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
)

cpu_fp32_array = numpy.from_dlpack(tensor)
assert cpu_array.ctypes.data == cpu_fp32_array.ctypes.data
assert cpu_fp32_array.dtype == numpy.float32
assert cpu_fp32_array.nbytes == 200

@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
@pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
def test_allocate_on_gpu_and_reshape(self):
allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]

memory_buffer = allocator.allocate(
memory_type=tritonserver.MemoryType.GPU, memory_type_id=0, size=200
)

gpu_array = memory_buffer.owner

gpu_array = cupy.empty([10, 20], dtype=cupy.uint8)
memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array)

assert memory_buffer.size == 200

fp32_size = int(memory_buffer.size / 4)

tensor = tritonserver.Tensor(
tritonserver.DataType.FP32, shape=[fp32_size], memory_buffer=memory_buffer
)

gpu_fp32_array = cupy.from_dlpack(tensor)
assert (
gpu_array.__cuda_array_interface__["data"][0]
== gpu_fp32_array.__cuda_array_interface__["data"][0]
)

assert gpu_fp32_array.dtype == cupy.float32
assert gpu_fp32_array.nbytes == 200

torch_fp32_tensor = torch.from_dlpack(tensor)
assert torch_fp32_tensor.dtype == torch.float32
assert (
torch_fp32_tensor.data_ptr()
== gpu_array.__cuda_array_interface__["data"][0]
)
assert torch_fp32_tensor.nbytes == 200


class TestTensor:
@pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
@@ -418,6 +310,10 @@ def test_ready(self, server_options):
server = tritonserver.Server(server_options).start()
assert server.ready()

@pytest.mark.xfail(
run=False,
reason="Some request/response object may not be released which may cause server stop to fail",
)
def test_stop(self, server_options):
server = tritonserver.Server(server_options).start(wait_until_ready=True)

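For context, a minimal sketch of the inference path that the remaining test_memory_fallback_to_cpu test exercises, assembled from the calls visible in this diff (the "test" model name, the inline Python-backend config, the string_input tensor, and output_memory_type all come from the tests above). The tritonserver.Options construction and the model repository path are assumptions, since the server_options fixture is defined outside the hunks shown here.

import json

import tritonserver

# Assumption: server_options normally comes from a pytest fixture; a plain
# Options object pointing at a hypothetical model repository stands in here.
server_options = tritonserver.Options(model_repository="/workspace/models")

# Start an in-process server and block until it reports ready.
server = tritonserver.Server(server_options).start(wait_until_ready=True)
assert server.ready()

# Load the Python-backend "test" model with an inline config, as the tests do.
server.load(
    "test",
    {
        "config": json.dumps(
            {
                "backend": "python",
                "parameters": {"decoupled": {"string_value": "False"}},
            }
        )
    },
)

# Request GPU output; before GPU memory support is added, the binding falls
# back to CPU memory, which is what test_memory_fallback_to_cpu verifies.
for response in server.model("test").infer(
    inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])},
    output_memory_type="gpu",
):
    # Outputs are DLPack-compatible tensors, e.g. numpy.from_dlpack(response.outputs[...]).
    pass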