Auto-inserted deallocation of GPU memory leads to crash #664

fschlimb · 2023-10-18T16:30:08Z

The following IR crashes when run with the pipeline below: the memref returned from the function has already been deallocated by the auto-inserted GPU deallocation by the time the caller uses it. It works fine when generation of the gpu-dealloc op/call is omitted.

// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/ptensor-gpu.pp --runner imex-cpu-runner -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck --O3
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/ptensor-gpu.pp \
// RUN:                                        --runner imex-cpu-runner -e main \
// RUN:                                        --entry-point-result=void \
// RUN:                                        --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
// 2-D identity indexing map used by the linalg.generic fill in @ddpt_jit.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // External print helpers; only the i32 variant is called in this module.
  func.func private @printMemrefI32(tensor<*xi32>)
  func.func private @printMemrefF32(tensor<*xf32>)

  // Entry point: calls @ddpt_jit and prints the second returned memref
  // (the 16x16 one). That read of the returned buffer is the use the issue
  // reports as failing once a dealloc has been inserted in the callee.
  func.func @main() {
    %0:4 = call @ddpt_jit() : () -> (memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>)
    // Only result #1 is consumed; the other three results are unused here.
    %1 = bufferization.to_tensor %0#1 : memref<?x?xi32, strided<[?, ?], offset: ?>>
    %cast = tensor.cast %1 : tensor<?x?xi32> to tensor<*xi32>
    call @printMemrefI32(%cast) : (tensor<*xi32>) -> ()
    // Match the printed base address as an optional "0x" prefix followed by
    // hex digits. The character class must be [0-9a-f]; a class starting with
    // '-' would only accept '-', '9' and a-f and could match an empty string.
    // CHECK: Unranked Memref base@ = {{(0x)?[0-9a-f]*}}
    return
  }

  // Returns four memrefs: an empty 0x0 buffer, a 16x16 zero-filled buffer,
  // the same 0x0 buffer again, and a freshly allocated 2-element index buffer
  // holding zeros. The ranked results are cast to fully dynamic strided types.
  func.func @ddpt_jit() -> (memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>) attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %c0_i32 = arith.constant 0 : i32
    // Fill a 16x16 tensor with zeros via a parallel linalg.generic.
    // NOTE(review): presumably this is the op that gets offloaded and whose
    // result buffer becomes a GPU allocation under the pipeline — confirm.
    %0 = tensor.empty() : tensor<16x16xi32>
    %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%0 : tensor<16x16xi32>) {
    ^bb0(%out: i32):
      linalg.yield %c0_i32 : i32
    } -> tensor<16x16xi32>
    // Zero-sized tensor, returned twice (results #0 and #2).
    %2 = tensor.empty() : tensor<0x0xi32>
    %3 = bufferization.to_memref %2 : memref<0x0xi32>
    %cast = memref.cast %3 : memref<0x0xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
    // The filled tensor escapes the function as result #1, so its backing
    // buffer must stay alive after the return.
    %4 = bufferization.to_memref %1 : memref<16x16xi32>
    %cast_0 = memref.cast %4 : memref<16x16xi32> to memref<?x?xi32, strided<[?, ?], offset: ?>>
    // Two-element index buffer with both entries set to 0 (result #3).
    %alloc = memref.alloc() {alignment = 8 : i64} : memref<2xindex>
    memref.store %c0, %alloc[%c0] : memref<2xindex>
    memref.store %c0, %alloc[%c1] : memref<2xindex>
    return %cast, %cast_0, %cast, %alloc : memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<?x?xi32, strided<[?, ?], offset: ?>>, memref<2xindex>
  }
}

builtin.module(
// Pass pipeline applied to the whole module; passes run in the order listed.
// TOSA lowering
    func.func(tosa-make-broadcastable)
    func.func(tosa-to-linalg)
    func.func(tosa-to-tensor)
    canonicalize
    linalg-fuse-elementwise-ops
    arith-expand
    memref-expand
// Bufferization (tensor -> memref); note tensor-bufferize is listed twice,
// before and after linalg-detensorize.
    arith-bufferize
    func-bufferize
    func.func(empty-tensor-to-alloc-tensor)
    func.func(scf-bufferize)
    func.func(tensor-bufferize)
    func.func(bufferization-bufferize)
    func.func(linalg-bufferize)
    func.func(linalg-detensorize)
    func.func(tensor-bufferize)
    func.func(finalizing-bufferize)
    imex-remove-temporaries
// linalg -> scf.parallel loops
    func.func(convert-linalg-to-parallel-loops)
    func.func(scf-parallel-loop-fusion)
// GPU
    func.func(imex-add-outer-parallel-loop)
    func.func(gpu-map-parallel-loops)
    func.func(convert-parallel-loops-to-gpu)
// insert-gpu-allocs pass can have client-api = opencl or vulkan args
// NOTE(review): presumably this is the pass that emits the GPU alloc/dealloc
// pair discussed in the issue — confirm against the pass implementation.
    func.func(insert-gpu-allocs{client-api=opencl})
    canonicalize
    normalize-memrefs
// Unstride memrefs does not seem to be needed.
//  func.func(unstride-memrefs)
    func.func(lower-affine)
    gpu-kernel-outlining
    canonicalize
    cse
// The following set-spirv-* passes can have client-api = opencl or vulkan args
    set-spirv-capabilities{client-api=opencl}
    gpu.module(set-spirv-abi-attrs{client-api=opencl})
    canonicalize
    fold-memref-alias-ops
// GPU kernels -> SPIR-V, then serialize
    imex-convert-gpu-to-spirv
    spirv.module(spirv-lower-abi-attrs
             spirv-update-vce)
    func.func(llvm-request-c-wrappers)
    serialize-spirv
// Host side -> LLVM dialect; lower-affine runs a second time here, at module
// scope, after expand-strided-metadata.
    convert-gpu-to-gpux
    convert-func-to-llvm
    convert-math-to-llvm
    convert-gpux-to-llvm
    expand-strided-metadata
    lower-affine
    finalize-memref-to-llvm
    reconcile-unrealized-casts)

The text was updated successfully, but these errors were encountered:

fschlimb assigned silee2 and nbpatel Oct 18, 2023

fschlimb added the bug Something isn't working label Oct 18, 2023

fschlimb mentioned this issue Oct 20, 2023

insert gpu2host copies when returning views of gpuAlloced memrefs #666

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Auto-inserted deallocation of GPU memory leads to crash #664

Auto-inserted deallocation of GPU memory leads to crash #664

fschlimb commented Oct 18, 2023

Auto-inserted deallocation of GPU memory leads to crash #664

Auto-inserted deallocation of GPU memory leads to crash #664

Comments

fschlimb commented Oct 18, 2023