Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

Closed
pozulp opened this issue Oct 26, 2022 · 5 comments
Closed

lld hangs targeting amdgpu and so does opt using amdgpu-attributor pass #58639

pozulp opened this issue Oct 26, 2022 · 5 comments

Comments

@pozulp
Copy link
Member

pozulp commented Oct 26, 2022

The following hangs for me, opt --amdgpu-attributor hang.ll -o foo.bc where hang.ll is

; ModuleID = 'reduced.ll'
source_filename = "reduced.ll"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"

%0 = type { %1, i32, i32, i32, %2, i32, [4 x i8], %3, i32, i32, double, i8, i8, %4, [3 x [3 x double]], %5*, %22* }
%1 = type { i32 (...)** }
%2 = type { %1, double, double, double, double, double, double, double, double }
%3 = type <{ i8*, i64, i64, i32, [4 x i8] }>
%4 = type { double, double, double }
%5 = type { %3, i8, i32, %6, %7, i32, i32, i32, i32, double, i8, i8, i8, %4, [3 x [3 x double]], double, double, double, i32, [4 x i8], %3, i32, i32, i8, i32, %8, i32, %0*, %3, i8*, double (double, double, double, double, double, double, double, double, double)*, %10* }
%6 = type { i32, double, double, double }
%7 = type { i32, double* }
%8 = type { %1, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %3, %9, %9, %9, %9, %9, %9, %9, %9, %9, %9 }
%9 = type <{ double*, i64, i64, i32, [4 x i8] }>
%10 = type { i64, %11* }
%11 = type { %12, i8*, i64, i64, void (%10*)*, i64, %10* (%10*, i8*)*, i32 (%10*, i8*, %10*)*, %13*, %10* (%10*)*, %14*, %15*, %16*, i64 (%10*)*, %10* (%10*, %10*, %10*)*, %10* (%10*)*, %10* (%10*, %10*)*, i32 (%10*, %10*, %10*)*, %17*, i64, i8*, i32 (%10*, i32 (%10*, i8*)*, i8*)*, i32 (%10*)*, %10* (%10*, %10*, i32)*, i64, %10* (%10*)*, %10* (%10*)*, %19*, %20*, %21*, %11*, %10*, %10* (%10*, %10*, %10*)*, i32 (%10*, %10*, %10*)*, i64, i32 (%10*, %10*, %10*)*, %10* (%11*, i64)*, %10* (%11*, %10*, %10*)*, void (i8*)*, i32 (%10*)*, %10*, %10*, %10*, %10*, %10*, void (%10*)*, i32, void (%10*)*, %10* (%10*, %10**, i64, %10*)* }
%12 = type { %10, i64 }
%13 = type { %10* (%10*)*, %10* (%10*)*, %10* (%10*)* }
%14 = type { %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*, %10*)*, %10* (%10*)*, %10* (%10*)*, %10* (%10*)*, i32 (%10*)*, %10* (%10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*)*, i8*, %10* (%10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*)*, %10* (%10*, %10*)*, %10* (%10*, %10*)* }
%15 = type { i64 (%10*)*, %10* (%10*, %10*)*, %10* (%10*, i64)*, %10* (%10*, i64)*, i8*, i32 (%10*, i64, %10*)*, i8*, i32 (%10*, %10*)*, %10* (%10*, %10*)*, %10* (%10*, i64)* }
%16 = type { i64 (%10*)*, %10* (%10*, %10*)*, i32 (%10*, %10*, %10*)* }
%17 = type { i32 (%10*, %18*, i32)*, void (%10*, %18*)* }
%18 = type { i8*, %10*, i64, i64, i32, i32, i8*, i64*, i64*, i64*, i8* }
%19 = type { i8*, %10* (%10*, %10*)*, i32, i8* }
%20 = type { i8*, i32, i64, i32, i8* }
%21 = type { i8*, %10* (%10*, i8*)*, i32 (%10*, %10*, i8*)*, i8*, i8* }
%22 = type { %3, i32, %23, %24, i32, i32, %3 }
%23 = type { i32, i64, i64, i64 }
%24 = type { i64, i64* }

; @widget: loads the %5* stored in field 15 of the %0 object pointed to by
; %arg and passes it to @baz. The double returned by @baz is unused and the
; function always returns false. @baz in turn calls @widget, so the two
; functions are mutually recursive.
define internal fastcc i1 @widget(%0* %arg) {
bb:
  ; Address of field 15 of %0 (declared as %5* in the %0 type above).
  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 15
  %tmp1 = load %5*, %5** %tmp, align 8
  ; Result is never used.
  %tmp2 = call fastcc double @baz(%5* %tmp1)
  ret i1 false
}

; @baz: performs one indirect call through the function pointer stored in
; field 30 of the %5 object, then enters a loop with no exit that repeatedly
; reloads the %0* in field 27 and passes it to @widget. There is no `ret`, so
; the function never returns; @widget calls back into @baz (mutual recursion).
define internal fastcc double @baz(%5* %arg) {
bb:
  ; Field 30 of %5 is a pointer to a function taking nine doubles and
  ; returning double.
  %tmp = getelementptr inbounds %5, %5* %arg, i64 0, i32 30
  %tmp1 = load double (double, double, double, double, double, double, double, double, double)*, double (double, double, double, double, double, double, double, double, double)** %tmp, align 8
  ; Indirect call with all nine arguments set to 0.0; the result is unused.
  %tmp2 = tail call double %tmp1(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00)
  br label %bb3

bb3:                                              ; preds = %bb
  ; Address of field 27 of %5 (a %0*), computed once before the loop.
  %tmp4 = getelementptr inbounds %5, %5* %arg, i64 0, i32 27
  br label %bb5

bb5:                                              ; preds = %bb5, %bb3
  ; Loop body: reload the %0* each iteration, call @widget, branch back to
  ; bb5 unconditionally.
  %tmp6 = load %0*, %0** %tmp4, align 8
  %tmp7 = call fastcc i1 @widget(%0* %tmp6)
  br label %bb5
}
@llvmbot
Copy link
Member

llvmbot commented Oct 26, 2022

@llvm/issue-subscribers-backend-amdgpu

@jdoerfert
Copy link
Member

Are you sure llvm-reduce didn't break it? This is all dead code, it's just deleted: https://godbolt.org/z/4n56Gvf36

@arsenm
Copy link
Contributor

arsenm commented Oct 27, 2022

It's still broken whether or not the code is functional, but the deadness is a red herring. If there is a real use, I still observe the hang / stack overflow. I cut down the test slightly, but I don't see this reproduce with tip of tree. Probably need to bisect this to see if it was deliberately fixed

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"

%0 = type { double()*, %0* }

; @widget: loads the %0* stored in field 1 of the %0 object pointed to by
; %arg (%0 is self-referential: { double()*, %0* }) and passes it to @baz.
; The result of @baz is unused and the function always returns false.
; @baz calls @widget back, so the two functions are mutually recursive.
define internal fastcc i1 @widget(%0* %arg) {
bb:
  ; Address of field 1 of %0, the %0* link to another %0 object.
  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 1
  %tmp1 = load %0*, %0** %tmp, align 8
  ; Result is never used.
  %tmp2 = call fastcc double @baz(%0* %tmp1)
  ret i1 false
}

; @baz: performs one indirect call through the function pointer stored in
; field 0 of the %0 object, then enters a loop with no exit that repeatedly
; reloads the %0* in field 1 and passes it to @widget. There is no `ret`, so
; the function never returns; @widget calls back into @baz (mutual recursion).
define internal fastcc double @baz(%0* %arg) {
bb:
  ; Field 0 of %0 is a pointer to a function of type double().
  %tmp = getelementptr inbounds %0, %0* %arg, i64 0, i32 0
  %tmp1 = load double ()*, double ()** %tmp, align 8
  ; Indirect call; the result is unused.
  %tmp2 = tail call double %tmp1()
  br label %bb3

bb3:                                              ; preds = %bb
  ; Address of field 1 of %0 (the %0* link), computed once before the loop.
  %tmp4 = getelementptr inbounds %0, %0* %arg, i64 0, i32 1
  br label %bb5

bb5:                                              ; preds = %bb5, %bb3
  ; Loop body: reload the %0* each iteration, call @widget, branch back to
  ; bb5 unconditionally.
  %tmp6 = load %0*, %0** %tmp4, align 8
  %tmp7 = call fastcc i1 @widget(%0* %tmp6)
  br label %bb5
}

; @entry: amdgpu_kernel that gives @baz a real (non-dead) caller. It allocates
; an uninitialized %0 in address space 5 (the alloca address space per "A5"
; in the datalayout), casts the pointer to the default address space and
; passes it to @baz.
define amdgpu_kernel void @entry() {
  %alloca = alloca %0, align 8, addrspace(5)
  %cast = addrspacecast %0 addrspace(5)* %alloca to %0*
  ; NOTE(review): this call uses the default calling convention, while @baz
  ; is defined as fastcc. Left as posted because this is the reproducer;
  ; confirm whether the mismatch is intentional.
  %arst = call double @baz(%0* %cast)
  ret void
}


@pozulp
Copy link
Member Author

pozulp commented Oct 31, 2022

I bisected with @arsenm and we found that @jdoerfert fixed the hang in bf789b1. Below I describe how I created this reproducer and bisected.

My llvm installation

I am using rocm to target mi250x on linux. My code is written in c++ and uses hip. I can build with rocm 5.2.3, but when I try 5.3.0 my lld process hangs.

Making a reproducer

I attached gdb to the lld process. I cut off the stacktrace at 1 million frames. Many were llvm::AA::getAssumedUnderlyingObjects. I re-ran with --save-temps. The last bitcode file emitted was precodegen.bc. I ran opt --amdgpu-attributor precodegen.bc which reproduced the hang.

Minimizing the reproducer

My reproducer was almost 400,000 lines long (llvm-dis < precodegen.bc|wc -l). I wrote this "test" script [1] which I called repro.sh

#!/bin/bash -x
# Interestingness test for llvm-reduce.
#
# Runs `opt --amdgpu-attributor` on the file(s) given as arguments with a
# 10 second limit and exits 0 ("interesting") only when opt was killed by the
# timeout, i.e. the hang reproduced. Any other outcome exits 1.
#
# Usage: repro.sh <input>   (all arguments are forwarded to opt)

timeout 10 opt --amdgpu-attributor "$@" -o foo.bc
# Capture the status right away: every later command, including the `[ ... ]`
# tests below, overwrites $?. Testing $? a second time in the `elif` would
# inspect the result of the first `[`, not of opt.
status=$?

if [ "$status" -eq 124 ]; then
  # 124 is the exit status timeout(1) uses when the time limit was hit.
  echo "TIMED OUT"
  exit 0
elif [ "$status" -eq 0 ]; then
  echo "SUCCESS"
  exit 1
else
  # opt failed for some other reason (e.g. invalid input or a crash).
  echo "INVALID"
  exit 1
fi

and ran llvm-reduce --test=./repro.sh precodegen.bc --write-tmp-files-as-bitcode. (I first ran without --write-tmp-files-as-bitcode and the process terminated in 2 seconds. This was suspicious because I set a timeout of 10 seconds. In the output I saw opt failed with the error 'wrong number of indexes'. Switching from text to bitcode for intermediate files fixed the problem.)

After about 1 hour, llvm-reduce terminated. It output a reduced.ll that was only 51 lines, an 8000x improvement! I ran opt -strip -metarenamer -instnamer reduced.ll -S -o hang.ll to create the minimum reproducer that I posted when I opened this issue.

Running git bisect

Thus far I had only run tools in my rocm install's llvm/bin directory. As @jdoerfert noted, upstream opt does not hang. I built the default branch (amd-stg-open) of RadeonOpenCompute's llvm-project fork and opt did not hang. I built the rocm-5.3.x branch and opt hung. Thus, there is a commit on amd-stg-open which fixes the hang. Here is how I ran git bisect to determine said commit

I created a build space (because each step of the bisection needs to build and run opt).

# Fetch the AMD fork and configure a separate Debug build tree for it.
git clone https://github.com/RadeonOpenCompute/llvm-project.git amd-llvm-project
mkdir build_llvm
pushd build_llvm

# Backends to build; the ';' separated list is passed to CMake as one value.
targets='X86;AMDGPU'

# CMake configuration flags, one per array element.
cmake_args=(
    -G Ninja
    -DCMAKE_BUILD_TYPE=Debug
    -DCMAKE_C_COMPILER=gcc
    -DCMAKE_CXX_COMPILER=g++
    -DLLVM_USE_LINKER=gold
    "-DLLVM_TARGETS_TO_BUILD=${targets}"
    -DLLVM_PARALLEL_LINK_JOBS=18
    -DLLVM_ENABLE_ASSERTIONS=ON
)

# Time the configure step and keep a copy of its output (stdout and stderr).
time cmake "${cmake_args[@]}" ../amd-llvm-project/llvm 2>&1 | tee cmake_amdgpubackend.out

popd

I started the bisection with

# `git bisect start <bad> <good>`: the first revision is the "bad" end and the
# second the "good" end. The search is for the commit that FIXED the hang, so
# the labels are inverted: origin/amd-stg-open (no hang) is "bad" and
# origin/rocm-5.3.x (hangs) is "good".
git -C amd-llvm-project bisect start origin/amd-stg-open origin/rocm-5.3.x
# Submit the Slurm batch script that drives `git bisect run`.
sbatch sbatch1.sh

where sbatch1.sh is the slurm batch script

#!/bin/bash
#SBATCH -t 4:0:0
#SBATCH -N 1 --exclusive
#SBATCH -J bisect
#SBATCH -e /path/to/rundir/sbatch1-%A.e
#SBATCH -o /path/to/rundir/sbatch1-%A.o

# Slurm batch script that drives the bisection: for every candidate commit,
# `git bisect run` invokes bisect.sh and uses its exit status to classify the
# commit.

# Stop if the checkout is missing rather than running git somewhere else.
cd /path/to/rundir/ || exit 1
cd amd-llvm-project || exit 1

# We are already inside the checkout, so no `-C amd-llvm-project` here: with
# it git would look for a nested amd-llvm-project/amd-llvm-project directory.
# NOTE(review): ./bisect.sh is resolved relative to the checkout directory;
# confirm that is where the script lives.
git bisect run ./bisect.sh

and bisect.sh builds and runs opt

#!/bin/bash
# Step script for `git bisect run`: rebuild opt at the commit currently
# checked out and test whether it still hangs on hang.ll.
#
# Exit status, as interpreted by git bisect:
#   0    "good" - opt timed out (the hang is still present)
#   1    "bad"  - opt finished or failed without timing out
#   125  skip   - this commit could not be built, so it cannot be tested
#   255  abort  - the build directory is missing

# A status above 127 makes git bisect stop instead of mislabelling commits.
cd /path/to/rundir/build_llvm || exit 255

# Without this check a failed build would test a stale (or missing) opt
# binary and the commit would be classified incorrectly.
if ! time ninja opt; then
  echo "BUILD FAILED, skip, exit 125"
  exit 125
fi

timeout 10 ./bin/opt --amdgpu-attributor ../hang.ll -o foo.bc
# Capture the status right away: every later command, including the `[ ... ]`
# tests below, overwrites $?.
status=$?

# Timing out is "good" because I am looking for the fix not the bug
# good: 0
if [ "$status" -eq 124 ]; then
  echo "TIMED OUT, good, exit 0"
  exit 0
elif [ "$status" -eq 0 ]; then
  echo "NO TIME OUT, bad, exit 1"
  exit 1
else
  echo "INVALID, bad, exit 1"
  exit 1
fi

I ran the job on a system with 2 18-core broadwell sockets and 128 GB of DRAM per node. The job finished after about 2 hours. I ran git -C amd-llvm-project bisect log and saw

# bad: [092984937f2778f7e5440e69788a1d9542f7f0fc] merge main into amd-stg-open
# good: [3cf23f77f8208174a2ee7c616f4be23674d7b081] [SROA] Try harder to find a vector promotion viable type when rewriting
git bisect start 'origin/amd-stg-open' 'origin/rocm-5.3.x'
# good: [34b6327ae8a19989a631f1842b5ce6dd3cab7c92] merge main into amd-stg-open
git bisect good 34b6327ae8a19989a631f1842b5ce6dd3cab7c92
# bad: [86bc4587e1fdb7b1b90eadc138619f5e3f2dd6fd] Use std::clamp (NFC)
git bisect bad 86bc4587e1fdb7b1b90eadc138619f5e3f2dd6fd
# bad: [4d9251bd780d20eebbcb124608b36a69787d5575] [C++20] [Modules] Merge same concept decls in global module fragment
git bisect bad 4d9251bd780d20eebbcb124608b36a69787d5575
# good: [3139cc766c86b09426893a7349763c347639cbdc] [mlir][Linalg] Add a pattern to decompose `linalg.generic` ops.
git bisect good 3139cc766c86b09426893a7349763c347639cbdc
# bad: [44f81dfba407c82589abbb5867714ad030d1b80c] Remove references to old mailing lists that have moved to discourse. Replace with links to discourse.
git bisect bad 44f81dfba407c82589abbb5867714ad030d1b80c
# bad: [3f73c5793515867935d59ff8c511c61ace848e79] Argument name support for function pointer signature hints
git bisect bad 3f73c5793515867935d59ff8c511c61ace848e79
# good: [8aad330eebc0b9cfd8dd00e8ed692cb89e7577df] [libc] Fix API for remove_{prefix, suffix}
git bisect good 8aad330eebc0b9cfd8dd00e8ed692cb89e7577df
# good: [4baf8f092b47f4f31bda96a7acb7169d389c96fd] [AMDGPU] Pre-commit tests for D129759
git bisect good 4baf8f092b47f4f31bda96a7acb7169d389c96fd
# good: [f6017abb602780d81be928c93ec6afe74752f613] [lld-macho] Support folding of functions with identical LSDAs
git bisect good f6017abb602780d81be928c93ec6afe74752f613
# good: [374db8fc2e49f7d627e8942681d467422641f4b2] [gn build] (manually) port c91ce941448 (HTMLForestResources.inc)
git bisect good 374db8fc2e49f7d627e8942681d467422641f4b2
# bad: [1cf6b93df168fea81e3ca7c6c3c9fcaaf82c7785] Revert "[Local] Allow creating callbr with duplicate successors"
git bisect bad 1cf6b93df168fea81e3ca7c6c3c9fcaaf82c7785
# good: [f1243fa1933fdbcf292f134e0628604c4b9e5487] [LV] Autogen a partially autogened test for ease of update
git bisect good f1243fa1933fdbcf292f134e0628604c4b9e5487
# bad: [d2c0572b2efef6c71d13bb579ac50f2d3dd8e76e] [mlir] Flip LinAlg dialect to _Both
git bisect bad d2c0572b2efef6c71d13bb579ac50f2d3dd8e76e
# bad: [95401b015393b350f826d097cc5b45b6a604dfa5] Revert "[x86] use zero-extending load of a byte outside of loops too"
git bisect bad 95401b015393b350f826d097cc5b45b6a604dfa5
# bad: [bf789b1957efd2482e1dbd164d91a6612a450fe3] [Attributor] Replace AAValueSimplify with AAPotentialValues
git bisect bad bf789b1957efd2482e1dbd164d91a6612a450fe3
# first bad commit: [bf789b1957efd2482e1dbd164d91a6612a450fe3] [Attributor] Replace AAValueSimplify with AAPotentialValues

Thus, bf789b1 fixed the hang.

Action items

@arsenm will ensure that bf789b1 makes it into the next rocm release
@arsenm will close this issue

(I tried to cherry-pick bf789b1 on the rocm-5.3.x branch of RadeonOpenCompute's llvm-project fork but I got a conflict.)

[1] rust-lang/rust#66036

@arsenm
Copy link
Contributor

arsenm commented Nov 1, 2022

Pushed testcase in bcedeef

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

5 participants