Skip to content

Commit

Permalink
SWDEV-321398: replace hostcall module flag with function attribute
Browse files Browse the repository at this point in the history
This internal version is currently a squash of four upstream reviews:

1. D119087: [AMDGPU] [NFC] refactor the AMDGPU attributor
2. D119308: [AMDGPU] [NFC] Fix incorrect use of bitwise operator.
3. D119249: [Attributor][NFC] Expose new API in AAPointerInfo
4. D119216: [AMDGPU] replace hostcall module flag with function attribute

Of these #1, #2 and #3 are submitted in upstream/main, while #4 is
under review.

The module flag to indicate use of hostcall is insufficient to catch
all cases where hostcall might be in use by a kernel. This is now
replaced by a function attribute that gets propagated to top-level
kernel functions via their respective call-graph.

If the attribute "amdgpu-no-hostcall-ptr" is absent on a kernel, the
default behaviour is to emit kernel metadata indicating that the
kernel uses the hostcall buffer pointer passed as an implicit
argument.

The attribute may be placed explicitly by the user, or inferred by the
AMDGPU attributor by examining the call-graph. The attribute is
inferred only if the function is not being sanitized, and the
implicitarg_ptr does not result in a load of any byte in the hostcall
pointer argument.

Change-Id: I6cc12050602c3f477575c3ca09a883797169e9e3
  • Loading branch information
ssahasra committed Feb 11, 2022
1 parent 6935430 commit 030a405
Show file tree
Hide file tree
Showing 27 changed files with 623 additions and 348 deletions.
42 changes: 42 additions & 0 deletions llvm/include/llvm/Transforms/IPO/Attributor.h
Original file line number Diff line number Diff line change
Expand Up @@ -4592,6 +4592,48 @@ struct AAPointerInfo : public AbstractAttribute {
/// See AbstractAttribute::getIdAddr()
const char *getIdAddr() const override { return &ID; }

/// Helper to represent an access offset and size, with logic to deal with
/// uncertainty and check for overlapping accesses.
struct OffsetAndSize : public std::pair<int64_t, int64_t> {
  using BaseTy = std::pair<int64_t, int64_t>;
  OffsetAndSize(int64_t Offset, int64_t Size) : BaseTy(Offset, Size) {}
  OffsetAndSize(const BaseTy &P) : BaseTy(P) {}
  int64_t getOffset() const { return first; }
  int64_t getSize() const { return second; }
  static OffsetAndSize getUnknown() {
    return OffsetAndSize(Unknown, Unknown);
  }

  /// Return true if offset or size are unknown.
  bool offsetOrSizeAreUnknown() const {
    return getOffset() == OffsetAndSize::Unknown ||
           getSize() == OffsetAndSize::Unknown;
  }

  /// Return true if this offset and size pair might describe an address that
  /// overlaps with \p OAS.
  bool mayOverlap(const OffsetAndSize &OAS) const {
    // Any unknown value and we are giving up -> overlap.
    if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
      return true;

    // Check if one offset point is in the other half-open interval
    // [offset, offset+size).
    return OAS.getOffset() + OAS.getSize() > getOffset() &&
           OAS.getOffset() < getOffset() + getSize();
  }

  /// Constant used to represent unknown offset or sizes.
  /// NOTE: shift in int64_t; a plain `1 << 31` overflows 32-bit int, which
  /// is implementation-defined before C++20.
  static constexpr int64_t Unknown = int64_t(1) << 31;
};

/// Call \p CB on all accesses that might interfere with \p OAS and return
/// true if all such accesses were known and the callback returned true for
/// all of them, false otherwise. An access interferes with an offset-size
/// pair if it might read or write that memory region.
virtual bool forallInterferingAccesses(
OffsetAndSize OAS, function_ref<bool(const Access &, bool)> CB) const = 0;

/// Call \p CB on all accesses that might interfere with \p LI and return true
/// if all such accesses were known and the callback returned true for all of
/// them, false otherwise.
Expand Down
29 changes: 29 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
//===--- AMDGPUAttributes.def ---------------------------------*- C++ -*---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains descriptions of the various function attributes
// that indicate *absence* of the corresponding implicit kernel
// arguments.
//
// Usage: clients #define AMDGPU_ATTRIBUTE(ENUM_NAME, ATTRIBUTE_STRING)
// before including this file, which expands the macro once per attribute
// below; the macro is #undef'd at the end of this file so it can be
// redefined for each inclusion.
//
//===----------------------------------------------------------------------===//

// NOTE: NO INCLUDE GUARD DESIRED!

AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr")
AMDGPU_ATTRIBUTE(DISPATCH_ID, "amdgpu-no-dispatch-id")
AMDGPU_ATTRIBUTE(IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr")
AMDGPU_ATTRIBUTE(HOSTCALL_PTR, "amdgpu-no-hostcall-ptr")
AMDGPU_ATTRIBUTE(WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x")
AMDGPU_ATTRIBUTE(WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y")
AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z")
AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")

#undef AMDGPU_ATTRIBUTE
202 changes: 127 additions & 75 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
Expand All @@ -22,37 +23,25 @@

using namespace llvm;

#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
NOT_IMPLICIT_INPUT = 0,

// SGPRs
DISPATCH_PTR = 1 << 0,
QUEUE_PTR = 1 << 1,
DISPATCH_ID = 1 << 2,
IMPLICIT_ARG_PTR = 1 << 3,
WORKGROUP_ID_X = 1 << 4,
WORKGROUP_ID_Y = 1 << 5,
WORKGROUP_ID_Z = 1 << 6,

// VGPRS:
WORKITEM_ID_X = 1 << 7,
WORKITEM_ID_Y = 1 << 8,
WORKITEM_ID_Z = 1 << 9,
ALL_ARGUMENT_MASK = (1 << 10) - 1
#include "AMDGPUAttributes.def"
ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
StringLiteral> ImplicitAttrs[] = {
{DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
{QUEUE_PTR, "amdgpu-no-queue-ptr"},
{DISPATCH_ID, "amdgpu-no-dispatch-id"},
{IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
{WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
{WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
{WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
{WORKITEM_ID_X, "amdgpu-no-workitem-id-x"},
{WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"},
{WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"}
#include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
Expand Down Expand Up @@ -90,7 +79,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
// TODO: Does not require queue ptr on gfx9+
// TODO: Does not require the queue pointer on gfx9+
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
Expand All @@ -112,6 +101,17 @@ static bool isDSAddress(const Constant *C) {
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
// Sanitizers require the hostcall buffer passed in the implicit arguments.
return F.hasFnAttribute(Attribute::SanitizeAddress) ||
F.hasFnAttribute(Attribute::SanitizeThread) ||
F.hasFnAttribute(Attribute::SanitizeMemory) ||
F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
F.hasFnAttribute(Attribute::SanitizeMemTag);
}

class AMDGPUInformationCache : public InformationCache {
public:
AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
Expand All @@ -129,7 +129,7 @@ class AMDGPUInformationCache : public InformationCache {
}

private:
/// Check if the ConstantExpr \p CE requires queue ptr attribute.
/// Check if the ConstantExpr \p CE requires the queue pointer.
static bool visitConstExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
Expand Down Expand Up @@ -163,7 +163,7 @@ class AMDGPUInformationCache : public InformationCache {
}

public:
/// Returns true if \p Fn needs a queue ptr attribute because of \p C.
/// Returns true if \p Fn needs the queue pointer because of \p C.
bool needsQueuePtr(const Constant *C, Function &Fn) {
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
bool HasAperture = hasApertureRegs(Fn);
Expand All @@ -182,7 +182,7 @@ class AMDGPUInformationCache : public InformationCache {
}

private:
/// Used to determine if the Constant needs a queue ptr attribute.
/// Used to determine if the Constant needs the queue pointer.
DenseMap<const Constant *, uint8_t> ConstantStatus;
};

Expand Down Expand Up @@ -327,7 +327,20 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {

void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();

// If the function requires the implicit arg pointer due to sanitizers,
// assume it's needed even if explicitly marked as not requiring it.
const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
if (NeedsHostcall) {
removeAssumedBits(IMPLICIT_ARG_PTR);
removeAssumedBits(HOSTCALL_PTR);
}

for (auto Attr : ImplicitAttrs) {
if (NeedsHostcall &&
(Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
continue;

if (F->hasFnAttribute(Attr.second))
addKnownBits(Attr.first);
}
Expand Down Expand Up @@ -355,7 +368,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
return indicatePessimisticFixpoint();

bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

bool NeedsQueuePtr = false;

Expand All @@ -377,13 +389,58 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
}
}

// If we found that we need amdgpu-queue-ptr, nothing else to do.
if (!NeedsQueuePtr) {
NeedsQueuePtr = checkForQueuePtr(A);
}

if (NeedsQueuePtr) {
removeAssumedBits(QUEUE_PTR);
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
ChangeStatus::UNCHANGED;
}

if (funcRetrievesHostcallPtr(A)) {
removeAssumedBits(IMPLICIT_ARG_PTR);
removeAssumedBits(HOSTCALL_PTR);
}

return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
: ChangeStatus::UNCHANGED;
}

/// Turn every *known* implicit-argument bit into its corresponding
/// "amdgpu-no-*" string attribute on the associated function.
ChangeStatus manifest(Attributor &A) override {
  LLVMContext &Ctx = getAssociatedFunction()->getContext();

  // Collect only the attributes whose absence-bits are known.
  SmallVector<Attribute, 8> KnownAttrs;
  for (const auto &ImplicitAttr : ImplicitAttrs)
    if (isKnown(ImplicitAttr.first))
      KnownAttrs.push_back(Attribute::get(Ctx, ImplicitAttr.second));

  return IRAttributeManifest::manifestAttrs(A, getIRPosition(), KnownAttrs,
                                            /* ForceReplace */ true);
}

/// Debug string: "AMDInfo[" followed by the name of every implicit-argument
/// attribute this abstract attribute tracks.
const std::string getAsStr() const override {
  std::string Buffer;
  raw_string_ostream Stream(Buffer);
  Stream << "AMDInfo[";
  for (const auto &ImplicitAttr : ImplicitAttrs)
    Stream << ' ' << ImplicitAttr.second;
  Stream << " ]";
  return Stream.str();
}

/// See AbstractAttribute::trackStatistics()
// Intentionally empty: no statistics are collected for this attribute.
void trackStatistics() const override {}

private:
bool checkForQueuePtr(Attributor &A) {
Function *F = getAssociatedFunction();
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

bool NeedsQueuePtr = false;

auto CheckAddrSpaceCasts = [&](Instruction &I) {
unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
if (castRequiresQueuePtr(SrcAS)) {
Expand All @@ -398,69 +455,63 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
// `checkForAllInstructions` is much more cheaper than going through all
// instructions, try it first.

// amdgpu-queue-ptr is not needed if aperture regs is present.
// The queue pointer is not needed if aperture regs is present.
if (!HasApertureRegs) {
bool UsedAssumedInformation = false;
A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
{Instruction::AddrSpaceCast},
UsedAssumedInformation);
}

// If we found that we need amdgpu-queue-ptr, nothing else to do.
if (NeedsQueuePtr) {
removeAssumedBits(QUEUE_PTR);
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
ChangeStatus::UNCHANGED;
}
// If we found that we need the queue pointer, nothing else to do.
if (NeedsQueuePtr)
return true;

if (!IsNonEntryFunc && HasApertureRegs) {
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
ChangeStatus::UNCHANGED;
}
if (!IsNonEntryFunc && HasApertureRegs)
return false;

for (BasicBlock &BB : *F) {
for (Instruction &I : BB) {
for (const Use &U : I.operands()) {
if (const auto *C = dyn_cast<Constant>(U)) {
if (InfoCache.needsQueuePtr(C, *F)) {
removeAssumedBits(QUEUE_PTR);
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
ChangeStatus::UNCHANGED;
}
if (InfoCache.needsQueuePtr(C, *F))
return true;
}
}
}
}

return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
ChangeStatus::UNCHANGED;
return false;
}

ChangeStatus manifest(Attributor &A) override {
SmallVector<Attribute, 8> AttrList;
LLVMContext &Ctx = getAssociatedFunction()->getContext();

for (auto Attr : ImplicitAttrs) {
if (isKnown(Attr.first))
AttrList.push_back(Attribute::get(Ctx, Attr.second));
}
bool funcRetrievesHostcallPtr(Attributor &A) {
auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();

// Check if this is a call to the implicitarg_ptr builtin and it
// is used to retrieve the hostcall pointer. The implicit arg for
// hostcall is not used only if every use of the implicitarg_ptr
// is a load that clearly does not retrieve any byte of the
// hostcall pointer. We check this by tracing all the uses of the
// initial call to the implicitarg_ptr intrinsic.
auto DoesNotLeadToHostcallPtr = [&](Instruction &I) {
auto &Call = cast<CallBase>(I);
if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
return true;

const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
*this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

AAPointerInfo::OffsetAndSize OAS(Pos, 8);
return PointerInfoAA.forallInterferingAccesses(
OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
return Acc.getRemoteInst()->isDroppable();
});
};

return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
/* ForceReplace */ true);
bool UsedAssumedInformation = false;
return !A.checkForAllCallLikeInstructions(DoesNotLeadToHostcallPtr, *this,
UsedAssumedInformation);
}

const std::string getAsStr() const override {
std::string Str;
raw_string_ostream OS(Str);
OS << "AMDInfo[";
for (auto Attr : ImplicitAttrs)
OS << ' ' << Attr.second;
OS << " ]";
return OS.str();
}

/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
Expand Down Expand Up @@ -497,7 +548,8 @@ class AMDGPUAttributor : public ModulePass {
BumpPtrAllocator Allocator;
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AACallEdges::ID});
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AACallEdges::ID, &AAPointerInfo::ID});

Attributor A(Functions, InfoCache, CGUpdater, &Allowed);

Expand Down
Loading

0 comments on commit 030a405

Please sign in to comment.