Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM64-SVE: gathervector extends #103370

Merged
merged 7 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1601,19 +1601,36 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
: gtNewSimdHWIntrinsicNode(nodeRetType, op1, op2, op3, intrinsic, simdBaseJitType,
simdSize);

#ifdef TARGET_XARCH
if ((intrinsic == NI_AVX2_GatherVector128) || (intrinsic == NI_AVX2_GatherVector256))
switch (intrinsic)
{
assert(varTypeIsSIMD(op2->TypeGet()));
retNode->AsHWIntrinsic()->SetAuxiliaryJitType(getBaseJitTypeOfSIMDType(sigReader.op2ClsHnd));
}
#if defined(TARGET_XARCH)
case NI_AVX2_GatherVector128:
case NI_AVX2_GatherVector256:
assert(varTypeIsSIMD(op2->TypeGet()));
retNode->AsHWIntrinsic()->SetAuxiliaryJitType(getBaseJitTypeOfSIMDType(sigReader.op2ClsHnd));
break;

#elif defined(TARGET_ARM64)
if (intrinsic == NI_Sve_GatherVector)
{
assert(varTypeIsSIMD(op3->TypeGet()));
retNode->AsHWIntrinsic()->SetAuxiliaryJitType(getBaseJitTypeOfSIMDType(sigReader.op3ClsHnd));
}
case NI_Sve_GatherVector:
case NI_Sve_GatherVectorByteZeroExtend:
case NI_Sve_GatherVectorInt16SignExtend:
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorInt32SignExtend:
case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorSByteSignExtend:
case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
assert(varTypeIsSIMD(op3->TypeGet()));
retNode->AsHWIntrinsic()->SetAuxiliaryJitType(getBaseJitTypeOfSIMDType(sigReader.op3ClsHnd));
break;
#endif

default:
break;
}

break;
}

Expand Down
29 changes: 21 additions & 8 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1846,33 +1846,46 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}

case NI_Sve_GatherVector:
case NI_Sve_GatherVectorByteZeroExtend:
case NI_Sve_GatherVectorInt16SignExtend:
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorInt32SignExtend:
case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorSByteSignExtend:
case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
{
if (!varTypeIsSIMD(intrin.op2->gtType))
{
// GatherVector(Vector<T> mask, T* address, Vector<T2> indices)
// GatherVector...(Vector<T> mask, T* address, Vector<T2> indices)

assert(intrin.numOperands == 3);
emitAttr baseSize = emitActualTypeSize(intrin.baseType);

if (baseSize == EA_8BYTE)
{
// Index is multiplied by 8
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt,
INS_SCALABLE_OPTS_LSL_N);
// Index is multiplied.
insScalableOpts sopt = (ins == INS_sve_ld1b || ins == INS_sve_ld1sb) ? INS_SCALABLE_OPTS_NONE
: INS_SCALABLE_OPTS_LSL_N;
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt, sopt);
}
else
{
// Index is sign or zero extended to 64bits, then multiplied by 4
// Index is sign or zero extended to 64bits, then multiplied.
assert(baseSize == EA_4BYTE);
opt = varTypeIsUnsigned(node->GetAuxiliaryType()) ? INS_OPTS_SCALABLE_S_UXTW
: INS_OPTS_SCALABLE_S_SXTW;
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt,
INS_SCALABLE_OPTS_MOD_N);

insScalableOpts sopt = (ins == INS_sve_ld1b || ins == INS_sve_ld1sb) ? INS_SCALABLE_OPTS_NONE
: INS_SCALABLE_OPTS_MOD_N;
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt, sopt);
}
}
else
{
// GatherVector(Vector<T> mask, Vector<T2> addresses)
// GatherVector...(Vector<T> mask, Vector<T2> addresses)

assert(intrin.numOperands == 2);
GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt);
Expand Down
10 changes: 10 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@ HARDWARE_INTRINSIC(Sve, FusedMultiplySubtract,
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtractBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmls, INS_sve_fmls}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_FmaIntrinsic|HW_Flag_LowVectorOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtractNegated, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fnmls, INS_sve_fnmls}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, GatherVector, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorByteZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1b, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't realize earlier but all the load/gatherload need to be marked as HW_Category_MemoryLoad

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't realize earlier but all the load/gatherload need to be marked as HW_Category_MemoryLoad

That's going to require some debugging, as they don't fit a normal style of a load API (I get an assert failure).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hwintrinsic.cpp has a if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST)) check. This is only checked for memory loads with a single argument. All other types of memory loads are unchecked.

I moved this code so that it is checked for all HW_Category_MemoryLoad nodes. I reused OperIsMemoryLoad() but that means it needs doing after the ret node is created.

Inside OperIsMemoryLoad() I'm not sure what to do for the cases where the load is a vector of addresses. My thought was that we should not set paddr in these cases.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hwintrinsic.cpp has a if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST)) check. This is only checked for memory loads with a single argument. All other types of memory loads are unchecked.

can we leave it as is then and not touch it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we leave it as is then and not touch it?

My assumption was this was a bug. The check needs to be added for all loads where the address is not in arg1.

What happens if the address is a GT_CAST and it is not removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed the GT_CAST changes for now. I'll raise a bug so we can double check it later. PR is all still working.

HARDWARE_INTRINSIC(Sve, GatherVectorInt16SignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorInt16WithByteOffsetsSignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorInt32SignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sw, INS_sve_ld1sw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorInt32WithByteOffsetsSignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sw, INS_sve_ld1sw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorSByteSignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sb, INS_sve_ld1sb, INS_sve_ld1sb, INS_sve_ld1sb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorUInt16WithByteOffsetsZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorUInt16ZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorUInt32WithByteOffsetsZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorUInt32ZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GetActiveElementCount, -1, 2, true, {INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_ExplicitMaskedOperation)
HARDWARE_INTRINSIC(Sve, LeadingSignCount, -1, -1, false, {INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LeadingZeroCount, -1, -1, false, {INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
Expand Down
Loading
Loading