From 8445ddddeb502e104839dc67f92b1a7fca571139 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 10 Nov 2022 11:54:34 +0000 Subject: [PATCH] [main] Combine SME slice parameters Previously the (alpha) SME intrinsics were documented to take two slice parameters: a 32-bit variable index and a 64-bit constant offset. However, it isn't very C-like to split an addition in this way, and as Sander points out, it isn't really consistent with the way that we handle vnum parameters. The patch also removes a specific reference to w12-w15, since some SME2 instructions use w8-w11. --- main/acle.md | 106 ++++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/main/acle.md b/main/acle.md index ed900f0b..841a1687 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9043,9 +9043,18 @@ following it. --> The intrinsics in this section have the following properties in common: -* Every argument named `tile`, `slice_offset` or `tile_mask` must - be an integer constant expression in the range of the underlying - instruction. +* Every argument named `tile` or `tile_mask` must be an integer constant + expression in the range of the underlying instruction. + +* Some SME instructions index ZA using the sum of a 32-bit general-purpose + register and a constant offset. Instead of having arguments for the + two individual fields, the associated intrinsics have a single + 32-bit index called `slice` that holds the sum. + +* However, load and store intrinsics that take both a `vnum` parameter + and a `slice` parameter add `vnum` to `slice`. This helps to ensure + that the load/store address and ZA index remain balanced, and + increases the chances that an immediate offset can be used. * ZA loads and stores do not use typed pointers, since there is no C or C++ type information associated with the contents of ZA. 
@@ -9059,36 +9068,42 @@ The intrinsics in this section have the following properties in common: ``` c // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za)) - void svld1_hor_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, const void *ptr); + void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg, + const void *ptr); - // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr. + // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the + // address given by ptr. + // // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za)) - void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg, const void *ptr, int64_t vnum); // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za)) - void svld1_ver_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, const void *ptr); + void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg, + const void *ptr); - // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr. + // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the + // address given by ptr. + // // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za)) - void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg, const void *ptr, int64_t vnum); ``` #### LDR ``` c - // slice_offset fills the role of the usual vnum parameter. 
__attribute__((arm_streaming_compatible, arm_shared_za)) - void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset, - const void *ptr); + void svldr_za(uint32_t slice, const void *ptr); + + // Adds vnum to slice and vnum * svcntsb() to the address given by ptr. + // This can be done in a single instruction if vnum is a constant in the + // range [0, 15]. The intrinsic is synthetic for all other values of vnum. + __attribute__((arm_streaming_compatible, arm_shared_za)) + void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum); ``` #### ST1B, ST1H, ST1W, ST1D, ST1Q @@ -9096,37 +9111,42 @@ The intrinsics in this section have the following properties in common: ``` c // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - void svst1_hor_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg, void *ptr); - // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr. + // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the + // address given by ptr. + // // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg, void *ptr, int64_t vnum); // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - void svst1_ver_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg, void *ptr); - // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr. + // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the + // address given by ptr. 
+ // // Also for _za16, _za32, _za64 and _za128 (with the same prototype). __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg, void *ptr, int64_t vnum); ``` #### STR ``` c - // slice_offset fills the role of the usual vnum parameter. __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr); + void svstr_za(uint32_t slice, void *ptr); + + // Adds vnum to slice and vnum * svcntsb() to the address given by ptr. + // This can be done in a single instruction if vnum is a constant in the + // range [0, 15]. The intrinsic is synthetic for all other values of vnum. + __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) + void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum); ``` #### MOVA @@ -9140,32 +9160,27 @@ parameter both have type `svuint8_t`. // And similarly for u8. __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg, - uint64_t tile, uint32_t slice_base, - uint64_t slice_offset); + uint64_t tile, uint32_t slice); // And similarly for u16, bf16 and f16. __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg, - uint64_t tile, uint32_t slice_base, - uint64_t slice_offset); + uint64_t tile, uint32_t slice); // And similarly for u32 and f32. __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg, - uint64_t tile, uint32_t slice_base, - uint64_t slice_offset); + uint64_t tile, uint32_t slice); // And similarly for u64 and f64. 
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg, - uint64_t tile, uint32_t slice_base, - uint64_t slice_offset); + uint64_t tile, uint32_t slice); // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg, - uint64_t tile, uint32_t slice_base, - uint64_t slice_offset); + uint64_t tile, uint32_t slice); ``` Replacing `_hor` with `_ver` gives the associated vertical forms. @@ -9177,32 +9192,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`. ``` c // And similarly for u8. __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg, svint8_t zn); // And similarly for u16, bf16 and f16. __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg, svint16_t zn); // And similarly for u32 and f32. __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg, svint32_t zn); // And similarly for u64 and f64. 
__attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg, svint64_t zn); // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base, - uint64_t slice_offset, svbool_t pg, + void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg, svint8_t zn); ```