Skip to content

Commit

Permalink
[main] Combine SME slice parameters
Browse files Browse the repository at this point in the history
Previously the (alpha) SME intrinsics were documented to take two
slice parameters: a 32-bit variable index and a 64-bit constant
offset.  However, it isn't very C-like to split an addition in
this way, and as Sander points out, it isn't really consistent
with the way that we handle vnum parameters.

The patch also removes a specific reference to w12-w15, since
some SME2 instructions use w8-w11.
  • Loading branch information
rsandifo-arm committed May 31, 2023
1 parent b05080f commit 8445ddd
Showing 1 changed file with 58 additions and 48 deletions.
106 changes: 58 additions & 48 deletions main/acle.md
Original file line number Diff line number Diff line change
Expand Up @@ -9043,9 +9043,18 @@ following it. --><span id="__arm_za_disable"></span>

The intrinsics in this section have the following properties in common:

* Every argument named `tile`, `slice_offset` or `tile_mask` must
be an integer constant expression in the range of the underlying
instruction.
* Every argument named `tile` or `tile_mask` must be an integer constant
expression in the range of the underlying instruction.

* Some SME instructions index ZA using the sum of a 32-bit general-purpose
register and a constant offset. Instead of having arguments for the
two individual fields, the associated intrinsics have a single
32-bit index called `slice` that holds the sum.

* However, load and store intrinsics that take both a `vnum` parameter
and a `slice` parameter add `vnum` to `slice`. This helps to ensure
that the load/store address and ZA index remain balanced, and
increases the chances that an immediate offset can be used.

* ZA loads and stores do not use typed pointers, since there is
no C or C++ type information associated with the contents of ZA.
Expand All @@ -9059,74 +9068,85 @@ The intrinsics in this section have the following properties in common:
``` c
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za))
void svld1_hor_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg, const void *ptr);
void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
const void *ptr);

// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
// address given by ptr.
//
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za))
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
const void *ptr, int64_t vnum);

// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za))
void svld1_ver_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg, const void *ptr);
void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
const void *ptr);

// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
// address given by ptr.
//
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za))
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
const void *ptr, int64_t vnum);
```

#### LDR

``` c
// slice_offset fills the role of the usual vnum parameter.
__attribute__((arm_streaming_compatible, arm_shared_za))
void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
const void *ptr);
void svldr_za(uint32_t slice, const void *ptr);

// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
// This can be done in a single instruction if vnum is a constant in the
// range [0, 15]. The intrinsic is synthetic for other values of vnum.
__attribute__((arm_streaming_compatible, arm_shared_za))
void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
```

#### ST1B, ST1H, ST1W, ST1D, ST1Q

``` c
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
void *ptr);

// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
// address given by ptr.
//
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
void *ptr, int64_t vnum);

// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
void *ptr);

// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
// address given by ptr.
//
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
void *ptr, int64_t vnum);
```

#### STR

``` c
// slice_offset fills the role of the usual vnum parameter.
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
void svstr_za(uint32_t slice, void *ptr);

// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
// This can be done in a single instruction if vnum is a constant in the
// range [0, 15]. The intrinsic is synthetic for other values of vnum.
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
```

#### MOVA
Expand All @@ -9140,32 +9160,27 @@ parameter both have type `svuint8_t`.
// And similarly for u8.
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
uint64_t tile, uint32_t slice_base,
uint64_t slice_offset);
uint64_t tile, uint32_t slice);

// And similarly for u16, bf16 and f16.
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
uint64_t tile, uint32_t slice_base,
uint64_t slice_offset);
uint64_t tile, uint32_t slice);

// And similarly for u32 and f32.
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
uint64_t tile, uint32_t slice_base,
uint64_t slice_offset);
uint64_t tile, uint32_t slice);

// And similarly for u64 and f64.
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
uint64_t tile, uint32_t slice_base,
uint64_t slice_offset);
uint64_t tile, uint32_t slice);

// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
uint64_t tile, uint32_t slice_base,
uint64_t slice_offset);
uint64_t tile, uint32_t slice);
```

Replacing `_hor` with `_ver` gives the associated vertical forms.
Expand All @@ -9177,32 +9192,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
``` c
// And similarly for u8.
__attribute__((arm_streaming, arm_shared_za))
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
svint8_t zn);

// And similarly for u16, bf16 and f16.
__attribute__((arm_streaming, arm_shared_za))
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
svint16_t zn);

// And similarly for u32 and f32.
__attribute__((arm_streaming, arm_shared_za))
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
svint32_t zn);

// And similarly for u64 and f64.
__attribute__((arm_streaming, arm_shared_za))
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
svint64_t zn);

// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
__attribute__((arm_streaming, arm_shared_za))
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
uint64_t slice_offset, svbool_t pg,
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
svint8_t zn);
```

Expand Down

0 comments on commit 8445ddd

Please sign in to comment.