Skip to content

Commit

Permalink
i#5036 A64 scatter/gather, part 10: Non-fault loads (#6756)
Browse files Browse the repository at this point in the history
Adds support for non-fault loads to drx_expand_scatter_gather().
Non-fault loads (ldnf1*) work similarly to scalar+immediate predicated
contiguous ld1* loads, but with different behaviour if an element access
faults. This commit implements this behaviour and extends the
scatter/gather tests to include ldnf1* instructions.

Issue: #5036
  • Loading branch information
jackgallagher-arm authored Apr 8, 2024
1 parent 525020b commit 59a2c38
Show file tree
Hide file tree
Showing 11 changed files with 865 additions and 117 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ Hello, world!
Basic counts tool results:
Total counts:
#ifdef __ARM_FEATURE_SVE2
724 total \(fetched\) instructions
270 total unique \(fetched\) instructions
772 total \(fetched\) instructions
286 total unique \(fetched\) instructions
#else
685 total \(fetched\) instructions
255 total unique \(fetched\) instructions
733 total \(fetched\) instructions
271 total unique \(fetched\) instructions
#endif
0 total non-fetched instructions
0 total prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 total data loads
1248 total data loads
873 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 total data loads
2234 total data loads
1615 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 total data loads
4206 total data loads
3099 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 total data loads
1227 total data loads
861 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 total data loads
2199 total data loads
1595 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 total data loads
4143 total data loads
3063 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand All @@ -41,34 +41,34 @@ Total counts:
.*
Thread .* counts:
#ifdef __ARM_FEATURE_SVE2
724 \(fetched\) instructions
270 unique \(fetched\) instructions
772 \(fetched\) instructions
286 unique \(fetched\) instructions
#else
685 \(fetched\) instructions
255 unique \(fetched\) instructions
733 \(fetched\) instructions
271 unique \(fetched\) instructions
#endif
0 non-fetched instructions
0 prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 data loads
1248 data loads
873 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 data loads
2234 data loads
1615 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 data loads
4206 data loads
3099 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 data loads
1227 data loads
861 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 data loads
2199 data loads
1595 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 data loads
4143 data loads
3063 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand Down
59 changes: 38 additions & 21 deletions clients/drcachesim/tests/allasm_scattergather_aarch64.asm
Original file line number Diff line number Diff line change
Expand Up @@ -296,23 +296,39 @@ test_scalar_plus_immediate:
ld1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16
ldnf1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ldnf1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16
ld1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ldnf1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ld1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ldnf1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ld1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
// Total: 104
// Total: 178

ld2b { DEST_REG1.b, DEST_REG2.b }, B_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 32
ld2h { DEST_REG1.h, DEST_REG2.h }, H_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 16
Expand All @@ -331,7 +347,8 @@ test_scalar_plus_immediate:
ld4w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s, DEST_REG4.s }, S_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 16
ld4d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d, DEST_REG4.d }, D_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 8
// Total: 120
// Total loads: 104 + 60 + 90 + 120 = 374

// Total loads: 178 + 60 + 90 + 120 = 448

st1b SRC_REG1.b, B_MASK_REG, [BUFFER_REG, #1, mul vl] // 16
st1b SRC_REG1.h, H_MASK_REG, [BUFFER_REG, #1, mul vl] // 8
Expand Down Expand Up @@ -440,7 +457,7 @@ _start:
bl test_scalar_plus_scalar // +(374 * vl_bytes/16) loads
// +(322 * vl_bytes/16) stores

bl test_scalar_plus_immediate // +(374 * vl_bytes/16) loads
bl test_scalar_plus_immediate // +(448 * vl_bytes/16) loads
// +(322 * vl_bytes/16) stores
bl test_replicating_loads // +60 loads
// +0 stores
Expand All @@ -450,11 +467,11 @@ _start:
#endif
// Running total:
// SVE only:
// Loads: (136 + 14 + 374 + 374) * vl_bytes/16 + 60 = 898 * vl_bytes/16 + 60
// Loads: (136 + 14 + 374 + 448) * vl_bytes/16 + 60 = 972 * vl_bytes/16 + 60
// Stores: (82 + 8 + 322 + 322) * vl_bytes/16 = 734 * vl_bytes/16

// Including SVE2:
// Loads: ((898 + 14) * vl_bytes/16) + 60 = (912 * vl_bytes/16) + 60
// Loads: ((972 + 14) * vl_bytes/16) + 60 = (986 * vl_bytes/16) + 60
// Stores: (734 + 8) * vl_bytes/16 = 742 * vl_bytes/16

/* Run all the instructions with no active elements */
Expand All @@ -475,11 +492,11 @@ _start:

// Running total (unchanged from above):
// SVE only:
// Loads: (898 * vl_bytes/16) + 60
// Loads: (972 * vl_bytes/16) + 60
// Stores: 734 * vl_bytes/16

// Including SVE2:
// Loads: (912 * vl_bytes/16) + 60
// Loads: (986 * vl_bytes/16) + 60
// Stores: 742 * vl_bytes/16

/* Run all instructions with one active element */
Expand All @@ -491,47 +508,47 @@ _start:
bl test_scalar_plus_vector // +52 loads, +31 stores
bl test_vector_plus_immediate // +7 loads, +4 stores
bl test_scalar_plus_scalar // +56 loads, +46 stores
bl test_scalar_plus_immediate // +56 loads, +46 stores
bl test_scalar_plus_immediate // +72 loads, +46 stores
bl test_replicating_loads // +8 loads, +0 stores
#ifdef __ARM_FEATURE_SVE2
bl test_vector_plus_scalar // +7 loads, +4 stores
#endif

// Running total:
// SVE only:
// Loads: (898 * vl_bytes/16) + 60 + 52 + 7 + 56 + 56 + 8 = (898 * vl_bytes/16) + 239
// Loads: (972 * vl_bytes/16) + 60 + 52 + 7 + 56 + 72 + 8 = (972 * vl_bytes/16) + 255
// Stores: (734 * vl_bytes/16) + 31 + 4 + 46 + 46 = (734 * vl_bytes/16) + 127

// Including SVE2:
// Loads: (912 * vl_bytes/16) + 239 + 7 = (912 * vl_bytes/16) + 246
// Loads: (986 * vl_bytes/16) + 255 + 7 = (986 * vl_bytes/16) + 262
// Stores: (742 * vl_bytes/16) + 127 + 4 = (742 * vl_bytes/16) + 131

// The functions in this file have the following instructions counts:
// _start 40 (+3 SVE2)
// test_scalar_plus_vector 84
// test_vector_plus_immediate 12
// test_scalar_plus_scalar 55
// test_scalar_plus_immediate 55
// test_scalar_plus_immediate 71
// test_replicating_loads 9
// test_vector_plus_scalar 12
// So there are 40 + 84 + 12 + 55 + 55 + 9 = 255 unique instructions
// (or 255 + 12 + 3 = 270 including SVE2)
// So there are 40 + 84 + 12 + 55 + 71 + 9 = 271 unique instructions
// (or 271 + 12 + 3 = 286 including SVE2)
// We run the test_* functions 3 times each so the total instruction executed is
// ((84 + 12 + 55 + 55 + 9) * 3) + 40 = (215 * 3) + 40 = 685
// (or 685 + 3 + (12 * 3) = 724 including SVE2)
// ((84 + 12 + 55 + 71 + 9) * 3) + 40 = (231 * 3) + 40 = 733
// (or 733 + 3 + (12 * 3) = 772 including SVE2)

// Totals:
// SVE only:
// Loads: (898 * vl_bytes/16) + 239
// Loads: (972 * vl_bytes/16) + 255
// Stores: (734 * vl_bytes/16) + 127
// Instructions: 685
// Unique instructions: 255
// Instructions: 733
// Unique instructions: 271

// Including SVE2:
// Loads: (912 * vl_bytes/16) + 246
// Loads: (986 * vl_bytes/16) + 262
// Stores: (742 * vl_bytes/16) + 131
// Instructions: 724
// Unique instructions: 270
// Instructions: 772
// Unique instructions: 286

// Exit.
mov w0, #1 // stdout
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,34 @@ Hello, world!
Basic counts tool results:
Total counts:
#ifdef __ARM_FEATURE_SVE2
724 total \(fetched\) instructions
270 total unique \(fetched\) instructions
772 total \(fetched\) instructions
286 total unique \(fetched\) instructions
#else
685 total \(fetched\) instructions
255 total unique \(fetched\) instructions
733 total \(fetched\) instructions
271 total unique \(fetched\) instructions
#endif
0 total non-fetched instructions
0 total prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 total data loads
1248 total data loads
873 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 total data loads
2234 total data loads
1615 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 total data loads
4206 total data loads
3099 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 total data loads
1227 total data loads
861 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 total data loads
2199 total data loads
1595 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 total data loads
4143 total data loads
3063 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand All @@ -40,35 +40,35 @@ Total counts:
.*
Thread .* counts:
#ifdef __ARM_FEATURE_SVE2
724 \(fetched\) instructions
270 unique \(fetched\) instructions
772 \(fetched\) instructions
286 unique \(fetched\) instructions
#else
685 \(fetched\) instructions
255 unique \(fetched\) instructions
733 \(fetched\) instructions
271 unique \(fetched\) instructions
#endif

0 non-fetched instructions
0 prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 data loads
1248 data loads
873 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 data loads
2234 data loads
1615 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 data loads
4206 data loads
3099 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 data loads
1227 data loads
861 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 data loads
2199 data loads
1595 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 data loads
4143 data loads
3063 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand Down
16 changes: 16 additions & 0 deletions clients/drcachesim/tests/scattergather-aarch64.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,22 @@ ld1rqw scalar\+immediate: PASS
ld1rqd scalar\+immediate: PASS
ld1rqd scalar\+immediate \(min index\): PASS
ld1rqd scalar\+immediate \(max index\): PASS
ldnf1b scalar\+immediate 8bit element: PASS
ldnf1b scalar\+immediate 16bit element: PASS
ldnf1b scalar\+immediate 32bit element: PASS
ldnf1b scalar\+immediate 64bit element: PASS
ldnf1sb scalar\+immediate 16bit element: PASS
ldnf1sb scalar\+immediate 32bit element: PASS
ldnf1sb scalar\+immediate 64bit element: PASS
ldnf1h scalar\+immediate 16bit element: PASS
ldnf1h scalar\+immediate 32bit element: PASS
ldnf1h scalar\+immediate 64bit element: PASS
ldnf1sh scalar\+immediate 32bit element: PASS
ldnf1sh scalar\+immediate 64bit element: PASS
ldnf1w scalar\+immediate 32bit element: PASS
ldnf1w scalar\+immediate 64bit element: PASS
ldnf1sw scalar\+immediate 64bit element: PASS
ldnf1d scalar\+immediate 64bit element: PASS
ld2b scalar\+immediate: PASS
ld2h scalar\+immediate: PASS
ld2w scalar\+immediate: PASS
Expand Down
Loading

0 comments on commit 59a2c38

Please sign in to comment.