From 59a2c38212f69c73004e3fdfddec7660cbc6eeaa Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Mon, 8 Apr 2024 13:35:02 +0100 Subject: [PATCH] i#5036 A64 scatter/gather, part 10: Non-fault loads (#6756) Adds support for non-fault loads to drx_expand_scatter_gather(). Non-fault loads (ldnf1*) work similarly to scalar+immediate predicated contiguous ld1* loads, but with different behaviour if an element access faults. This commit implements this behaviour and extends the scatter/gather tests to include ldnf1* instructions. Issue: #5036 --- ...attergather-basic-counts-aarch64.templatex | 40 +- .../tests/allasm_scattergather_aarch64.asm | 59 +- ...attergather-basic-counts-aarch64.templatex | 40 +- .../tests/scattergather-aarch64.templatex | 16 + ext/drx/scatter_gather_aarch64.c | 103 ++- ext/drx/scatter_gather_shared.c | 60 +- ext/drx/scatter_gather_shared.h | 5 + .../drx-scattergather-aarch64.cpp | 623 +++++++++++++++++- .../drx-scattergather-aarch64.templatex | 20 +- .../drx-scattergather-bbdup.dll.c | 8 - .../client-interface/drx-scattergather.dll.c | 8 - 11 files changed, 865 insertions(+), 117 deletions(-) diff --git a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex index 25e6daaa7fa..ae6ade9817c 100644 --- a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex +++ b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex @@ -3,34 +3,34 @@ Hello, world! Basic counts tool results: Total counts: #ifdef __ARM_FEATURE_SVE2 - 724 total \(fetched\) instructions - 270 total unique \(fetched\) instructions + 772 total \(fetched\) instructions + 286 total unique \(fetched\) instructions #else - 685 total \(fetched\) instructions - 255 total unique \(fetched\) instructions + 733 total \(fetched\) instructions + 271 total unique \(fetched\) instructions #endif 0 total non-fetched instructions 0 total prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1158 total data loads + 1248 total data loads 873 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2070 total data loads + 2234 total data loads 1615 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3894 total data loads + 4206 total data loads 3099 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1137 total data loads + 1227 total data loads 861 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2035 total data loads + 2199 total data loads 1595 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3831 total data loads + 4143 total data loads 3063 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ @@ -41,34 +41,34 @@ Total counts: .* Thread .* counts: #ifdef __ARM_FEATURE_SVE2 - 724 \(fetched\) instructions - 270 unique \(fetched\) instructions + 772 \(fetched\) instructions + 286 unique \(fetched\) instructions #else - 685 \(fetched\) instructions - 255 unique \(fetched\) instructions + 733 \(fetched\) instructions + 271 unique \(fetched\) instructions #endif 0 non-fetched instructions 0 prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1158 data loads + 1248 data loads 873 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2070 data loads + 2234 data loads 1615 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3894 data loads + 4206 data loads 3099 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 
128) - 1137 data loads + 1227 data loads 861 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2035 data loads + 2199 data loads 1595 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3831 data loads + 4143 data loads 3063 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ diff --git a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm index 8e08ceee19b..08462f8799c 100644 --- a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm +++ b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm @@ -296,23 +296,39 @@ test_scalar_plus_immediate: ld1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 ld1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 ld1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16 + ldnf1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 + ldnf1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 + ldnf1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ldnt1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16 ld1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 ld1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 ld1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 + ldnf1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 + ldnf1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ld1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 ld1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 ld1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 + ldnf1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 + ldnf1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ldnt1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8 ld1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 ld1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 + ldnf1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ld1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 ld1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 + ldnf1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ldnt1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4 ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 + ldnf1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 ldnt1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2 - // Total: 104 + // Total: 178 ld2b { DEST_REG1.b, DEST_REG2.b }, B_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 32 ld2h { DEST_REG1.h, DEST_REG2.h }, H_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 16 @@ -331,7 +347,8 @@ test_scalar_plus_immediate: ld4w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s, DEST_REG4.s }, S_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 16 ld4d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d, DEST_REG4.d }, D_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 8 // Total: 120 - // Total loads: 104 + 60 + 90 + 120 = 374 + + // Total loads: 178 + 60 + 90 + 120 = 448 st1b SRC_REG1.b, B_MASK_REG, [BUFFER_REG, #1, mul vl] // 16 st1b SRC_REG1.h, H_MASK_REG, 
[BUFFER_REG, #1, mul vl] // 8 @@ -440,7 +457,7 @@ _start: bl test_scalar_plus_scalar // +(374 * vl_bytes/16) loads // +(322 * vl_bytes/16) stores - bl test_scalar_plus_immediate // +(374 * vl_bytes/16) loads + bl test_scalar_plus_immediate // +(448 * vl_bytes/16) loads // +(322 * vl_bytes/16) stores bl test_replicating_loads // +60 loads // +0 stores @@ -450,11 +467,11 @@ _start: #endif // Running total: // SVE only: - // Loads: (136 + 14 + 374 + 374) * vl_bytes/16 + 60 = 898 * vl_bytes/16 + 60 + // Loads: (136 + 14 + 374 + 448) * vl_bytes/16 + 60 = 972 * vl_bytes/16 + 60 // Stores: (82 + 8 + 322 + 322) * vl_bytes/16 = 734 * vl_bytes/16 // Including SVE2: - // Loads: ((898 + 14) * vl_bytes/16) + 60 = (912 * vl_bytes/16) + 60 + // Loads: ((972 + 14) * vl_bytes/16) + 60 = (986 * vl_bytes/16) + 60 // Stores: (734 + 8) * vl_bytes/16 = 742 * vl_bytes/16 /* Run all the instructions with no active elements */ @@ -475,11 +492,11 @@ _start: // Running total (unchanged from above): // SVE only: - // Loads: (898 * vl_bytes/16) + 60 + // Loads: (972 * vl_bytes/16) + 60 // Stores: 734 * vl_bytes/16 // Including SVE2: - // Loads: (912 * vl_bytes/16) + 60 + // Loads: (986 * vl_bytes/16) + 60 // Stores: 742 * vl_bytes/16 /* Run all instructions with one active element */ @@ -491,7 +508,7 @@ _start: bl test_scalar_plus_vector // +52 loads, +31 stores bl test_vector_plus_immediate // +7 loads, +4 stores bl test_scalar_plus_scalar // +56 loads, +46 stores - bl test_scalar_plus_immediate // +56 loads, +46 stores + bl test_scalar_plus_immediate // +72 loads, +46 stores bl test_replicating_loads // +8 loads, +0 stores #ifdef __ARM_FEATURE_SVE2 bl test_vector_plus_scalar // +7 loads, +4 stores @@ -499,11 +516,11 @@ _start: // Running total: // SVE only: - // Loads: (898 * vl_bytes/16) + 60 + 52 + 7 + 56 + 56 + 8 = (898 * vl_bytes/16) + 239 + // Loads: (972 * vl_bytes/16) + 60 + 52 + 7 + 56 + 72 + 8 = (972 * vl_bytes/16) + 255 // Stores: (734 * vl_bytes/16) + 41 + 4 + 46 + 46 = (734 * vl_bytes/16) + 127 // Including SVE2: - // Loads: (912 * vl_bytes/16) + 239 + 7 = (912 * vl_bytes/16) + 246 + // Loads: (986 * vl_bytes/16) + 255 + 7 = (986 * vl_bytes/16) + 262 // Stores: (742 * vl_bytes/16) + 127 + 4 = (742 * vl_bytes/16) + 131 // The functions in this file have the following instructions counts: @@ -511,27 +528,27 @@ _start: // test_scalar_plus_vector 84 // test_vector_plus_immediate 12 // test_scalar_plus_scalar 55 - // test_scalar_plus_immediate 55 + // test_scalar_plus_immediate 71 // test_replicating_loads 9 // test_vector_plus_scalar 12 - // So there are 40 + 84 + 12 + 55 + 55 + 9 = 255 unique instructions - // (or 255 + 12 + 3 = 270 including SVE2) + // So there are 40 + 84 + 12 + 55 + 71 + 9 = 271 unique instructions + // (or 271 + 12 + 3 = 286 including SVE2) // We run the test_* functions 3 times each so the total instruction executed is - // ((84 + 12 + 55 + 55 + 9) * 3) + 40 = (215 * 3) + 37 = 685 - // (or 685 + 3 + (12 * 3) = 724 including SVE2) + // ((84 + 12 + 55 + 71 + 9) * 3) + 40 = (231 * 3) + 37 = 733 + // (or 733 + 3 + (12 * 3) = 772 including SVE2) // Totals: // SVE only: - // Loads: (898 * vl_bytes/16) + 239 + // Loads: (972 * vl_bytes/16) + 255 // Stores: (734 * vl_bytes/16) + 127 - // Instructions: 685 - // Unique instructions: 255 + // Instructions: 733 + // Unique instructions: 271 // Including SVE2: - // Loads: (912 * vl_bytes/16) + 246 + // Loads: (986 * vl_bytes/16) + 262 // Stores: (742 * vl_bytes/16) + 131 - // Instructions: 724 - // Unique instructions: 270 + // Instructions: 772 + 
// Unique instructions: 286 // Exit. mov w0, #1 // stdout diff --git a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex index d16476d530a..180aedd55d5 100644 --- a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex +++ b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex @@ -2,34 +2,34 @@ Hello, world! Basic counts tool results: Total counts: #ifdef __ARM_FEATURE_SVE2 - 724 total \(fetched\) instructions - 270 total unique \(fetched\) instructions + 772 total \(fetched\) instructions + 286 total unique \(fetched\) instructions #else - 685 total \(fetched\) instructions - 255 total unique \(fetched\) instructions + 733 total \(fetched\) instructions + 271 total unique \(fetched\) instructions #endif 0 total non-fetched instructions 0 total prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1158 total data loads + 1248 total data loads 873 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2070 total data loads + 2234 total data loads 1615 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3894 total data loads + 4206 total data loads 3099 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1137 total data loads + 1227 total data loads 861 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2035 total data loads + 2199 total data loads 1595 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3831 total data loads + 4143 total data loads 3063 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ @@ -40,35 +40,35 @@ Total counts: .* Thread .* counts: #ifdef __ARM_FEATURE_SVE2 - 724 \(fetched\) instructions - 270 unique \(fetched\) instructions + 772 \(fetched\) instructions + 286 unique \(fetched\) instructions #else - 685 \(fetched\) instructions - 255 unique \(fetched\) instructions + 733 \(fetched\) instructions + 271 unique \(fetched\) instructions #endif 0 non-fetched instructions 0 prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1158 data loads + 1248 data loads 873 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2070 data loads + 2234 data loads 1615 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3894 data loads + 4206 data loads 3099 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1137 data loads + 1227 data loads 861 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2035 data loads + 2199 data loads 1595 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3831 data loads + 4143 data loads 3063 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ diff --git a/clients/drcachesim/tests/scattergather-aarch64.templatex b/clients/drcachesim/tests/scattergather-aarch64.templatex index 1f735049665..021609331f0 100644 --- a/clients/drcachesim/tests/scattergather-aarch64.templatex +++ b/clients/drcachesim/tests/scattergather-aarch64.templatex @@ -222,6 +222,22 @@ ld1rqw scalar\+immediate: PASS ld1rqd scalar\+immediate: PASS ld1rqd scalar\+immediate \(min index\): PASS ld1rqd scalar\+immediate \(max index\): PASS +ldnf1b scalar\+immediate 8bit element: PASS +ldnf1b scalar\+immediate 16bit element: PASS +ldnf1b scalar\+immediate 32bit element: PASS +ldnf1b scalar\+immediate 64bit element: PASS +ldnf1sb scalar\+immediate 16bit element: PASS +ldnf1sb scalar\+immediate 32bit 
element: PASS +ldnf1sb scalar\+immediate 64bit element: PASS +ldnf1h scalar\+immediate 16bit element: PASS +ldnf1h scalar\+immediate 32bit element: PASS +ldnf1h scalar\+immediate 64bit element: PASS +ldnf1sh scalar\+immediate 32bit element: PASS +ldnf1sh scalar\+immediate 64bit element: PASS +ldnf1w scalar\+immediate 32bit element: PASS +ldnf1w scalar\+immediate 64bit element: PASS +ldnf1sw scalar\+immediate 64bit element: PASS +ldnf1d scalar\+immediate 64bit element: PASS ld2b scalar\+immediate: PASS ld2h scalar\+immediate: PASS ld2w scalar\+immediate: PASS diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index f2556c0a61d..18c698767a0 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -42,6 +42,7 @@ #include "../ext_utils.h" #include "scatter_gather_shared.h" +#include #include #include /* for offsetof */ @@ -1427,8 +1428,8 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e * we didn't expand any instructions. This matches the behaviour of this function * for architectures with no scatter/gather expansion support. */ - if (sg_info.faulting_behavior != DRX_NORMAL_FAULTING) { - /* TODO i#5036: Add support for first-fault and non-fault accesses. */ + if (sg_info.faulting_behavior == DRX_FIRST_FAULTING) { + /* TODO i#5036: Add support for first-fault loads. */ return true; } @@ -1622,6 +1623,104 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e return res; } +dr_signal_action_t +drx_scatter_gather_signal_event(void *drcontext, dr_siginfo_t *info, instr_t *sg_inst) +{ + scatter_gather_info_t sg_info; + get_scatter_gather_info(sg_inst, &sg_info); + + if ((info->sig == SIGSEGV || info->sig == SIGBUS) && + sg_info.faulting_behavior == DRX_NON_FAULTING) { + /* The only SVE instructions which have non-faulting behaviour are + * predicated contiguous scalar+immediate loads (ldnf1[bhwd]). + * TODO i#5036: instr_compute_address() does not support vector addressing modes + * (scalar+vector, vector+immediate) which is fine for non-faulting + * loads, but when we add support for first-fault instructions we + * will need to switch this to use instr_compute_address_ex(). + */ + DR_ASSERT(!reg_is_z(sg_info.base_reg)); + DR_ASSERT(!reg_is_z(sg_info.index_reg)); + const app_pc load_min_addr = instr_compute_address(sg_inst, info->mcontext); + const app_pc load_max_addr = + load_min_addr + opnd_size_in_bytes(sg_info.scatter_gather_size); + if (info->access_address < load_min_addr || + info->access_address >= load_max_addr) { + /* The faulting address is out of range for the expanded ldnf instruction so + * the fault must have come from an instruction inserted by a client, rather + * than one of the expansion loads inserted by expand_scatter_gather() so we + * pass the fault on for the client to handle. + */ + return DR_SIGNAL_DELIVER; + } + /* Non-faulting loads do not generate a fault when one of the addresses it + * accesses faults. Instead it sets the value of the FFR to indicate which + * element faulted and execution continues. We implement that behaviour here + * by setting the FFR and redirecting to the next app instruction. + */ + + /* Skip to the next app instruction */ + info->mcontext->pc += instr_length(drcontext, sg_inst); + + /* allocate_zp_registers() is deterministic so we can call it again here and find + * out which registers are used in the expansion. 
+ */ + spill_slot_state_t spill_slot_state; + init_spill_slot_state(&spill_slot_state); + + scratch_regs_t scratch_regs; + + allocate_zp_registers(sg_inst, &sg_info, &spill_slot_state, &scratch_regs); + + /* Set the FFR value + * + * The FFR is like a special purpose predicate register. When an element access + * faults, the corresponding bit in the FFR is set to 0 and all the higher bits + * are zeroed too. All bits lower than the faulting element are preserved. + * We can find out which element faulted by looking at the value of the register + * we use as a loop variable in the expansion code. It will contain a mask where + * a single bit is set which corresponds to the current loop iteration (the + * faulting element). + * Essentially we do: + * + * ffr = ffr & (loop_var - 1) + * + * but ffr is a dr_simd_t so we have to do it in 32-bit chunks. + */ + + const size_t loop_p_reg = scratch_regs.pred - DR_REG_P0; + + bool found_fault = false; + for (size_t i = 0; + i < sizeof(info->mcontext->ffr.u32) / sizeof(info->mcontext->ffr.u32[0]); + i++) { + if (found_fault) { + /* We have passed the element that faulted, all further bits are set to 0 + */ + info->mcontext->ffr.u32[i] = 0; + } else { + const uint loop_var = info->raw_mcontext->svep[loop_p_reg].u32[i]; + if (loop_var != 0) { + /* This chunk contains the bit for the faulting element so we need to + * mask this chunk. All bits before the faulting element are + * unchanged and bits after it are set to 0. + */ + info->mcontext->ffr.u32[i] &= loop_var - 1; + found_fault = true; + } else { + /* We haven't passed the faulting element yet so this chunk is + * unchanged. + */ + } + } + } + /* Suppress the signal and continue from the PC we set above (the next app + * instruction). */ + return DR_SIGNAL_REDIRECT; + } + + return DR_SIGNAL_DELIVER; +} + bool drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, instr_t *sg_inst) diff --git a/ext/drx/scatter_gather_shared.c b/ext/drx/scatter_gather_shared.c index 0cf6b81bc06..af305cc0413 100644 --- a/ext/drx/scatter_gather_shared.c +++ b/ext/drx/scatter_gather_shared.c @@ -108,34 +108,58 @@ drx_mark_scatter_gather_expanded(void) } static bool -drx_event_restore_state(void *drcontext, bool restore_memory, - dr_restore_state_info_t *info) +is_scatter_gather_fault(void *drcontext, dr_fault_fragment_info_t *fragment_info, + DR_PARAM_OUT instr_t *inst) { - instr_t inst; - bool success = true; - if (info->fragment_info.cache_start_pc == NULL) - return true; /* fault not in cache */ + if (fragment_info->cache_start_pc == NULL) + return false; /* fault not in cache */ if (dr_atomic_load32(&drx_scatter_gather_expanded) == 0) { - /* Nothing to do if nobody had never called expand_scatter_gather() before. */ - return true; + /* Nothing to do if nobody has ever called expand_scatter_gather() before. */ + return false; } - if (!info->fragment_info.app_code_consistent) { + if (!fragment_info->app_code_consistent) { /* Can't verify application code. * XXX i#2985: is it better to keep searching? 
*/ - return true; + return false; } + byte *pc = decode(drcontext, dr_fragment_app_pc(fragment_info->tag), inst); + return pc != NULL && (instr_is_gather(inst) || instr_is_scatter(inst)); +} + +static bool +drx_event_restore_state(void *drcontext, bool restore_memory, + dr_restore_state_info_t *info) +{ + instr_t inst; instr_init(drcontext, &inst); - byte *pc = decode(drcontext, dr_fragment_app_pc(info->fragment_info.tag), &inst); - if (pc != NULL) { - if (instr_is_gather(&inst) || instr_is_scatter(&inst)) { - success = success && drx_scatter_gather_restore_state(drcontext, info, &inst); - } - } + + const bool success = + !is_scatter_gather_fault(drcontext, &info->fragment_info, &inst) || + drx_scatter_gather_restore_state(drcontext, info, &inst); + instr_free(drcontext, &inst); return success; } +#if defined(AARCH64) +static dr_signal_action_t +drx_event_signal(void *drcontext, dr_siginfo_t *info) +{ + instr_t inst; + instr_init(drcontext, &inst); + + dr_signal_action_t action = DR_SIGNAL_DELIVER; + + if (is_scatter_gather_fault(drcontext, &info->fault_fragment_info, &inst)) { + action = drx_scatter_gather_signal_event(drcontext, info, &inst); + } + + instr_free(drcontext, &inst); + return action; +} +#endif + /* Reserved note range values */ enum { SG_NOTE_EXPANDED_LD_ST, @@ -161,7 +185,8 @@ drx_scatter_gather_init() return false; if (!drmgr_register_thread_init_event(drx_scatter_gather_thread_init) || - !drmgr_register_thread_exit_event(drx_scatter_gather_thread_exit)) + !drmgr_register_thread_exit_event(drx_scatter_gather_thread_exit) + IF_AARCH64(|| !drmgr_register_signal_event(drx_event_signal))) return false; note_base = drmgr_reserve_note_range(SG_NOTE_COUNT); @@ -176,6 +201,7 @@ void drx_scatter_gather_exit() { drmgr_unregister_tls_field(drx_scatter_gather_tls_idx); + IF_AARCH64(drmgr_unregister_signal_event(drx_event_signal);) } bool diff --git a/ext/drx/scatter_gather_shared.h b/ext/drx/scatter_gather_shared.h index 5f49ed71d10..07014707e46 100644 --- a/ext/drx/scatter_gather_shared.h +++ b/ext/drx/scatter_gather_shared.h @@ -111,6 +111,11 @@ bool drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, instr_t *sg_inst); +#if defined(AARCH64) +dr_signal_action_t +drx_scatter_gather_signal_event(void *drcontext, dr_siginfo_t *info, instr_t *sg_inst); +#endif + /* Check if an instruction has been marked as a load or store that is part of a * scatter/gather instruction expansion. 
*/ diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.cpp b/suite/tests/client-interface/drx-scattergather-aarch64.cpp index 6ff0f25eaec..828c70d5fe2 100644 --- a/suite/tests/client-interface/drx-scattergather-aarch64.cpp +++ b/suite/tests/client-interface/drx-scattergather-aarch64.cpp @@ -612,7 +612,7 @@ template struct test_case_base_t { virtual void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) = 0; + bool expected_fault, size_t faulting_element) = 0; virtual size_t num_values_accessed() const @@ -620,6 +620,18 @@ template struct test_case_base_t { return get_vl_bytes() / static_cast(element_size_); } + virtual void + check_fault(bool expected_fault, bool signal_handler_called) + { + if (!expected_fault && signal_handler_called) { + test_failed(); + print("Unexpected fault\n"); + } else if (expected_fault && !signal_handler_called) { + test_failed(); + print("Expected fault but signal handler not called\n"); + } + } + test_result_t run_test_case() { @@ -670,13 +682,7 @@ template struct test_case_base_t { } } - if (!expected_fault && signal_handler_called) { - test_failed(); - print("Unexpected fault\n"); - } else if (expected_fault && !signal_handler_called) { - test_failed(); - print("Expected fault but signal handler not called\n"); - } + check_fault(expected_fault, signal_handler_called); // Validate the output if: // - This is not a fault test (check the expanded instruction behaved @@ -685,7 +691,7 @@ template struct test_case_base_t { // (Check the scratch register state was correctly restored and none of // the registers are corrupted). if (!force_fault || expected_fault) { - check_output(pred, register_data, expected_fault); + check_output(pred, register_data, expected_fault, faulting_element); } }; @@ -1015,7 +1021,7 @@ struct scalar_plus_vector_load_test_case_t : public scalar_plus_vector_test_case void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { const auto vl_bytes = get_vl_bytes(); @@ -1766,7 +1772,7 @@ struct scalar_plus_vector_store_test_case_t : public scalar_plus_vector_test_cas void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { // Check that the values of the other Z registers have been preserved. for (size_t i = 0; i < NUM_Z_REGS; i++) { @@ -2227,7 +2233,7 @@ struct vector_plus_immediate_load_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { const auto vl_bytes = get_vl_bytes(); @@ -2546,7 +2552,7 @@ struct vector_plus_immediate_store_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { // Check that the values of the Z registers have been preserved. 
for (size_t i = 0; i < NUM_Z_REGS; i++) { @@ -2801,7 +2807,7 @@ struct scalar_plus_scalar_load_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { if (!expected_fault) { for (size_t i = 0; i < NUM_ZT; i++) { @@ -3670,7 +3676,7 @@ struct scalar_plus_scalar_store_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { // Check that the values of the Z registers have been preserved. for (size_t i = 0; i < NUM_Z_REGS; i++) { @@ -4146,7 +4152,7 @@ struct scalar_plus_immediate_load_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { if (!expected_fault) { for (size_t zt = 0; zt < NUM_ZT; zt++) { @@ -5636,6 +5642,583 @@ test_ld4_scalar_plus_immediate() # undef TEST_FUNC } +struct scalar_plus_immediate_non_fault_load_test_case_t + : public scalar_plus_immediate_load_test_case_t<1> { + + std::vector reference_data_fault_; + + template + scalar_plus_immediate_non_fault_load_test_case_t( + std::string name, test_func_t func, registers_used_t registers_used, + std::array, 1> + reference_data_128_no_fault, + std::array, 1> + reference_data_256_no_fault, + std::array, 1> + reference_data_512_no_fault, + std::array reference_data_128_fault, + std::array reference_data_256_fault, + std::array reference_data_512_fault, + + element_size_t data_size, std::ptrdiff_t offset) + : scalar_plus_immediate_load_test_case_t<1>( + name, func, registers_used, reference_data_128_no_fault, + reference_data_256_no_fault, reference_data_512_no_fault, data_size, offset) + { + const auto vl_bytes = get_vl_bytes(); + reference_data_fault_.resize(vl_bytes); + switch (vl_bytes) { + case 16: + assert(reference_data_128_fault.size() * sizeof(ELEMENT_T) == vl_bytes); + memcpy(reference_data_fault_.data(), reference_data_128_fault.data(), + vl_bytes); + break; + case 32: + assert(reference_data_256_fault.size() * sizeof(ELEMENT_T) == vl_bytes); + memcpy(reference_data_fault_.data(), reference_data_256_fault.data(), + vl_bytes); + break; + case 64: + assert(reference_data_512_fault.size() * sizeof(ELEMENT_T) == vl_bytes); + memcpy(reference_data_fault_.data(), reference_data_512_fault.data(), + vl_bytes); + break; + default: print("Unsupported vector length: %lu\n", vl_bytes); exit(1); + } + } + + void + check_fault(bool expected_fault, bool signal_handler_called) override + { + // Non-fault instructions should never trigger the signal handler. + if (signal_handler_called) { + test_failed(); + print("Unexpected fault\n"); + } + } + + void + check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, + bool expected_fault, size_t faulting_element) override + { + if (!expected_fault) { + // If there is no faulting element then this instruction behaves the same as + // a regular scalar+immediate load. + scalar_plus_immediate_load_test_case_t<1>::check_output( + pred, register_data, expected_fault, faulting_element); + return; + } + + // Check the FFR value + // First we need to find out which element is the first one to actually fault. 
+ // `faulting_element` is the first element that is rigged to fall on a faulting + // address, but if that element is inactive, the first element to actually fault + // will be the next active element. + const auto element_size_bytes = static_cast(element_size_); + const auto num_mask_elements = TEST_VL_BYTES / element_size_bytes; + while ( + !element_is_active(faulting_element % num_mask_elements, pred, element_size_)) + faulting_element++; + + const auto original_ffr = register_data.before.get_ffr_value(); + predicate_reg_value128_t ffr_128 = 0; + memcpy(&ffr_128, original_ffr.data, sizeof(ffr_128)); + // All bits from the faulting element onwards are 0 so mask them out. + ffr_128 &= + (1 << ((faulting_element % num_mask_elements) * element_size_bytes)) - 1; + + std::vector expected_ffr_data(original_ffr.size, 0); + memcpy(expected_ffr_data.data(), original_ffr.data, + 2 * ((faulting_element * element_size_bytes) / 16)); + memcpy(&expected_ffr_data[2 * ((faulting_element * element_size_bytes) / 16)], + &ffr_128, sizeof(ffr_128)); + const scalable_reg_value_t expected_ffr { + expected_ffr_data.data(), + expected_ffr_data.size(), + }; + + const auto actual_ffr = register_data.after.get_ffr_value(); + + if (actual_ffr != expected_ffr) { + test_failed(); + print("predicate: "); + print_predicate( + register_data.before.get_p_register_value(registers_used_.governing_p)); + print("\noriginal ffr: "); + print_predicate(register_data.before.get_ffr_value()); + print("\nexpected ffr: "); + print_predicate(expected_ffr); + print("\nactual ffr: "); + print_predicate(actual_ffr); + print("\n"); + } + + assert(registers_used_.dest_z.size() == 1); + const auto dest_z = registers_used_.dest_z[0]; + + // Check destination register value. + if (faulting_element > 0) { + std::vector expected_output_data(reference_data_fault_); + apply_predicate_mask(expected_output_data, pred, element_size_); + const scalable_reg_value_t expected_output { + expected_output_data.data(), + expected_output_data.size(), + }; + + const auto output_value = register_data.after.get_z_register_value(dest_z); + + // Compare the output to the reference data up to where we hit the faulting + // element. + if (memcmp(expected_output.data, output_value.data, faulting_element) != 0) { + test_failed(); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used_.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); + } + } + + // Check that the values of the other Z registers have been preserved. + for (size_t i = 0; i < NUM_Z_REGS; i++) { + if (i != dest_z) + check_z_reg(i, register_data); + } + // Check that the values of the P registers have been preserved. 
+ for (size_t i = 0; i < NUM_P_REGS; i++) { + check_p_reg(i, register_data); + } + } +}; + +test_result_t +test_ldnf1_scalar_plus_immediate() +{ +# define TEST_FUNC(ld_instruction) \ + [](scalar_plus_immediate_non_fault_load_test_case_t::test_ptrs_t &ptrs) { \ + asm(/* clang-format off */ \ + RESTORE_FFR(p_restore_base) \ + RESTORE_Z_REGISTERS(z_restore_base) \ + RESTORE_P_REGISTERS(p_restore_base) \ + ld_instruction "\n" \ + SAVE_Z_REGISTERS(z_save_base) \ + SAVE_P_REGISTERS(p_save_base) \ + SAVE_FFR(p_save_base) /* clang-format on */ \ + : \ + : [base] "r"(ptrs.base), [z_restore_base] "r"(ptrs.z_restore_base), \ + [z_save_base] "r"(ptrs.z_save_base), \ + [p_restore_base] "r"(ptrs.p_restore_base), \ + [p_save_base] "r"(ptrs.p_save_base) \ + : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ + } + + const auto vl_bytes = static_cast(get_vl_bytes()); + + return run_tests({ + /* { + * Test name, + * Function that executes the test instruction, + * Registers used {{zt}, pg}, + * Expected output data when no faults (128-bit vl), + * Expected output data when no faults (256-bit vl), + * Expected output data when no faults (512-bit vl), + * Expected output data when fault half way through the vector (128-bit vl), + * Expected output data when fault half way through the vector (256-bit vl), + * Expected output data when fault half way through the vector (512-bit vl), + * Data size (used to set the base ptr), + * Offset in bytes (#imm * vl_bytes) / (element_size / data_size) + * }, + */ + // LDNF1B instructions + { + "ldnf1b scalar+immediate 8bit element", + TEST_FUNC("ldnf1b z28.b, p3/z, [%[base], #-6, mul vl]"), + { /*zt=*/28, /*pg=*/3 }, + std::array, 1> { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15 }, + std::array, 1> { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 }, + std::array, 1> { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x00, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, + 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 }, + + std::array { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + std::array { 0xf2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + std::array { + 0xf4, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf3, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xf2, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + element_size_t::BYTE, + /*offset=*/-6 * vl_bytes, + }, + { + "ldnf1b scalar+immediate 16bit element", + TEST_FUNC("ldnf1b z31.h, p0/z, [%[base], #-5, mul vl]"), + { /*zt=*/31, /*pg=*/0 }, + std::array, 1> { 0x00f8, 0x00f7, 0x00f6, 0x00f5, + 0x00f4, 0x00f3, 0x00f2, 0x00f1 }, + std::array, 1> { + 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, 0x0023, 0x00f8, + 0x00f7, 0x00f6, 0x00f5, 
0x00f4, 0x00f3, 0x00f2, 0x00f1 }, + std::array, 1> { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, + 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, 0x0023, + 0x00f8, 0x00f7, 0x00f6, 0x00f5, 0x00f4, 0x00f3, 0x00f2, 0x00f1 }, + std::array { 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x0000, 0x0000, + 0x0000, 0x0000 }, + std::array { 0x00f1, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000 }, + std::array { + 0x00f2, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00f1, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + element_size_t::BYTE, + /*offset=*/(-5 * vl_bytes) / 2, + }, + { + "ldnf1b scalar+immediate 32bit element", + TEST_FUNC("ldnf1b z2.s, p3/z, [%[base], #-4, mul vl]"), + { /*zt=*/2, /*pg=*/3 }, + std::array, 1> { 0x00000016, 0x00000017, 0x00000018, + 0x00000019 }, + std::array, 1> { 0x00000000, 0x00000001, 0x00000002, + 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007 }, + std::array, 1> { + 0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007, 0x00000008, 0x00000009, 0x00000010, 0x00000011, + 0x00000012, 0x00000013, 0x00000014, 0x00000015 }, + std::array { 0x000000ff, 0x000000ff, 0x00000000, 0x00000000 }, + std::array { 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + std::array { 0x000000f1, 0x000000ff, 0x000000ff, 0x000000ff, + 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + element_size_t::BYTE, + /*offset=*/(-4 * vl_bytes) / 4, + }, + { + "ldnf1b scalar+immediate 64bit element", + TEST_FUNC("ldnf1b z5.d, p6/z, [%[base], #-3, mul vl]"), + { /*zt=*/5, /*pg=*/6 }, + std::array, 1> { 0x00000000000000f6, + 0x00000000000000f5 }, + std::array, 1> { + 0x0000000000000020, 0x0000000000000021, 0x0000000000000022, + 0x0000000000000023 }, + std::array, 1> { + 0x0000000000000008, 0x0000000000000009, 0x0000000000000010, + 0x0000000000000011, 0x0000000000000012, 0x0000000000000013, + 0x0000000000000014, 0x0000000000000015 }, + std::array { 0x00000000000000ff, 0x0000000000000000 }, + std::array { 0x00000000000000ff, 0x00000000000000ff, + 0x0000000000000000, 0x0000000000000000 }, + std::array { 0x00000000000000ff, 0x00000000000000ff, + 0x00000000000000ff, 0x00000000000000ff, + 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000 }, + element_size_t::BYTE, + /*offset=*/(-3 * vl_bytes) / 8, + }, + // LDNF1SB instructions + { + "ldnf1sb scalar+immediate 16bit element", + TEST_FUNC("ldnf1sb z8.h, p5/z, [%[base], #-2, mul vl]"), + { /*zt=*/8, /*pg=*/5 }, + std::array, 1> { 0x0016, 0x0017, 0x0018, 0x0019, + 0x0020, 0x0021, 0x0022, 0x0023 }, + std::array, 1> { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, + 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015 }, + std::array, 1> { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, + 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, 0x0023, + -8, -9, -10, -11, -12, -13, -14, -15 }, + std::array { -1, -1, -1, -1, 0, 0, 0, 0 }, + std::array { -15, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 
0, 0 }, + std::array { -14, -1, -1, -1, -1, -1, -1, -1, -15, -1, -1, + -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + element_size_t::BYTE, + /*offset=*/(-2 * vl_bytes) / 2, + }, + { + "ldnf1sb scalar+immediate 32bit element", + TEST_FUNC("ldnf1sb z11.s, p2/z, [%[base], #-1, mul vl]"), + { /*zt=*/11, /*pg=*/2 }, + std::array, 1> { -12, -13, -14, -15 }, + std::array, 1> { -8, -9, -10, -11, -12, -13, -14, + -15 }, + std::array, 1> { + 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x00000020, 0x00000021, + 0x00000022, 0x00000023, -8, -9, -10, -11, -12, -13, -14, -15 }, + + std::array { -1, -1, 0, 0 }, + std::array { -1, -1, -1, -1, 0, 0, 0, 0 }, + std::array { -15, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 0, 0 }, + element_size_t::BYTE, + /*offset=*/(-1 * vl_bytes) / 4, + }, + { + "ldnf1sb scalar+immediate 64bit element", + TEST_FUNC("ldnf1sb z14.d, p1/z, [%[base], #0, mul vl]"), + { /*zt=*/14, /*pg=*/1 }, + std::array, 1> { 0x0000000000000000, + 0x0000000000000001 }, + std::array, 1> { + 0x0000000000000000, 0x0000000000000001, 0x0000000000000002, + 0x0000000000000003 }, + std::array, 1> { + 0x0000000000000000, 0x0000000000000001, 0x0000000000000002, + 0x0000000000000003, 0x0000000000000004, 0x0000000000000005, + 0x0000000000000006, 0x0000000000000007 }, + + std::array { -1, 0 }, + std::array { -1, -1, 0, 0 }, + std::array { -1, -1, -1, -1, 0, 0, 0, 0 }, + element_size_t::BYTE, + /*offset=*/(0 * vl_bytes) / 8, + }, + // LDNF1H instructions + { + "ldnf1h scalar+immediate 16bit element", + TEST_FUNC("ldnf1h z17.h, p4/z, [%[base], #1, mul vl]"), + { /*zt=*/17, /*pg=*/4 }, + std::array, 1> { 0x0008, 0x0009, 0x0010, 0x0011, + 0x0012, 0x0013, 0x0014, 0x0015 }, + std::array, 1> { + 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, 0x0023, 0xfff8, + 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 }, + std::array, 1> { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, + 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, 0x0023, + 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 }, + + std::array { 0xfff1, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, + 0x0000, 0x0000 }, + std::array { 0xfff2, 0xffff, 0xffff, 0xffff, 0xfff1, 0xffff, + 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000 }, + std::array { + 0xfff4, 0xffff, 0xffff, 0xffff, 0xfff3, 0xffff, 0xffff, 0xffff, + 0xfff2, 0xffff, 0xffff, 0xffff, 0xfff1, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + element_size_t::HALF, + /*offset=*/1 * vl_bytes, + }, + { + "ldnf1h scalar+immediate 32bit element", + TEST_FUNC("ldnf1h z20.s, p7/z, [%[base], #2, mul vl]"), + { /*zt=*/20, /*pg=*/7 }, + std::array, 1> { 0x00000008, 0x00000009, 0x00000010, + 0x00000011 }, + std::array, 1> { 0x00000016, 0x00000017, 0x00000018, + 0x00000019, 0x00000020, 0x00000021, + 0x00000022, 0x00000023 }, + std::array, 1> { + 0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007, 0x00000008, 0x00000009, 0x00000010, 0x00000011, + 0x00000012, 0x00000013, 0x00000014, 0x00000015 }, + + std::array { 0x0000ffff, 0x0000ffff, 0x00000000, 0x00000000 }, + std::array { 0x0000fff1, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + std::array { 0x0000fff2, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x0000ffff, 0x0000ffff, 0x0000ffff, 
0x0000ffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + element_size_t::HALF, + /*offset=*/(2 * vl_bytes) / 2, + }, + { + "ldnf1h scalar+immediate 64bit element", + TEST_FUNC("ldnf1h z23.d, p4/z, [%[base], #3, mul vl]"), + { /*zt=*/23, /*pg=*/4 }, + std::array, 1> { 0x0000000000000006, + 0x0000000000000007 }, + std::array, 1> { + 0x0000000000000012, 0x0000000000000013, 0x0000000000000014, + 0x0000000000000015 }, + std::array, 1> { + 0x000000000000fff8, 0x000000000000fff7, 0x000000000000fff6, + 0x000000000000fff5, 0x000000000000fff4, 0x000000000000fff3, + 0x000000000000fff2, 0x000000000000fff1 }, + + std::array { 0x000000000000ffff, 0x0000000000000000 }, + std::array { 0x000000000000ffff, 0x000000000000ffff, + 0x0000000000000000, 0x0000000000000000 }, + std::array { 0x000000000000fff1, 0x000000000000ffff, + 0x000000000000ffff, 0x000000000000ffff, + 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000 }, + element_size_t::HALF, + /*offset=*/(3 * vl_bytes) / 4, + }, + // LDNF1SH instructions + { + "ldnf1sh scalar+immediate 32bit element", + TEST_FUNC("ldnf1sh z26.s, p1/z, [%[base], #4, mul vl]"), + { /*zt=*/26, /*pg=*/1 }, + std::array, 1> { 0x00000016, 0x00000017, 0x00000018, + 0x00000019 }, + std::array, 1> { 0x00000000, 0x00000001, 0x00000002, + 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007 }, + std::array, 1> { + 0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007, 0x00000008, 0x00000009, 0x00000010, 0x00000011, + 0x00000012, 0x00000013, 0x00000014, 0x00000015 }, + + std::array { -1, -1, 0, 0 }, + std::array { -15, -1, -1, -1, 0, 0, 0, 0 }, + std::array { -14, -1, -1, -1, -15, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 0, 0 }, + element_size_t::HALF, + /*offset=*/(4 * vl_bytes) / 2, + }, + { + "ldnf1sh scalar+immediate 64bit element", + TEST_FUNC("ldnf1sh z29.d, p2/z, [%[base], #5, mul vl]"), + { /*zt=*/29, /*pg=*/2 }, + std::array, 1> { 0x0000000000000010, + 0x0000000000000011 }, + std::array, 1> { + 0x0000000000000020, 0x0000000000000021, 0x0000000000000022, + 0x0000000000000023 }, + std::array, 1> { + 0x0000000000000008, 0x0000000000000009, 0x0000000000000010, + 0x0000000000000011, 0x0000000000000012, 0x0000000000000013, + 0x0000000000000014, 0x0000000000000015 }, + + std::array { -1, 0 }, + std::array { -1, -1, 0, 0 }, + std::array { -15, -1, -1, -1, 0, 0, 0, 0 }, + element_size_t::HALF, + /*offset=*/(5 * vl_bytes) / 4, + }, + // LDNF1W instructions + { + "ldnf1w scalar+immediate 32bit element", + TEST_FUNC("ldnf1w z0.s, p5/z, [%[base], #6, mul vl]"), + { /*zt=*/0, /*pg=*/5 }, + std::array, 1> { 0xfffffff8, 0xfffffff7, 0xfffffff6, + 0xfffffff5 }, + std::array, 1> { 0x00000016, 0x00000017, 0x00000018, + 0x00000019, 0x00000020, 0x00000021, + 0x00000022, 0x00000023 }, + std::array, 1> { + 0x00000000, 0x00000001, 0x00000002, 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007, 0x00000008, 0x00000009, 0x00000010, 0x00000011, + 0x00000012, 0x00000013, 0x00000014, 0x00000015 }, + + std::array { 0xfffffff1, 0xffffffff, 0x00000000, 0x00000000 }, + std::array { 0xfffffff2, 0xffffffff, 0xfffffff2, 0xffffffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + std::array { 0xfffffff4, 0xffffffff, 0xfffffff3, 0xffffffff, + 0xfffffff2, 0xffffffff, 0xfffffff1, 0xffffffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + element_size_t::SINGLE, + /*offset=*/6 * vl_bytes, + }, + { + "ldnf1w 
scalar+immediate 64bit element", + TEST_FUNC("ldnf1w z3.d, p6/z, [%[base], #-6, mul vl]"), + { /*zt=*/3, /*pg=*/6 }, + std::array, 1> { 0x0000000000000020, + 0x0000000000000021 }, + std::array, 1> { + 0x0000000000000008, 0x0000000000000009, 0x0000000000000010, + 0x0000000000000011 }, + std::array, 1> { + 0x0000000000000016, 0x0000000000000017, 0x0000000000000018, + 0x0000000000000019, 0x0000000000000020, 0x0000000000000021, + 0x0000000000000022, 0x0000000000000023 }, + + std::array { 0xffffffffffffffff, 0x0000000000000000 }, + std::array { 0xfffffffffffffff1, 0xffffffffffffffff, + 0x0000000000000000, 0x0000000000000000 }, + std::array { 0xfffffffffffffff2, 0xffffffffffffffff, + 0xfffffffffffffff1, 0xffffffffffffffff, + 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000 }, + element_size_t::SINGLE, + /*offset=*/(-6 * vl_bytes) / 2, + }, + // LDNF1SW instructions + { + "ldnf1sw scalar+immediate 64bit element", + TEST_FUNC("ldnf1sw z6.d, p3/z, [%[base], #-5, mul vl]"), + { /*zt=*/6, /*pg=*/3 }, + std::array, 1> { 0x0000000000000022, + 0x0000000000000023 }, + std::array, 1> { + 0x0000000000000012, 0x0000000000000013, 0x0000000000000014, + 0x0000000000000015 }, + std::array, 1> { -8, -9, -10, -11, -12, -13, -14, + -15 }, + + std::array { -1, 0 }, + std::array { -15, -1, 0, 0 }, + std::array { -14, -1, -15, -1, 0, 0, 0, 0 }, + element_size_t::SINGLE, + /*offset=*/(-5 * vl_bytes) / 2, + }, + // LDNF1D instructions + { + "ldnf1d scalar+immediate 64bit element", + TEST_FUNC("ldnf1d z9.d, p0/z, [%[base], #-4, mul vl]"), + { /*zt=*/9, /*pg=*/0 }, + std::array, 1> { 0xfffffffffffffff8, + 0xfffffffffffffff7 }, + std::array, 1> { + 0x0000000000000016, 0x0000000000000017, 0x0000000000000018, + 0x0000000000000019 }, + std::array, 1> { + 0x0000000000000000, 0x0000000000000001, 0x0000000000000002, + 0x0000000000000003, 0x0000000000000004, 0x0000000000000005, + 0x0000000000000006, 0x0000000000000007 }, + + std::array { 0xfffffffffffffff1, 0x0000000000000000 }, + std::array { 0xfffffffffffffff2, 0xfffffffffffffff1, + 0x0000000000000000, 0x0000000000000000 }, + std::array { 0xfffffffffffffff4, 0xfffffffffffffff3, + 0xfffffffffffffff2, 0xfffffffffffffff1, + 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000 }, + element_size_t::DOUBLE, + /*offset=*/-4 * vl_bytes, + }, + }); +# undef TEST_FUNC +} + template struct scalar_plus_immediate_store_test_case_t : public test_case_base_t { @@ -5727,7 +6310,7 @@ struct scalar_plus_immediate_store_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { // Check that the values of the Z registers have been preserved. for (size_t i = 0; i < NUM_Z_REGS; i++) { @@ -6308,7 +6891,7 @@ struct vector_plus_scalar_load_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { if (!expected_fault) { const auto vl_bytes = get_vl_bytes(); @@ -6560,7 +7143,7 @@ struct vector_plus_scalar_store_test_case_t void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, - bool expected_fault) override + bool expected_fault, size_t faulting_element) override { // Check that the values of the Z registers have been preserved. 
for (size_t i = 0; i < NUM_Z_REGS; i++) { @@ -6732,6 +7315,8 @@ main(int argc, char **argv) status = FAIL; if (test_ld1_scalar_plus_immediate() == FAIL) status = FAIL; + if (test_ldnf1_scalar_plus_immediate() == FAIL) + status = FAIL; if (test_ld2_scalar_plus_immediate() == FAIL) status = FAIL; if (test_ld3_scalar_plus_immediate() == FAIL) diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.templatex b/suite/tests/client-interface/drx-scattergather-aarch64.templatex index d0514131a60..057a4907ac4 100644 --- a/suite/tests/client-interface/drx-scattergather-aarch64.templatex +++ b/suite/tests/client-interface/drx-scattergather-aarch64.templatex @@ -222,6 +222,22 @@ ld1rqw scalar\+immediate: PASS ld1rqd scalar\+immediate: PASS ld1rqd scalar\+immediate \(min index\): PASS ld1rqd scalar\+immediate \(max index\): PASS +ldnf1b scalar\+immediate 8bit element: PASS +ldnf1b scalar\+immediate 16bit element: PASS +ldnf1b scalar\+immediate 32bit element: PASS +ldnf1b scalar\+immediate 64bit element: PASS +ldnf1sb scalar\+immediate 16bit element: PASS +ldnf1sb scalar\+immediate 32bit element: PASS +ldnf1sb scalar\+immediate 64bit element: PASS +ldnf1h scalar\+immediate 16bit element: PASS +ldnf1h scalar\+immediate 32bit element: PASS +ldnf1h scalar\+immediate 64bit element: PASS +ldnf1sh scalar\+immediate 32bit element: PASS +ldnf1sh scalar\+immediate 64bit element: PASS +ldnf1w scalar\+immediate 32bit element: PASS +ldnf1w scalar\+immediate 64bit element: PASS +ldnf1sw scalar\+immediate 64bit element: PASS +ldnf1d scalar\+immediate 64bit element: PASS ld2b scalar\+immediate: PASS ld2h scalar\+immediate: PASS ld2w scalar\+immediate: PASS @@ -296,9 +312,9 @@ stnt1d vector\+scalar 64bit unscaled offset \(repeated base\): PASS #endif /* __ARM_FEATURE_SVE2 */ #ifndef TEST_SAMPLE_CLIENT #if defined(__ARM_FEATURE_SVE2) -event_exit, 3744 scatter/gather instructions +event_exit, 3936 scatter/gather instructions #elif defined( __ARM_FEATURE_SVE) -event_exit, 3564 scatter/gather instructions +event_exit, 3756 scatter/gather instructions #else event_exit, 0 scatter/gather instructions #endif /* __ARM_FEATURE_SVE */ diff --git a/suite/tests/client-interface/drx-scattergather-bbdup.dll.c b/suite/tests/client-interface/drx-scattergather-bbdup.dll.c index a6c322e40ef..77bf00289c2 100644 --- a/suite/tests/client-interface/drx-scattergather-bbdup.dll.c +++ b/suite/tests/client-interface/drx-scattergather-bbdup.dll.c @@ -260,10 +260,6 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, } else if (instr_is_scatter(instr)) { scatter_gather_present = true; #if defined(X86) - /* TODO i#5036: Port this code to AArch64 to test state restoration of - * clobbered predicate registers (when we have added support for state - * restoration). - */ } else if (instr_is_mov_constant(instr, &val) && val == TEST_AVX512_GATHER_MASK_CLOBBER_MARKER) { instr_t *next_instr = instr_get_next(instr); @@ -345,10 +341,6 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, CHECK((scatter_gather_present IF_X64(&&expanded)) || (expansion_ok && !expanded), "drx_expand_scatter_gather() bad OUT values"); #if defined(X86) - /* TODO i#5036: Port this code to AArch64 to test state restoration of clobbered - * predicate registers (when we have added support for state - * restoration). 
- */ for (instr = instrlist_first(bb); instr != NULL; instr = instr_get_next(instr)) { if (instr_get_opcode(instr) == OP_kandnw && (instr_get_app_pc(instr) == mask_clobber_test_avx512_gather_pc || diff --git a/suite/tests/client-interface/drx-scattergather.dll.c b/suite/tests/client-interface/drx-scattergather.dll.c index d3273cb0b34..0db0f76e73a 100644 --- a/suite/tests/client-interface/drx-scattergather.dll.c +++ b/suite/tests/client-interface/drx-scattergather.dll.c @@ -185,10 +185,6 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, } else if (instr_is_scatter(instr)) { scatter_gather_present = true; #if defined(X86) - /* TODO i#5036: Port this code to AArch64 to test state restoration of - * clobbered predicate registers (when we have added support for state - * restoration). - */ } else if (instr_is_mov_constant(instr, &val) && val == TEST_AVX512_GATHER_MASK_CLOBBER_MARKER) { instr_t *next_instr = instr_get_next(instr); @@ -270,10 +266,6 @@ event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, CHECK((scatter_gather_present IF_X64(&&expanded)) || (expansion_ok && !expanded), "drx_expand_scatter_gather() bad OUT values"); #if defined(X86) - /* TODO i#5036: Port this code to AArch64 to test state restoration of clobbered - * predicate registers (when we have added support for state - * restoration). - */ for (instr = instrlist_first(bb); instr != NULL; instr = instr_get_next(instr)) { if (instr_get_opcode(instr) == OP_kandnw && (instr_get_app_pc(instr) == mask_clobber_test_avx512_gather_pc ||
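For reference, a minimal sketch (not part of this patch) of the FFR update that drx_scatter_gather_signal_event() performs when an expanded ldnf1* element access faults. The real code operates on a dr_simd_t in 32-bit chunks; this hypothetical mask_ffr() helper collapses that to a single 64-bit value just to show the arithmetic:

#include <stdint.h>

/* 'loop_var' is the expansion's loop predicate with exactly one bit set, at the
 * bit position of the faulting element (element_index * element_size_bytes).
 * Subtracting 1 turns it into a mask of all lower bits, so the AND keeps the
 * FFR bits for elements before the fault and clears the faulting element and
 * everything above it.
 */
static uint64_t
mask_ffr(uint64_t ffr, uint64_t loop_var)
{
    return ffr & (loop_var - 1);
}

/* Example: 32-bit elements own 4 predicate bits each, so if element 2 faults,
 * loop_var == 1u << (2 * 4) == 0x100 and mask_ffr(ffr, 0x100) keeps only
 * bits 0-7 of the FFR, i.e. elements 0 and 1.
 */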
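Also for reference, a minimal client sketch (not part of this patch; it assumes the usual drmgr/drx client setup, and the callback name is illustrative) of how drx_expand_scatter_gather() is typically driven. With this patch the expansion also covers non-fault ldnf1* loads, and faults hit by the expansion loads are handled by the signal event that drx_scatter_gather_init() now registers on AArch64:

#include "dr_api.h"
#include "drmgr.h"
#include "drx.h"

static dr_emit_flags_t
event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace,
                 bool translating)
{
    bool expanded;
    /* Expand SVE scatter/gather and predicated contiguous loads/stores in this
     * block into scalar element accesses (now including ldnf1* loads). */
    if (!drx_expand_scatter_gather(drcontext, bb, &expanded))
        DR_ASSERT(false);
    return DR_EMIT_DEFAULT;
}

DR_EXPORT void
dr_client_main(client_id_t id, int argc, const char *argv[])
{
    drmgr_init();
    drx_init(); /* Sets up drx, including its scatter/gather state restoration
                 * and (with this patch, on AArch64) signal handling. */
    drmgr_register_bb_app2app_event(event_bb_app2app, NULL);
}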