From 0c641f1f9c55662353f2f50a74c8d8701af4d691 Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin Date: Thu, 10 Dec 2020 12:40:49 +0100 Subject: [PATCH] [AMDGPU] Resolve issues when picking between ds_read/write and ds_read2/write2 Both ds_read_b128 and ds_read2_b64 are valid for 128-bit 16-byte aligned loads but the one that will be selected is determined either by the order in tablegen or by the AddedComplexity attribute. Currently ds_read_b128 has priority. While ds_read2_b64 has lower alignment requirements, we cannot always restrict ds_read_b128 to 16-byte alignment because of the unaligned-access-mode option. This was causing ds_read_b128 to be selected for 8-byte aligned loads regardless of the chosen access mode. To resolve this we use two patterns for selecting ds_read_b128. One requires 16-byte alignment and the other requires the unaligned-access-mode option. Same goes for ds_write2_b64 and ds_write_b128. Differential Revision: https://reviews.llvm.org/D92767 Change-Id: I195255b869d3a2d72e36bf70f29a52e797d98036 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 8 ++--- llvm/lib/Target/AMDGPU/DSInstructions.td | 21 ++++++------ .../GlobalISel/inst-select-load-local-128.mir | 22 +++++++------ .../AMDGPU/GlobalISel/lds-misaligned-bug.ll | 32 ++++++++++--------- .../AMDGPU/GlobalISel/load-local.128.ll | 2 +- .../AMDGPU/GlobalISel/store-local.128.ll | 2 +- .../test/CodeGen/AMDGPU/lds-misaligned-bug.ll | 32 ++++++++++--------- llvm/test/CodeGen/AMDGPU/load-local.128.ll | 2 +- llvm/test/CodeGen/AMDGPU/store-local.128.ll | 4 +-- 9 files changed, 65 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ce304a2d40860c..afb9e3d2fa4f76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1064,11 +1064,6 @@ def isGFX7GFX10 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>; -def isGFX7GFX8 : - 
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<(all_of FeatureSouthernIslands, FeatureCIInsts)>; - def isGFX7GFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" @@ -1274,6 +1269,9 @@ def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; +def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">, + AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>; + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 2e38619e2333e1..328c81005df449 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -680,7 +680,7 @@ foreach vt = VReg_64.RegTypes in { defm : DSReadPat_mc ; } -let SubtargetPredicate = isGFX7GFX8 in { +let SubtargetPredicate = isGFX7Plus in { foreach vt = VReg_96.RegTypes in { defm : DSReadPat_mc ; @@ -690,9 +690,7 @@ foreach vt = VReg_128.RegTypes in { defm : DSReadPat_mc ; } -} - -let SubtargetPredicate = isGFX9Plus in { +let SubtargetPredicate = HasUnalignedAccessMode in { foreach vt = VReg_96.RegTypes in { defm : DSReadPat_mc ; @@ -702,7 +700,9 @@ foreach vt = VReg_128.RegTypes in { defm : DSReadPat_mc ; } -} +} // End SubtargetPredicate = HasUnalignedAccessMode + +} // End SubtargetPredicate = isGFX7Plus } // End AddedComplexity = 100 @@ -835,7 +835,7 @@ foreach vt = VReg_64.RegTypes in { defm : DSWritePat_mc ; } -let SubtargetPredicate = isGFX7GFX8 in { +let SubtargetPredicate = isGFX7Plus in { foreach vt = VReg_96.RegTypes in { defm : DSWritePat_mc ; @@ -845,9 +845,7 @@ foreach vt = VReg_128.RegTypes in { defm : DSWritePat_mc ; } -} - -let SubtargetPredicate 
= isGFX9Plus in { +let SubtargetPredicate = HasUnalignedAccessMode in { foreach vt = VReg_96.RegTypes in { defm : DSWritePat_mc ; @@ -857,9 +855,12 @@ foreach vt = VReg_128.RegTypes in { defm : DSWritePat_mc ; } -} +} // End SubtargetPredicate = HasUnalignedAccessMode + +} // End SubtargetPredicate = isGFX7Plus } // End AddedComplexity = 100 + class DSAtomicRetPat : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), (inst $ptr, getVregSrcForVT.ret:$value, offset:$offset, (i1 gds)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir index e7c646ee73a7bb..71fc286dc75c22 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir @@ -50,8 +50,8 @@ body: | ; GFX9-LABEL: name: load_local_v4s32_align_8 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]] + ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load 16, align 8, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -78,8 +78,8 @@ body: | ; GFX9-LABEL: name: load_local_v4s32_align_8_offset_160 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 400, 0, implicit $exec :: (load 16, align 8, addrspace 3) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]] + ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load 16, align 8, 
addrspace 3) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 400 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -110,8 +110,10 @@ body: | ; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 4000, 0, implicit $exec :: (load 16, align 8, addrspace 3) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec + ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4000 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -140,8 +142,8 @@ body: | ; GFX9-LABEL: name: load_local_v2s64 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]] + ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 8, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -196,8 +198,8 @@ body: | ; GFX9-LABEL: name: load_local_s128 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 
= COPY [[DS_READ_B128_gfx9_]] + ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s128) = G_LOAD %0 :: (load 16, align 8, addrspace 3) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll index b0f664726b265e..9edbdfc2c2477b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -1,8 +1,8 @@ -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck 
-check-prefixes=GCN,UNALIGNED,VECT %s ; GCN-LABEL: test_local_misaligned_v2: ; GCN-DAG: ds_read2_b32 @@ -22,10 +22,10 @@ bb: } ; GCN-LABEL: test_local_misaligned_v4: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_write2_b32 -; GCN-DAG: ds_write2_b32 +; ALIGNED-DAG: ds_read2_b32 +; ALIGNED-DAG: ds_read2_b32 +; ALIGNED-DAG: ds_write2_b32 +; ALIGNED-DAG: ds_write2_b32 ; UNALIGNED-DAG: ds_read_b128 ; UNALIGNED-DAG: ds_write_b128 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { @@ -47,10 +47,10 @@ bb: } ; GCN-LABEL: test_local_misaligned_v3: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_read_b32 -; GCN-DAG: ds_write2_b32 -; GCN-DAG: ds_write_b32 +; ALIGNED-DAG: ds_read2_b32 +; ALIGNED-DAG: ds_read_b32 +; ALIGNED-DAG: ds_write2_b32 +; ALIGNED-DAG: ds_write_b32 ; UNALIGNED-DAG: ds_read_b96 ; UNALIGNED-DAG: ds_write_b96 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { @@ -106,8 +106,10 @@ bb: } ; GCN-LABEL: test_local_v4_aligned8: -; GCN-DAG: ds_read_b128 -; GCN-DAG: ds_write_b128 +; ALIGNED-DAG: ds_read2_b64 +; ALIGNED-DAG: ds_write2_b64 +; UNALIGNED-DAG: ds_read_b128 +; UNALIGNED-DAG: ds_write_b128 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll index 6cc9dbd1793bfa..c7f74cb3b48939 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -267,7 +267,7 @@ define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) { ; GFX9-LABEL: load_lds_v4i32_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 4973706f8e2e5c..8aa0f99cd862c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -250,7 +250,7 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll index 76ee829ad7933c..6aba2b5bf2b739 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s +; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s +; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s +; RUN: 
llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s ; GCN-LABEL: test_local_misaligned_v2: ; GCN-DAG: ds_read2_b32 @@ -22,10 +22,10 @@ bb: } ; GCN-LABEL: test_local_misaligned_v4: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_write2_b32 -; GCN-DAG: ds_write2_b32 +; ALIGNED-DAG: ds_read2_b32 +; ALIGNED-DAG: ds_read2_b32 +; ALIGNED-DAG: ds_write2_b32 +; ALIGNED-DAG: ds_write2_b32 ; UNALIGNED-DAG: ds_read_b128 ; UNALIGNED-DAG: ds_write_b128 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) { @@ -47,10 +47,10 @@ bb: } ; GCN-LABEL: test_local_misaligned_v3: -; GCN-DAG: ds_read2_b32 -; GCN-DAG: ds_read_b32 -; GCN-DAG: ds_write2_b32 -; GCN-DAG: ds_write_b32 +; ALIGNED-DAG: ds_read2_b32 +; ALIGNED-DAG: ds_read_b32 +; ALIGNED-DAG: ds_write2_b32 +; ALIGNED-DAG: ds_write_b32 ; UNALIGNED-DAG: ds_read_b96 ; UNALIGNED-DAG: ds_write_b96 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) { @@ -221,8 +221,10 @@ bb: } ; GCN-LABEL: test_local_v4_aligned8: -; GCN-DAG: ds_read_b128 -; GCN-DAG: ds_write_b128 +; ALIGNED-DAG: ds_read2_b64 +; ALIGNED-DAG: ds_write2_b64 +; UNALIGNED-DAG: ds_read_b128 +; UNALIGNED-DAG: ds_write_b128 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) { bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll index 716237a9955f7c..f5bd05a558fe47 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -325,7 +325,7 @@ define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) { ; GFX9-LABEL: load_lds_v4i32_align8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index 7d9fff7b456877..4805758bab7e43 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -340,10 +340,10 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align8: