| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s |
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s |
| |
| ; Build a <2 x half> from two scalar half loads in addrspace(5) at constant |
| ; addresses: element 0 comes from half index 1 past the null pointer (byte |
| ; offset 2) and element 1 comes from the null pointer itself. |
| ; The assertions below expect the element-1 load to be selected as a d16_hi |
| ; load whose destination is the same register that already holds element 0. |
define <2 x half> @chain_hi_to_lo_private() { |
| ; GFX900-LABEL: chain_hi_to_lo_private: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 |
| ; GFX900-NEXT: s_nop 0 |
| ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_private: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: s_mov_b32 s0, 2 |
| ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0 |
| ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private: |
| ; GFX10_DEFAULT: ; %bb.0: ; %bb |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10_DEFAULT-NEXT: s_clause 0x1 |
| ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 |
| ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private: |
| ; FLATSCR_GFX10: ; %bb.0: ; %bb |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 2 |
| ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 |
| ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_private: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2 |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s0 |
| ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0 |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 2 |
| ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s0 |
| ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 |
| ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0 |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The low element is read from one half past the null base; the high element |
| ; is read from the null base itself. |
| %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1 |
| %load_lo = load half, ptr addrspace(5) %gep_lo |
| %load_hi = load half, ptr addrspace(5) null |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads in addrspace(5) whose addresses |
| ; are unrelated pointer arguments: element 0 is loaded through %base_lo and |
| ; element 1 through %base_hi. |
| ; The assertions below expect the element-1 load to be a d16_hi load into the |
| ; register that already holds element 0, addressed by the second argument. |
define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base_lo, ptr addrspace(5) %base_hi) { |
| ; GFX900-LABEL: chain_hi_to_lo_private_different_bases: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen |
| ; GFX900-NEXT: s_nop 0 |
| ; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: scratch_load_ushort v0, v0, off |
| ; FLATSCR-NEXT: s_nop 0 |
| ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_different_bases: |
| ; GFX10_DEFAULT: ; %bb.0: ; %bb |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10_DEFAULT-NEXT: s_clause 0x1 |
| ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen |
| ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases: |
| ; FLATSCR_GFX10: ; %bb.0: ; %bb |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off |
| ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, v0, off |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off |
| ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; Each element is loaded through its own, independent pointer argument. |
| %load_lo = load half, ptr addrspace(5) %base_lo |
| %load_hi = load half, ptr addrspace(5) %base_hi |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> whose element 0 is computed (%in + 1.0) rather than loaded |
| ; and whose element 1 is loaded from %base in addrspace(5). |
| ; The assertions below expect the add to be issued first, the d16_hi load to |
| ; target the register holding the add result, and a final copy into v0. |
| ; NOTE(review): "arithmatic" in the function name looks like a misspelling of |
| ; "arithmetic"; it is kept because the assertion labels reference this symbol. |
define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) { |
| ; GFX900-LABEL: chain_hi_to_lo_arithmatic: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 |
| ; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_arithmatic: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: v_add_f16_e32 v1, 1.0, v1 |
| ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_arithmatic: |
| ; GFX10_DEFAULT: ; %bb.0: ; %bb |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10_DEFAULT-NEXT: v_add_f16_e32 v1, 1.0, v1 |
| ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_arithmatic: |
| ; FLATSCR_GFX10: ; %bb.0: ; %bb |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1 |
| ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 |
| ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1 |
| ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The low element is produced by arithmetic; only the high element is a load. |
| %arith_lo = fadd half %in, 1.0 |
| %load_hi = load half, ptr addrspace(5) %base |
| |
| ; Pack: %arith_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %arith_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads in addrspace(3) at constant |
| ; addresses: element 0 comes from half index 1 past the null pointer (byte |
| ; offset 2) and element 1 comes from the null pointer itself. |
| ; The assertions below expect two DS loads separated by an lgkmcnt wait, with |
| ; the second one being a d16_hi load into the register holding element 0. |
define <2 x half> @chain_hi_to_lo_group() { |
| ; GCN-LABEL: chain_hi_to_lo_group: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: ds_read_u16 v0, v1 offset:2 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_group: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: ds_read_u16 v0, v1 offset:2 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_group: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v1 offset:2 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_group: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-FAKE16-NEXT: ds_load_u16 v0, v1 offset:2 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The low element is read from one half past the null base; the high element |
| ; is read from the null base itself. |
| %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1 |
| %load_lo = load half, ptr addrspace(3) %gep_lo |
| %load_hi = load half, ptr addrspace(3) null |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads in addrspace(3) whose addresses |
| ; are unrelated pointer arguments: element 0 is loaded through %base_lo and |
| ; element 1 through %base_hi. |
| ; The assertions below expect two DS loads separated by an lgkmcnt wait, with |
| ; the second one being a d16_hi load into the register holding element 0. |
define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_lo, ptr addrspace(3) %base_hi) { |
| ; GCN-LABEL: chain_hi_to_lo_group_different_bases: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: ds_read_u16 v0, v0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_group_different_bases: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: ds_read_u16 v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_different_bases: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_different_bases: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; Each element is loaded through its own, independent pointer argument. |
| %load_lo = load half, ptr addrspace(3) %base_lo |
| %load_hi = load half, ptr addrspace(3) %base_hi |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads in addrspace(1) at constant |
| ; addresses: element 0 comes from half index 1 past the null pointer (byte |
| ; offset 2) and element 1 comes from the null pointer itself. |
| ; The assertions below expect both 64-bit addresses to be materialized in |
| ; VGPR pairs and the element-1 load to be a d16_hi load into the register |
| ; holding element 0. |
define <2 x half> @chain_hi_to_lo_global() { |
| ; GCN-LABEL: chain_hi_to_lo_global: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v0, 2 |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: global_load_ushort v0, v[0:1], off |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: v_mov_b32_e32 v2, 0 |
| ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_global: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2 |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2 |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The low element is read from one half past the null base; the high element |
| ; is read from the null base itself. |
| %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1 |
| %load_lo = load half, ptr addrspace(1) %gep_lo |
| %load_hi = load half, ptr addrspace(1) null |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads in addrspace(1) whose addresses |
| ; are unrelated pointer arguments: element 0 is loaded through %base_lo and |
| ; element 1 through %base_hi. |
| ; The assertions below expect the element-1 load to be a d16_hi load into the |
| ; register holding element 0, addressed by the second argument's VGPR pair. |
define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_lo, ptr addrspace(1) %base_hi) { |
| ; GCN-LABEL: chain_hi_to_lo_global_different_bases: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: global_load_ushort v0, v[0:1], off |
| ; GCN-NEXT: s_nop 0 |
| ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_global_different_bases: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off |
| ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off |
| ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off |
| ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; Each element is loaded through its own, independent pointer argument. |
| %load_lo = load half, ptr addrspace(1) %base_lo |
| %load_hi = load half, ptr addrspace(1) %base_hi |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads through unqualified `ptr` |
| ; (default address space) at constant addresses: element 0 comes from half |
| ; index 1 past the null pointer (byte offset 2) and element 1 comes from the |
| ; null pointer itself. |
| ; The assertions below expect flat loads with a combined vmcnt/lgkmcnt wait |
| ; between them, the second being a d16_hi load into the register holding |
| ; element 0. |
define <2 x half> @chain_hi_to_lo_flat() { |
| ; GCN-LABEL: chain_hi_to_lo_flat: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v0, 2 |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: flat_load_ushort v0, v[0:1] |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| ; GCN-NEXT: v_mov_b32_e32 v2, 0 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_flat: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: flat_load_ushort v0, v[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2 |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2] |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2 |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1] |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2] |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; The low element is read from one half past the null base; the high element |
| ; is read from the null base itself. |
| %gep_lo = getelementptr inbounds half, ptr null, i64 1 |
| %load_lo = load half, ptr %gep_lo |
| %load_hi = load half, ptr null |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Build a <2 x half> from two scalar half loads through unqualified `ptr` |
| ; (default address space) arguments: element 0 is loaded through %base_lo and |
| ; element 1 through %base_hi. |
| ; The assertions below expect flat loads with a combined vmcnt/lgkmcnt wait |
| ; between them, the second being a d16_hi load into the register holding |
| ; element 0. |
define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_hi) { |
| ; GCN-LABEL: chain_hi_to_lo_flat_different_bases: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: flat_load_ushort v0, v[0:1] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_flat_different_bases: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: flat_load_ushort v0, v[0:1] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3] |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3] |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1] |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3] |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| ; Each element is loaded through its own, independent pointer argument. |
| %load_lo = load half, ptr %base_lo |
| %load_hi = load half, ptr %base_hi |
| |
| ; Pack the two scalars: %load_lo into element 0, %load_hi into element 1. |
| %temp = insertelement <2 x half> poison, half %load_lo, i32 0 |
| %result = insertelement <2 x half> %temp, half %load_hi, i32 1 |
| |
| ret <2 x half> %result |
| } |
| |
| ; Make sure we don't lose any of the private stores. |
| ; Kernel that copies three i16 values from %in into a 3-element addrspace(5) |
| ; alloca using volatile stores, then reads the alloca back as two overlapping |
| ; <2 x i16> vectors (elements 0-1 and elements 1-2) and writes them to %out[0] |
| ; and %out[1]. The assertions below expect all three 16-bit stores to be |
| ; emitted, each followed by a wait, before any of the reloads. |
define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 { |
| ; GFX900-LABEL: vload2_private: |
| ; GFX900: ; %bb.0: ; %entry |
| ; GFX900-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 |
| ; GFX900-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX900-NEXT: s_add_u32 s0, s0, s17 |
| ; GFX900-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 |
| ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 |
| ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 |
| ; GFX900-NEXT: s_waitcnt vmcnt(1) |
| ; GFX900-NEXT: v_mov_b32_e32 v1, v0 |
| ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4 |
| ; GFX900-NEXT: s_waitcnt vmcnt(1) |
| ; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] |
| ; GFX900-NEXT: s_endpgm |
| ; |
| ; FLATSCR-LABEL: vload2_private: |
| ; FLATSCR: ; %bb.0: ; %entry |
| ; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 |
| ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 |
| ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 |
| ; FLATSCR-NEXT: s_mov_b32 s4, 0 |
| ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) |
| ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: scratch_store_short off, v0, s4 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: scratch_store_short off, v0, s4 offset:2 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 |
| ; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(1) |
| ; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 |
| ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(1) |
| ; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] |
| ; FLATSCR-NEXT: s_endpgm |
| ; |
| ; GFX10_DEFAULT-LABEL: vload2_private: |
| ; GFX10_DEFAULT: ; %bb.0: ; %entry |
| ; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 |
| ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s17 |
| ; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10_DEFAULT-NEXT: s_clause 0x1 |
| ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 |
| ; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 |
| ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] |
| ; GFX10_DEFAULT-NEXT: s_endpgm |
| ; |
| ; FLATSCR_GFX10-LABEL: vload2_private: |
| ; FLATSCR_GFX10: ; %bb.0: ; %entry |
| ; FLATSCR_GFX10-NEXT: s_add_u32 s8, s8, s13 |
| ; FLATSCR_GFX10-NEXT: s_addc_u32 s9, s9, 0 |
| ; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 |
| ; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 |
| ; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 |
| ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; FLATSCR_GFX10-NEXT: s_mov_b32 s4, 0 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s4 offset:2 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; FLATSCR_GFX10-NEXT: s_clause 0x1 |
| ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 |
| ; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 |
| ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] |
| ; FLATSCR_GFX10-NEXT: s_endpgm |
| ; |
| ; GFX11-TRUE16-LABEL: vload2_private: |
| ; GFX11-TRUE16: ; %bb.0: ; %entry |
| ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:2 |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:4 |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off offset:2 |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h |
| ; GFX11-TRUE16-NEXT: s_clause 0x1 |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off |
| ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3] |
| ; GFX11-TRUE16-NEXT: s_endpgm |
| ; |
| ; GFX11-FAKE16-LABEL: vload2_private: |
| ; GFX11-FAKE16: ; %bb.0: ; %entry |
| ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2 |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4 |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-FAKE16-NEXT: s_clause 0x1 |
| ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2 |
| ; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0 |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 |
| ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3] |
| ; GFX11-FAKE16-NEXT: s_endpgm |
| entry: |
| ; Three-element i16 buffer in addrspace(5), 2-byte aligned. |
| %loc = alloca [3 x i16], align 2, addrspace(5) |
| ; Copy in[0] -> loc[0]; the store is volatile so it must be emitted. |
| %tmp = load i16, ptr addrspace(1) %in, align 2 |
| store volatile i16 %tmp, ptr addrspace(5) %loc |
| ; Copy in[1] -> loc[1] (volatile store). |
| %arrayidx.1 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1 |
| %tmp1 = load i16, ptr addrspace(1) %arrayidx.1, align 2 |
| %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1 |
| store volatile i16 %tmp1, ptr addrspace(5) %loc.2.sroa_idx3 |
| ; Copy in[2] -> loc[2] (volatile store). |
| %arrayidx.2 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 2 |
| %tmp2 = load i16, ptr addrspace(1) %arrayidx.2, align 2 |
| %loc.4.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 2 |
| store volatile i16 %tmp2, ptr addrspace(5) %loc.4.sroa_idx |
| ; Reload loc[0..1] as a 2-byte-aligned <2 x i16> and write it to out[0]. |
| %loc.0. = load <2 x i16>, ptr addrspace(5) %loc, align 2 |
| store <2 x i16> %loc.0., ptr addrspace(1) %out, align 4 |
| ; Reload loc[1..2], which overlaps the previous vector at loc[1], and write |
| ; it to out[1]. |
| %loc.2.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1 |
| %loc.2. = load <2 x i16>, ptr addrspace(5) %loc.2.sroa_idx, align 2 |
| %arrayidx6 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 1 |
| store <2 x i16> %loc.2., ptr addrspace(1) %arrayidx6, align 4 |
| ret void |
| } |
| |
| ; There is another instruction between the misordered instruction and |
| ; the value dependent load, so a simple operand check is insufficient. |
| ; In the IR the low element (2 bytes past %ptr) is loaded before the high |
| ; element, and the high element goes through a vector add before the low |
| ; element is inserted. Every expected sequence below issues the d16_hi load |
| ; first, then the packed add, and only then the d16 load of the low element |
| ; into the register that holds the add result. |
define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) { |
| ; GCN-LABEL: chain_hi_to_lo_group_other_dep: |
| ; GCN: ; %bb.0: ; %bb |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: ds_read_u16_d16_hi v1, v0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: v_mov_b32_e32 v0, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_group_other_dep: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: ds_read_u16_d16_hi v1, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: chain_hi_to_lo_group_other_dep: |
| ; GFX11: ; %bb.0: ; %bb |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: ds_load_u16_d16_hi v1, v0 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 |
| %load_lo = load i16, ptr addrspace(3) %gep_lo |
| %load_hi = load i16, ptr addrspace(3) %ptr |
| ; Lane 1 receives the high element, both lanes get 12 added, then lane 0 is |
| ; overwritten with the low element. |
| %to.hi = insertelement <2 x i16> poison, i16 %load_hi, i32 1 |
| %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> |
| %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
| ret <2 x i16> %result |
| } |
| |
| ; The volatile operations aren't put on the same chain. |
| ; The IR is a low-element load, a high-element load, a packed add and a |
| ; low-element insert, with both loads marked volatile. The expected sequences |
| ; keep the two loads in IR order ahead of the add and merge the low element |
| ; with v_bfi_b32 afterwards. |
define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %ptr) { |
| ; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: ds_read_u16 v1, v0 offset:2 |
| ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 |
| ; GFX900-NEXT: s_mov_b32 s4, 0xffff |
| ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: ds_read_u16 v1, v0 offset:2 |
| ; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff |
| ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) |
| ; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v1, v0 |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 |
| ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16 v1, v0 offset:2 |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v0 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: ds_load_u16 v1, v0 offset:2 |
| ; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v0 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 |
| ; Both loads are volatile; the low element is read first. |
| %load_lo = load volatile i16, ptr addrspace(3) %gep_lo |
| %load_hi = load volatile i16, ptr addrspace(3) %ptr |
| %to.hi = insertelement <2 x i16> poison, i16 %load_hi, i32 1 |
| %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> |
| %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
| ret <2 x i16> %result |
| } |
| |
| ; addrspace(5) variant with non-volatile loads. The low element is loaded |
| ; first in the IR, while the expected sequences load the high half with a |
| ; d16_hi load, perform the packed add, and then load the low half with a d16 |
| ; load into the same register. The default runs expect buffer_load forms and |
| ; the flat-scratch and gfx1100 runs expect scratch_load forms. |
| define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) { |
| ; GFX900-LABEL: chain_hi_to_lo_private_other_dep: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2 |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep: |
| ; GFX10_DEFAULT: ; %bb.0: ; %bb |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; GFX10_DEFAULT-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 |
| ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep: |
| ; FLATSCR_GFX10: ; %bb.0: ; %bb |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2 |
| ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 |
| ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: chain_hi_to_lo_private_other_dep: |
| ; GFX11: ; %bb.0: ; %bb |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0] |
| ; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1 |
| %load_lo = load i16, ptr addrspace(5) %gep_lo |
| %load_hi = load i16, ptr addrspace(5) %ptr |
| ; Lane 1 receives the high element, both lanes get 12 added, then lane 0 is |
| ; overwritten with the low element. |
| %to.hi = insertelement <2 x i16> poison, i16 %load_hi, i32 1 |
| %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> |
| %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
| ret <2 x i16> %result |
| } |
| |
| ; addrspace(1) variant in which both loads are volatile. The expected |
| ; sequences keep the loads in IR order (low element at offset 2, then the high |
| ; element), wait on vmcnt after each one, and merge the low element into the |
| ; packed-add result with v_bfi_b32. The loads carry glc, plus dlc on the |
| ; gfx1010 and gfx1100 runs. |
| define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { |
| ; GFX900-LABEL: chain_hi_to_lo_global_other_dep: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: s_mov_b32 s4, 0xffff |
| ; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff |
| ; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_global_other_dep: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| %gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1 |
| ; Both loads are volatile; the low element is read first. |
| %load_lo = load volatile i16, ptr addrspace(1) %gep_lo |
| %load_hi = load volatile i16, ptr addrspace(1) %ptr |
| %to.hi = insertelement <2 x i16> poison, i16 %load_hi, i32 1 |
| %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> |
| %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
| ret <2 x i16> %result |
| } |
| |
| ; addrspace(0) variant in which both loads are volatile. The expected |
| ; sequences keep the loads in IR order (low element first, then the high |
| ; element) and merge the low element into the packed-add result with |
| ; v_bfi_b32. The gfx1010 runs expect the low element's address to be computed |
| ; with v_add_co_u32 / v_add_co_ci_u32, whereas the other runs expect an |
| ; offset:2 operand on the load. |
| define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) { |
| ; GFX900-LABEL: chain_hi_to_lo_flat_other_dep: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1] glc |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) |
| ; GFX900-NEXT: s_mov_b32 s4, 0xffff |
| ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: flat_load_short_d16_hi v0, v[0:1] glc |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0xffff |
| ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) |
| ; FLATSCR-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; FLATSCR-NEXT: v_bfi_b32 v0, s0, v2, v0 |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 2 |
| ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo |
| ; GFX10-NEXT: flat_load_ushort v2, v[2:3] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] |
| ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| %gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1 |
| ; Both loads are volatile; the low element is read first. |
| %load_lo = load volatile i16, ptr addrspace(0) %gep_lo |
| %load_hi = load volatile i16, ptr addrspace(0) %ptr |
| %to.hi = insertelement <2 x i16> poison, i16 %load_hi, i32 1 |
| %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> |
| %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 |
| ret <2 x i16> %result |
| } |
| |
| ; The high element is loaded from %ptr, then 123 is stored through a second |
| ; addrspace(3) pointer, then the low element is loaded from 2 bytes past %ptr. |
| ; In every expected sequence the store stays between the two loads. Most runs |
| ; expect two plain 16-bit loads combined with v_perm_b32; the true16 gfx1100 |
| ; run expects d16_hi / d16 loads into a single register instead. |
| define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, ptr addrspace(3) %may.alias) { |
| ; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store: |
| ; GFX900: ; %bb.0: ; %bb |
| ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7b |
| ; GFX900-NEXT: ds_read_u16 v2, v0 |
| ; GFX900-NEXT: ds_write_b16 v1, v3 |
| ; GFX900-NEXT: ds_read_u16 v0, v0 offset:2 |
| ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 |
| ; GFX900-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 |
| ; GFX900-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store: |
| ; FLATSCR: ; %bb.0: ; %bb |
| ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0x7b |
| ; FLATSCR-NEXT: ds_read_u16 v2, v0 |
| ; FLATSCR-NEXT: ds_write_b16 v1, v3 |
| ; FLATSCR-NEXT: ds_read_u16 v0, v0 offset:2 |
| ; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 |
| ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) |
| ; FLATSCR-NEXT: v_perm_b32 v0, v2, v0, s0 |
| ; FLATSCR-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: |
| ; GFX10: ; %bb.0: ; %bb |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b |
| ; GFX10-NEXT: ds_read_u16 v3, v0 |
| ; GFX10-NEXT: ds_write_b16 v1, v2 |
| ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_may_alias_store: |
| ; GFX11-TRUE16: ; %bb.0: ; %bb |
| ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0 |
| ; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 |
| ; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2 |
| ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 |
| ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_may_alias_store: |
| ; GFX11-FAKE16: ; %bb.0: ; %bb |
| ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0x7b |
| ; GFX11-FAKE16-NEXT: ds_load_u16 v3, v0 |
| ; GFX11-FAKE16-NEXT: ds_store_b16 v1, v2 |
| ; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:2 |
| ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 |
| ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 |
| %load_hi = load i16, ptr addrspace(3) %ptr |
| ; The store sits between the two loads in program order. |
| store i16 123, ptr addrspace(3) %may.alias |
| %load_lo = load i16, ptr addrspace(3) %gep_lo |
| |
| %to.hi = insertelement <2 x i16> poison, i16 %load_hi, i32 1 |
| %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0 |
| ret <2 x i16> %result |
| } |