; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; i16 multiply of two `inreg` arguments in an amdgpu_ps function; the expected
; code keeps everything in scalar registers (s0, s1).
; gfx700 checks expect a bare s_mul_i32 on the incoming registers, while the
; gfx801 and gfx900 checks first mask both inputs with 0xffff and then multiply.
; NOTE(review): the operand names %num/%den look carried over from a division
; test; here they are simply the two multiplicands -- confirm before renaming.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_and_b32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_and_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s1, s1, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply with the default calling convention; the expected code takes the
; operands in v0/v1 and returns through s_setpc_b64.
; gfx700 checks expect both inputs masked with 0xffff followed by
; v_mul_u32_u24; gfx801 and gfx900 checks expect a single v_mul_lo_u16.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; Scalar i16 multiply where both `inreg` arguments and the return value carry
; the `zeroext` attribute.
; On all three targets the checks expect the s_mul_i32 product to be masked
; with 0xffff afterwards; the gfx801 and gfx900 checks also mask both inputs
; before the multiply, the gfx700 checks do not.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_and_b32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_and_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s1, s1, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    s_and_b32 s0, s0, s2
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply with the default calling convention where both arguments and the
; return value carry the `zeroext` attribute.
; gfx700 checks expect the inputs masked with 0xffff, a v_mul_u32_u24, and the
; product masked again; gfx801 and gfx900 checks expect only v_mul_lo_u16 with
; no extra instruction after it.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; Scalar i16 multiply where both `inreg` arguments and the return value carry
; the `signext` attribute.
; On all three targets the checks expect s_sext_i32_i16 applied to the
; s_mul_i32 product; the gfx801 and gfx900 checks also mask both inputs with
; 0xffff before the multiply, the gfx700 checks do not.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    s_sext_i32_i16 s0, s0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_and_b32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_sext_i32_i16 s0, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_and_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s1, s1, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    s_sext_i32_i16 s0, s0
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply with the default calling convention where both arguments and the
; return value carry the `signext` attribute.
; Every target's checks end the computation with `v_bfe_i32 v0, v0, 0, 16`
; (presumably the 16-bit sign extension required by `signext` -- confirm
; against the ISA manual). Before that, gfx700 masks the inputs and uses
; v_mul_u32_u24, while gfx801 and gfx900 use v_mul_lo_u16 directly.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; i32 multiply of two `inreg` arguments in an amdgpu_ps function.
; The expectation is shared by all three targets: a single s_mul_i32 of s0 and
; s1 with the result left in s0.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mul_i32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
  %result = mul i32 %num, %den
  ret i32 %result
}

; i32 multiply with the default calling convention.
; The expectation is shared by all three targets: a single v_mul_lo_u32 of v0
; and v1 with the result left in v0.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i32 %num, %den
  ret i32 %result
}

; <2 x i32> multiply of two `inreg` vector arguments in an amdgpu_ps function.
; The expectation is shared by all three targets: one s_mul_i32 per element,
; pairing s0 with s2 and s1 with s3.
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mul_i32 s0, s0, s2
; GCN-NEXT:    s_mul_i32 s1, s1, s3
; GCN-NEXT:    ; return to shader part epilog
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}

; <2 x i32> multiply with the default calling convention.
; The expectation is shared by all three targets: one v_mul_lo_u32 per element,
; pairing v0 with v2 and v1 with v3.
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v1, v1, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}

; i64 multiply of two `inreg` arguments in an amdgpu_ps function, expanded into
; 32-bit pieces (operands in s[0:1] and s[2:3]).
; gfx700 and gfx801 checks expect the high half of the low-word product to be
; computed with the vector instruction v_mul_hi_u32 and copied back to a scalar
; register with v_readfirstlane_b32; gfx900 checks expect an all-scalar
; sequence that uses s_mul_hi_u32 instead.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s4, s0, s2
; GFX7-NEXT:    s_mul_i32 s1, s1, s2
; GFX7-NEXT:    s_mul_i32 s0, s0, s3
; GFX7-NEXT:    s_add_i32 s1, s1, s0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
; GFX7-NEXT:    s_mov_b32 s0, s4
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s4, s0, s2
; GFX8-NEXT:    s_mul_i32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s3
; GFX8-NEXT:    s_add_i32 s1, s1, s0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
; GFX8-NEXT:    s_mov_b32 s0, s4
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s1, s1, s2
; GFX9-NEXT:    s_mul_i32 s3, s0, s3
; GFX9-NEXT:    s_mul_i32 s4, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT:    s_add_i32 s1, s1, s3
; GFX9-NEXT:    s_add_i32 s1, s1, s0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i64 %num, %den
  ret i64 %result
}

; i64 multiply with the default calling convention, expanded into 32-bit pieces
; (operands in v[0:1] and v[2:3]).
; Every target's checks expect three v_mul_lo_u32 and one v_mul_hi_u32. The
; gfx700 and gfx801 checks sum the high-word terms with two separate adds and
; need a final v_mov_b32 for the low word; the gfx900 checks fold the sum into
; one v_add3_u32 and write the low word directly to v0.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX7-LABEL: v_mul_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mul_lo_u32 v4, v0, v3
; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX7-NEXT:    v_mul_lo_u32 v3, v0, v2
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v2
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v4, v0, v3
; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v2
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v2
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v3
; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i64 %num, %den
  ret i64 %result
}

; i96 multiply of two `inreg` arguments in an amdgpu_ps function; the product is
; bitcast to <3 x i32> for the return (operands in s[0:2] and s[3:5]).
; gfx700 and gfx801 checks expect a mix of scalar s_mul_i32 with vector
; v_mul_hi_u32, carries taken from both s_cselect_b32 and v_cndmask_b32, and
; the upper two result words copied back with v_readfirstlane_b32.
; gfx900 checks expect an all-scalar sequence using s_mul_hi_u32, with carries
; materialized by s_cselect_b32 after each s_add_u32.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    s_mul_i32 s7, s1, s3
; GFX7-NEXT:    s_mul_i32 s8, s0, s4
; GFX7-NEXT:    s_add_u32 s7, s7, s8
; GFX7-NEXT:    v_mov_b32_e32 v3, s4
; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s3
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
; GFX7-NEXT:    s_mul_i32 s7, s1, s4
; GFX7-NEXT:    s_mul_i32 s2, s2, s3
; GFX7-NEXT:    v_mul_hi_u32 v3, s0, v3
; GFX7-NEXT:    s_cselect_b32 s8, 1, 0
; GFX7-NEXT:    s_mul_i32 s6, s0, s3
; GFX7-NEXT:    s_mul_i32 s5, s0, s5
; GFX7-NEXT:    s_add_i32 s0, s2, s7
; GFX7-NEXT:    s_add_i32 s0, s0, s5
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT:    s_and_b32 s8, s8, 1
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s8, v1
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
; GFX7-NEXT:    s_mov_b32 s0, s6
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    s_mul_i32 s7, s1, s3
; GFX8-NEXT:    s_mul_i32 s8, s0, s4
; GFX8-NEXT:    s_add_u32 s7, s7, s8
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s7, v0
; GFX8-NEXT:    s_mul_i32 s7, s1, s4
; GFX8-NEXT:    s_mul_i32 s2, s2, s3
; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v3
; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
; GFX8-NEXT:    s_mul_i32 s6, s0, s3
; GFX8-NEXT:    s_mul_i32 s5, s0, s5
; GFX8-NEXT:    s_add_i32 s0, s2, s7
; GFX8-NEXT:    s_add_i32 s0, s0, s5
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT:    s_and_b32 s8, s8, 1
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s8, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    s_mov_b32 s0, s6
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s7, s1, s3
; GFX9-NEXT:    s_mul_i32 s8, s0, s4
; GFX9-NEXT:    s_add_u32 s7, s7, s8
; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s3
; GFX9-NEXT:    s_and_b32 s8, s8, 1
; GFX9-NEXT:    s_add_u32 s7, s7, s9
; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
; GFX9-NEXT:    s_and_b32 s9, s9, 1
; GFX9-NEXT:    s_add_i32 s8, s8, s9
; GFX9-NEXT:    s_mul_i32 s9, s1, s4
; GFX9-NEXT:    s_mul_i32 s2, s2, s3
; GFX9-NEXT:    s_mul_i32 s5, s0, s5
; GFX9-NEXT:    s_add_i32 s2, s2, s9
; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s3
; GFX9-NEXT:    s_add_i32 s2, s2, s5
; GFX9-NEXT:    s_mul_i32 s6, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT:    s_add_i32 s1, s2, s1
; GFX9-NEXT:    s_add_i32 s0, s1, s0
; GFX9-NEXT:    s_add_i32 s2, s0, s8
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    s_mov_b32 s1, s7
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i96 %num, %den
  %cast = bitcast i96 %result to <3 x i32>
  ret <3 x i32> %cast
}

; i96 multiply with the default calling convention, returned as i96 directly
; (operands in v[0:2] and v[3:5]).
; All targets' checks expect the partial products from v_mul_lo_u32 and
; v_mul_hi_u32, with the carries out of the middle word captured from vcc via
; v_cndmask_b32. The gfx700 and gfx801 checks accumulate the top word with a
; chain of two-operand adds; the gfx900 checks use v_add_co_u32 where the carry
; is consumed and v_add_u32 / v_add3_u32 elsewhere.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX7-LABEL: v_mul_i96:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mul_lo_u32 v7, v1, v3
; GFX7-NEXT:    v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT:    v_mul_hi_u32 v9, v0, v3
; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX7-NEXT:    v_mul_lo_u32 v5, v0, v5
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT:    v_mul_hi_u32 v1, v1, v3
; GFX7-NEXT:    v_mul_lo_u32 v6, v0, v3
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v4
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v8
; GFX7-NEXT:    v_mov_b32_e32 v0, v6
; GFX7-NEXT:    v_mov_b32_e32 v1, v7
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i96:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v3
; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT:    v_mul_hi_u32 v9, v0, v3
; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v5
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT:    v_mul_hi_u32 v1, v1, v3
; GFX8-NEXT:    v_mul_lo_u32 v6, v0, v3
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v4
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v9
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v8
; GFX8-NEXT:    v_mov_b32_e32 v0, v6
; GFX8-NEXT:    v_mov_b32_e32 v1, v7
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i96:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v3
; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v3
; GFX9-NEXT:    v_mul_lo_u32 v10, v1, v4
; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v5
; GFX9-NEXT:    v_mul_hi_u32 v1, v1, v3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT:    v_add_u32_e32 v2, v2, v10
; GFX9-NEXT:    v_add_u32_e32 v3, v8, v9
; GFX9-NEXT:    v_add3_u32 v1, v2, v5, v1
; GFX9-NEXT:    v_add3_u32 v2, v1, v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v0, v6
; GFX9-NEXT:    v_mov_b32_e32 v1, v7
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i96 %num, %den
  ret i96 %result
}

; i128 multiply of two `inreg` arguments in an amdgpu_ps function; the product
; is bitcast to <4 x i32> for the return (operands in s[0:3] and s[4:7]).
; gfx700 and gfx801 checks expect scalar s_mul_i32 for the low halves of the
; partial products but vector v_mul_hi_u32 for the high halves, with the upper
; three result words copied back through v_readfirstlane_b32.
; gfx900 checks expect an all-scalar sequence using s_mul_hi_u32, with each
; carry materialized by s_cselect_b32 after an s_add_u32.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s9, s1, s4
; GFX7-NEXT:    s_mul_i32 s10, s0, s5
; GFX7-NEXT:    s_add_u32 s9, s9, s10
; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s9, v0
; GFX7-NEXT:    s_and_b32 s10, s10, 1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s10, v1
; GFX7-NEXT:    s_mul_i32 s9, s2, s4
; GFX7-NEXT:    s_mul_i32 s10, s1, s5
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    s_add_u32 s9, s9, s10
; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT:    s_mul_i32 s11, s0, s6
; GFX7-NEXT:    s_and_b32 s10, s10, 1
; GFX7-NEXT:    s_add_u32 s9, s9, s11
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    s_cselect_b32 s11, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s9, v2
; GFX7-NEXT:    s_and_b32 s11, s11, 1
; GFX7-NEXT:    s_add_i32 s10, s10, s11
; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s10, v5
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT:    v_mov_b32_e32 v4, s2
; GFX7-NEXT:    v_mov_b32_e32 v5, s6
; GFX7-NEXT:    s_mul_i32 s5, s2, s5
; GFX7-NEXT:    s_mul_i32 s3, s3, s4
; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s4
; GFX7-NEXT:    s_mul_i32 s8, s0, s4
; GFX7-NEXT:    s_mul_i32 s9, s1, s6
; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT:    s_mul_i32 s7, s0, s7
; GFX7-NEXT:    v_mul_hi_u32 v5, s0, v5
; GFX7-NEXT:    s_add_i32 s0, s3, s5
; GFX7-NEXT:    s_add_i32 s0, s0, s9
; GFX7-NEXT:    s_add_i32 s0, s0, s7
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
; GFX7-NEXT:    v_readfirstlane_b32 s3, v2
; GFX7-NEXT:    s_mov_b32 s0, s8
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s9, s1, s4
; GFX8-NEXT:    s_mul_i32 s10, s0, s5
; GFX8-NEXT:    s_add_u32 s9, s9, s10
; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s9, v0
; GFX8-NEXT:    s_and_b32 s10, s10, 1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s10, v1
; GFX8-NEXT:    s_mul_i32 s9, s2, s4
; GFX8-NEXT:    s_mul_i32 s10, s1, s5
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    s_add_u32 s9, s9, s10
; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT:    s_mul_i32 s11, s0, s6
; GFX8-NEXT:    s_and_b32 s10, s10, 1
; GFX8-NEXT:    s_add_u32 s9, s9, s11
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s9, v2
; GFX8-NEXT:    s_and_b32 s11, s11, 1
; GFX8-NEXT:    s_add_i32 s10, s10, s11
; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s10, v5
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s6
; GFX8-NEXT:    s_mul_i32 s5, s2, s5
; GFX8-NEXT:    s_mul_i32 s3, s3, s4
; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s4
; GFX8-NEXT:    s_mul_i32 s8, s0, s4
; GFX8-NEXT:    s_mul_i32 s9, s1, s6
; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT:    s_mul_i32 s7, s0, s7
; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v5
; GFX8-NEXT:    s_add_i32 s0, s3, s5
; GFX8-NEXT:    s_add_i32 s0, s0, s9
; GFX8-NEXT:    s_add_i32 s0, s0, s7
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
; GFX8-NEXT:    s_mov_b32 s0, s8
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s9, s1, s4
; GFX9-NEXT:    s_mul_i32 s10, s0, s5
; GFX9-NEXT:    s_add_u32 s9, s9, s10
; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s4
; GFX9-NEXT:    s_and_b32 s10, s10, 1
; GFX9-NEXT:    s_add_u32 s9, s9, s11
; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
; GFX9-NEXT:    s_and_b32 s11, s11, 1
; GFX9-NEXT:    s_add_i32 s10, s10, s11
; GFX9-NEXT:    s_mul_i32 s11, s2, s4
; GFX9-NEXT:    s_mul_i32 s12, s1, s5
; GFX9-NEXT:    s_add_u32 s11, s11, s12
; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
; GFX9-NEXT:    s_mul_i32 s13, s0, s6
; GFX9-NEXT:    s_and_b32 s12, s12, 1
; GFX9-NEXT:    s_add_u32 s11, s11, s13
; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
; GFX9-NEXT:    s_and_b32 s13, s13, 1
; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT:    s_add_i32 s12, s12, s13
; GFX9-NEXT:    s_add_u32 s11, s11, s14
; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
; GFX9-NEXT:    s_and_b32 s13, s13, 1
; GFX9-NEXT:    s_mul_hi_u32 s15, s0, s5
; GFX9-NEXT:    s_add_i32 s12, s12, s13
; GFX9-NEXT:    s_add_u32 s11, s11, s15
; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
; GFX9-NEXT:    s_and_b32 s13, s13, 1
; GFX9-NEXT:    s_add_i32 s12, s12, s13
; GFX9-NEXT:    s_add_u32 s10, s11, s10
; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
; GFX9-NEXT:    s_and_b32 s11, s11, 1
; GFX9-NEXT:    s_add_i32 s12, s12, s11
; GFX9-NEXT:    s_mul_i32 s11, s2, s5
; GFX9-NEXT:    s_mul_i32 s3, s3, s4
; GFX9-NEXT:    s_mul_i32 s13, s1, s6
; GFX9-NEXT:    s_add_i32 s3, s3, s11
; GFX9-NEXT:    s_mul_i32 s7, s0, s7
; GFX9-NEXT:    s_add_i32 s3, s3, s13
; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s4
; GFX9-NEXT:    s_add_i32 s3, s3, s7
; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s5
; GFX9-NEXT:    s_add_i32 s2, s3, s2
; GFX9-NEXT:    s_mul_i32 s8, s0, s4
; GFX9-NEXT:    s_add_i32 s1, s2, s1
; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s6
; GFX9-NEXT:    s_add_i32 s0, s1, s0
; GFX9-NEXT:    s_add_i32 s3, s0, s12
; GFX9-NEXT:    s_mov_b32 s0, s8
; GFX9-NEXT:    s_mov_b32 s1, s9
; GFX9-NEXT:    s_mov_b32 s2, s10
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i128 %num, %den
  %cast = bitcast i128 %result to <4 x i32>
  ret <4 x i32> %cast
}

; i128 multiply with the default calling convention, returned as i128 directly
; (operands in v[0:3] and v[4:7]).
; All targets' checks expect the partial products from v_mul_lo_u32 and
; v_mul_hi_u32, with the carries out of the second and third words captured
; from vcc via v_cndmask_b32. The gfx700 and gfx801 checks sum carries and the
; top word with chains of two-operand adds; the gfx900 checks use v_add_co_u32
; where the carry is consumed and v_add_u32 / v_add3_u32 elsewhere.
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT:    v_mul_lo_u32 v10, v0, v5
; GFX7-NEXT:    v_mul_hi_u32 v11, v0, v4
; GFX7-NEXT:    v_mul_lo_u32 v12, v1, v5
; GFX7-NEXT:    v_mul_lo_u32 v13, v0, v6
; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT:    v_mul_lo_u32 v11, v2, v4
; GFX7-NEXT:    v_mul_hi_u32 v14, v1, v4
; GFX7-NEXT:    v_mul_hi_u32 v15, v0, v5
; GFX7-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
; GFX7-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
; GFX7-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
; GFX7-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
; GFX7-NEXT:    v_mul_lo_u32 v12, v2, v5
; GFX7-NEXT:    v_mul_lo_u32 v13, v1, v6
; GFX7-NEXT:    v_mul_lo_u32 v7, v0, v7
; GFX7-NEXT:    v_mul_hi_u32 v2, v2, v4
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
; GFX7-NEXT:    v_mul_hi_u32 v1, v1, v5
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
; GFX7-NEXT:    v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v6
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v0, v11
; GFX7-NEXT:    v_mov_b32_e32 v0, v8
; GFX7-NEXT:    v_mov_b32_e32 v1, v9
; GFX7-NEXT:    v_mov_b32_e32 v2, v10
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT:    v_mul_lo_u32 v10, v0, v5
; GFX8-NEXT:    v_mul_hi_u32 v11, v0, v4
; GFX8-NEXT:    v_mul_lo_u32 v12, v1, v5
; GFX8-NEXT:    v_mul_lo_u32 v13, v0, v6
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v10
; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v11
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT:    v_mul_lo_u32 v11, v2, v4
; GFX8-NEXT:    v_mul_hi_u32 v14, v1, v4
; GFX8-NEXT:    v_mul_hi_u32 v15, v0, v5
; GFX8-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v12
; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v13
; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v14
; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v15
; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v12, v11
; GFX8-NEXT:    v_mul_lo_u32 v12, v2, v5
; GFX8-NEXT:    v_mul_lo_u32 v13, v1, v6
; GFX8-NEXT:    v_mul_lo_u32 v7, v0, v7
; GFX8-NEXT:    v_mul_hi_u32 v2, v2, v4
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v12
; GFX8-NEXT:    v_mul_hi_u32 v1, v1, v5
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v13
; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v6
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v0, v11
; GFX8-NEXT:    v_mov_b32_e32 v0, v8
; GFX8-NEXT:    v_mov_b32_e32 v1, v9
; GFX8-NEXT:    v_mov_b32_e32 v2, v10
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v9, v1, v4
; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v4
; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v5
; GFX9-NEXT:    v_mul_lo_u32 v13, v0, v6
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v10
; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v11
; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT:    v_add_u32_e32 v10, v10, v11
; GFX9-NEXT:    v_mul_lo_u32 v11, v2, v4
; GFX9-NEXT:    v_mul_hi_u32 v14, v1, v4
; GFX9-NEXT:    v_mul_hi_u32 v15, v0, v5
; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v13
; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v14
; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v15
; GFX9-NEXT:    v_add3_u32 v12, v12, v13, v14
; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v11, v12, v13, v11
; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v5
; GFX9-NEXT:    v_mul_lo_u32 v13, v1, v6
; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v7
; GFX9-NEXT:    v_mul_hi_u32 v2, v2, v4
; GFX9-NEXT:    v_mul_hi_u32 v1, v1, v5
; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v6
; GFX9-NEXT:    v_add_u32_e32 v3, v3, v12
; GFX9-NEXT:    v_add3_u32 v3, v3, v13, v7
; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
; GFX9-NEXT:    v_add3_u32 v3, v1, v0, v11
; GFX9-NEXT:    v_mov_b32_e32 v0, v8
; GFX9-NEXT:    v_mov_b32_e32 v1, v9
; GFX9-NEXT:    v_mov_b32_e32 v2, v10
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i128 %num, %den
  ret i128 %result
}

define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s8
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s17, s1, s8
; GFX7-NEXT:    s_mul_i32 s18, s0, s9
; GFX7-NEXT:    s_add_u32 s17, s17, s18
; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s17, v0
; GFX7-NEXT:    s_and_b32 s18, s18, 1
; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s18, v1
; GFX7-NEXT:    s_mul_i32 s17, s2, s8
; GFX7-NEXT:    s_mul_i32 s18, s1, s9
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    s_add_u32 s17, s17, s18
; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s8
; GFX7-NEXT:    s_mul_i32 s19, s0, s10
; GFX7-NEXT:    s_and_b32 s18, s18, 1
; GFX7-NEXT:    s_add_u32 s17, s17, s19
; GFX7-NEXT:    v_mov_b32_e32 v3, s9
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s17, v2
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s18, v5
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT:    s_mul_i32 s17, s3, s8
; GFX7-NEXT:    s_mul_i32 s18, s2, s9
; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT:    s_add_u32 s17, s17, s18
; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT:    s_mul_i32 s19, s1, s10
; GFX7-NEXT:    s_and_b32 s18, s18, 1
; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT:    s_add_u32 s17, s17, s19
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT:    v_mov_b32_e32 v4, s2
; GFX7-NEXT:    v_mul_hi_u32 v5, v4, s8
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    s_mul_i32 s20, s0, s11
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    s_add_u32 s17, s17, s20
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s17, v5
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_mov_b32_e32 v6, s10
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s18, v8
; GFX7-NEXT:    v_mul_hi_u32 v7, s0, v6
; GFX7-NEXT:    s_mul_i32 s17, s4, s8
; GFX7-NEXT:    s_mul_i32 s18, s3, s9
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT:    s_add_u32 s17, s17, s18
; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
; GFX7-NEXT:    s_mul_i32 s19, s2, s10
; GFX7-NEXT:    s_and_b32 s18, s18, 1
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT:    s_add_u32 s17, s17, s19
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT:    s_mul_i32 s20, s1, s11
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX7-NEXT:    s_add_u32 s17, s17, s20
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT:    v_mov_b32_e32 v5, s3
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_mul_hi_u32 v7, v5, s8
; GFX7-NEXT:    s_mul_i32 s21, s0, s12
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    s_add_u32 s17, s17, s21
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, s17, v7
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s9
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, s18, v11
; GFX7-NEXT:    s_mul_i32 s17, s5, s8
; GFX7-NEXT:    s_mul_i32 s18, s4, s9
; GFX7-NEXT:    s_add_u32 s17, s17, s18
; GFX7-NEXT:    v_mul_hi_u32 v8, s1, v6
; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT:    v_mov_b32_e32 v9, s11
; GFX7-NEXT:    s_mul_i32 s19, s3, s10
; GFX7-NEXT:    s_and_b32 s18, s18, 1
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
; GFX7-NEXT:    s_add_u32 s17, s17, s19
; GFX7-NEXT:    v_mul_hi_u32 v10, s0, v9
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT:    s_mul_i32 s20, s2, s11
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
; GFX7-NEXT:    s_add_u32 s17, s17, s20
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT:    s_mul_i32 s21, s1, s12
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT:    s_add_u32 s17, s17, s21
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT:    v_mov_b32_e32 v7, s4
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_mul_hi_u32 v8, v7, s8
; GFX7-NEXT:    s_mul_i32 s22, s0, s13
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    s_add_u32 s17, s17, s22
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s17, v8
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_mul_hi_u32 v10, v5, s9
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v14, vcc, s18, v14
; GFX7-NEXT:    s_mul_i32 s17, s6, s8
; GFX7-NEXT:    s_mul_i32 s18, s5, s9
; GFX7-NEXT:    s_add_u32 s17, s17, s18
; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v6, s2, v6
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT:    s_mul_i32 s19, s4, s10
; GFX7-NEXT:    s_and_b32 s18, s18, 1
; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT:    s_add_u32 s17, s17, s19
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
; GFX7-NEXT:    v_mul_hi_u32 v11, s1, v9
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT:    v_mov_b32_e32 v12, s12
; GFX7-NEXT:    s_mul_i32 s20, s3, s11
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT:    s_add_u32 s17, s17, s20
; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v12
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT:    s_mul_i32 s21, s2, s12
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
; GFX7-NEXT:    s_add_u32 s17, s17, s21
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
; GFX7-NEXT:    s_mul_i32 s22, s1, s13
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT:    s_add_u32 s17, s17, s22
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT:    v_mov_b32_e32 v8, s5
; GFX7-NEXT:    v_mul_hi_u32 v10, v8, s8
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    s_mul_i32 s23, s0, s14
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    s_add_u32 s17, s17, s23
; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
; GFX7-NEXT:    v_mul_hi_u32 v11, v7, s9
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s17, v10
; GFX7-NEXT:    s_and_b32 s19, s19, 1
; GFX7-NEXT:    s_add_i32 s18, s18, s19
; GFX7-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v17, vcc, s18, v17
; GFX7-NEXT:    v_mul_hi_u32 v5, v5, s10
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_mul_hi_u32 v13, s2, v9
; GFX7-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT:    v_mul_hi_u32 v14, s1, v12
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_mov_b32_e32 v15, s13
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT:    v_mul_hi_u32 v16, s0, v15
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT:    v_mov_b32_e32 v13, s14
; GFX7-NEXT:    s_mul_i32 s7, s7, s8
; GFX7-NEXT:    s_mul_i32 s17, s6, s9
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
; GFX7-NEXT:    s_mul_i32 s16, s0, s8
; GFX7-NEXT:    s_mul_i32 s5, s5, s10
; GFX7-NEXT:    s_mul_i32 s15, s0, s15
; GFX7-NEXT:    v_mul_hi_u32 v13, s0, v13
; GFX7-NEXT:    s_add_i32 s0, s7, s17
; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT:    s_mul_i32 s4, s4, s11
; GFX7-NEXT:    s_add_i32 s0, s0, s5
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
; GFX7-NEXT:    v_mov_b32_e32 v10, s6
; GFX7-NEXT:    s_mul_i32 s11, s3, s12
; GFX7-NEXT:    s_add_i32 s0, s0, s4
; GFX7-NEXT:    s_mul_i32 s12, s2, s13
; GFX7-NEXT:    s_add_i32 s0, s0, s11
; GFX7-NEXT:    v_mul_hi_u32 v10, v10, s8
; GFX7-NEXT:    s_mul_i32 s13, s1, s14
; GFX7-NEXT:    s_add_i32 s0, s0, s12
; GFX7-NEXT:    v_mul_hi_u32 v8, v8, s9
; GFX7-NEXT:    s_add_i32 s0, s0, s13
; GFX7-NEXT:    v_mul_hi_u32 v7, v7, s10
; GFX7-NEXT:    v_mul_hi_u32 v9, s3, v9
; GFX7-NEXT:    s_add_i32 s0, s0, s15
; GFX7-NEXT:    v_mul_hi_u32 v11, s2, v12
; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s0, v10
; GFX7-NEXT:    v_mul_hi_u32 v12, s1, v15
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
; GFX7-NEXT:    v_readfirstlane_b32 s3, v2
; GFX7-NEXT:    v_readfirstlane_b32 s4, v3
; GFX7-NEXT:    v_readfirstlane_b32 s5, v4
; GFX7-NEXT:    v_readfirstlane_b32 s6, v5
; GFX7-NEXT:    v_readfirstlane_b32 s7, v6
; GFX7-NEXT:    s_mov_b32 s0, s16
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s8
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s17, s1, s8
; GFX8-NEXT:    s_mul_i32 s18, s0, s9
; GFX8-NEXT:    s_add_u32 s17, s17, s18
; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s17, v0
; GFX8-NEXT:    s_and_b32 s18, s18, 1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s18, v1
; GFX8-NEXT:    s_mul_i32 s17, s2, s8
; GFX8-NEXT:    s_mul_i32 s18, s1, s9
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    s_add_u32 s17, s17, s18
; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s8
; GFX8-NEXT:    s_mul_i32 s19, s0, s10
; GFX8-NEXT:    s_and_b32 s18, s18, 1
; GFX8-NEXT:    s_add_u32 s17, s17, s19
; GFX8-NEXT:    v_mov_b32_e32 v3, s9
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s17, v2
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s18, v5
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT:    s_mul_i32 s17, s3, s8
; GFX8-NEXT:    s_mul_i32 s18, s2, s9
; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT:    s_add_u32 s17, s17, s18
; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    s_mul_i32 s19, s1, s10
; GFX8-NEXT:    s_and_b32 s18, s18, 1
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    s_add_u32 s17, s17, s19
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    v_mul_hi_u32 v5, v4, s8
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    s_mul_i32 s20, s0, s11
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    s_add_u32 s17, s17, s20
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s17, v5
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_mov_b32_e32 v6, s10
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s18, v8
; GFX8-NEXT:    v_mul_hi_u32 v7, s0, v6
; GFX8-NEXT:    s_mul_i32 s17, s4, s8
; GFX8-NEXT:    s_mul_i32 s18, s3, s9
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT:    s_add_u32 s17, s17, s18
; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v8, v5
; GFX8-NEXT:    s_mul_i32 s19, s2, s10
; GFX8-NEXT:    s_and_b32 s18, s18, 1
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT:    s_add_u32 s17, s17, s19
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    s_mul_i32 s20, s1, s11
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT:    s_add_u32 s17, s17, s20
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_mul_hi_u32 v7, v5, s8
; GFX8-NEXT:    s_mul_i32 s21, s0, s12
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    s_add_u32 s17, s17, s21
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s17, v7
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s9
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s18, v11
; GFX8-NEXT:    s_mul_i32 s17, s5, s8
; GFX8-NEXT:    s_mul_i32 s18, s4, s9
; GFX8-NEXT:    s_add_u32 s17, s17, s18
; GFX8-NEXT:    v_mul_hi_u32 v8, s1, v6
; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v9, s11
; GFX8-NEXT:    s_mul_i32 s19, s3, s10
; GFX8-NEXT:    s_and_b32 s18, s18, 1
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v11, v7
; GFX8-NEXT:    s_add_u32 s17, s17, s19
; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v9
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT:    s_mul_i32 s20, s2, s11
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v10
; GFX8-NEXT:    s_add_u32 s17, s17, s20
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT:    s_mul_i32 s21, s1, s12
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT:    s_add_u32 s17, s17, s21
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT:    v_mov_b32_e32 v7, s4
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_mul_hi_u32 v8, v7, s8
; GFX8-NEXT:    s_mul_i32 s22, s0, s13
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    s_add_u32 s17, s17, s22
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s17, v8
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_mul_hi_u32 v10, v5, s9
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s18, v14
; GFX8-NEXT:    s_mul_i32 s17, s6, s8
; GFX8-NEXT:    s_mul_i32 s18, s5, s9
; GFX8-NEXT:    s_add_u32 s17, s17, s18
; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v6, s2, v6
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT:    s_mul_i32 s19, s4, s10
; GFX8-NEXT:    s_and_b32 s18, s18, 1
; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT:    s_add_u32 s17, s17, s19
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
; GFX8-NEXT:    v_mul_hi_u32 v11, s1, v9
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v12, s12
; GFX8-NEXT:    s_mul_i32 s20, s3, s11
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT:    s_add_u32 s17, s17, s20
; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v12
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT:    s_mul_i32 s21, s2, s12
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
; GFX8-NEXT:    s_add_u32 s17, s17, s21
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
; GFX8-NEXT:    s_mul_i32 s22, s1, s13
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT:    s_add_u32 s17, s17, s22
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT:    v_mov_b32_e32 v8, s5
; GFX8-NEXT:    v_mul_hi_u32 v10, v8, s8
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    s_mul_i32 s23, s0, s14
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    s_add_u32 s17, s17, s23
; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
; GFX8-NEXT:    v_mul_hi_u32 v11, v7, s9
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s17, v10
; GFX8-NEXT:    s_and_b32 s19, s19, 1
; GFX8-NEXT:    s_add_i32 s18, s18, s19
; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s18, v17
; GFX8-NEXT:    v_mul_hi_u32 v5, v5, s10
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_mul_hi_u32 v13, s2, v9
; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v17, v11
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v10, v5
; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT:    v_mul_hi_u32 v14, s1, v12
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v13
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v15, s13
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT:    v_mul_hi_u32 v16, s0, v15
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v14
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v16
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT:    v_mov_b32_e32 v13, s14
; GFX8-NEXT:    s_mul_i32 s7, s7, s8
; GFX8-NEXT:    s_mul_i32 s17, s6, s9
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT:    s_mul_i32 s16, s0, s8
; GFX8-NEXT:    s_mul_i32 s5, s5, s10
; GFX8-NEXT:    s_mul_i32 s15, s0, s15
; GFX8-NEXT:    v_mul_hi_u32 v13, s0, v13
; GFX8-NEXT:    s_add_i32 s0, s7, s17
; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT:    s_mul_i32 s4, s4, s11
; GFX8-NEXT:    s_add_i32 s0, s0, s5
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v10, v6
; GFX8-NEXT:    v_mov_b32_e32 v10, s6
; GFX8-NEXT:    s_mul_i32 s11, s3, s12
; GFX8-NEXT:    s_add_i32 s0, s0, s4
; GFX8-NEXT:    s_mul_i32 s12, s2, s13
; GFX8-NEXT:    s_add_i32 s0, s0, s11
; GFX8-NEXT:    v_mul_hi_u32 v10, v10, s8
; GFX8-NEXT:    s_mul_i32 s13, s1, s14
; GFX8-NEXT:    s_add_i32 s0, s0, s12
; GFX8-NEXT:    v_mul_hi_u32 v8, v8, s9
; GFX8-NEXT:    s_add_i32 s0, s0, s13
; GFX8-NEXT:    v_mul_hi_u32 v7, v7, s10
; GFX8-NEXT:    v_mul_hi_u32 v9, s3, v9
; GFX8-NEXT:    s_add_i32 s0, s0, s15
; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v12
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s0, v10
; GFX8-NEXT:    v_mul_hi_u32 v12, s1, v15
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v12
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v13
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
; GFX8-NEXT:    v_readfirstlane_b32 s4, v3
; GFX8-NEXT:    v_readfirstlane_b32 s5, v4
; GFX8-NEXT:    v_readfirstlane_b32 s6, v5
; GFX8-NEXT:    v_readfirstlane_b32 s7, v6
; GFX8-NEXT:    s_mov_b32 s0, s16
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s17, s1, s8
; GFX9-NEXT:    s_mul_i32 s18, s0, s9
; GFX9-NEXT:    s_add_u32 s17, s17, s18
; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s19, s0, s8
; GFX9-NEXT:    s_and_b32 s18, s18, 1
; GFX9-NEXT:    s_add_u32 s17, s17, s19
; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
; GFX9-NEXT:    s_and_b32 s19, s19, 1
; GFX9-NEXT:    s_add_i32 s18, s18, s19
; GFX9-NEXT:    s_mul_i32 s19, s2, s8
; GFX9-NEXT:    s_mul_i32 s20, s1, s9
; GFX9-NEXT:    s_add_u32 s19, s19, s20
; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
; GFX9-NEXT:    s_mul_i32 s21, s0, s10
; GFX9-NEXT:    s_and_b32 s20, s20, 1
; GFX9-NEXT:    s_add_u32 s19, s19, s21
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_and_b32 s21, s21, 1
; GFX9-NEXT:    s_mul_hi_u32 s22, s1, s8
; GFX9-NEXT:    s_add_i32 s20, s20, s21
; GFX9-NEXT:    s_add_u32 s19, s19, s22
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_and_b32 s21, s21, 1
; GFX9-NEXT:    s_mul_hi_u32 s23, s0, s9
; GFX9-NEXT:    s_add_i32 s20, s20, s21
; GFX9-NEXT:    s_add_u32 s19, s19, s23
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_and_b32 s21, s21, 1
; GFX9-NEXT:    s_add_i32 s20, s20, s21
; GFX9-NEXT:    s_add_u32 s18, s19, s18
; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
; GFX9-NEXT:    s_and_b32 s19, s19, 1
; GFX9-NEXT:    s_add_i32 s20, s20, s19
; GFX9-NEXT:    s_mul_i32 s19, s3, s8
; GFX9-NEXT:    s_mul_i32 s21, s2, s9
; GFX9-NEXT:    s_add_u32 s19, s19, s21
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_mul_i32 s22, s1, s10
; GFX9-NEXT:    s_and_b32 s21, s21, 1
; GFX9-NEXT:    s_add_u32 s19, s19, s22
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_mul_i32 s23, s0, s11
; GFX9-NEXT:    s_add_i32 s21, s21, s22
; GFX9-NEXT:    s_add_u32 s19, s19, s23
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_mul_hi_u32 s24, s2, s8
; GFX9-NEXT:    s_add_i32 s21, s21, s22
; GFX9-NEXT:    s_add_u32 s19, s19, s24
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_mul_hi_u32 s25, s1, s9
; GFX9-NEXT:    s_add_i32 s21, s21, s22
; GFX9-NEXT:    s_add_u32 s19, s19, s25
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_mul_hi_u32 s26, s0, s10
; GFX9-NEXT:    s_add_i32 s21, s21, s22
; GFX9-NEXT:    s_add_u32 s19, s19, s26
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_add_i32 s21, s21, s22
; GFX9-NEXT:    s_add_u32 s19, s19, s20
; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
; GFX9-NEXT:    s_and_b32 s20, s20, 1
; GFX9-NEXT:    s_add_i32 s21, s21, s20
; GFX9-NEXT:    s_mul_i32 s20, s4, s8
; GFX9-NEXT:    s_mul_i32 s22, s3, s9
; GFX9-NEXT:    s_add_u32 s20, s20, s22
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_mul_i32 s23, s2, s10
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_add_u32 s20, s20, s23
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_mul_i32 s24, s1, s11
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s24
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_mul_i32 s25, s0, s12
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s25
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_mul_hi_u32 s26, s3, s8
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s26
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_mul_hi_u32 s27, s2, s9
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s27
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_mul_hi_u32 s28, s1, s10
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s28
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_mul_hi_u32 s29, s0, s11
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s29
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_add_i32 s22, s22, s23
; GFX9-NEXT:    s_add_u32 s20, s20, s21
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_and_b32 s21, s21, 1
; GFX9-NEXT:    s_add_i32 s22, s22, s21
; GFX9-NEXT:    s_mul_i32 s21, s5, s8
; GFX9-NEXT:    s_mul_i32 s23, s4, s9
; GFX9-NEXT:    s_add_u32 s21, s21, s23
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_mul_i32 s24, s3, s10
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_add_u32 s21, s21, s24
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_i32 s25, s2, s11
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s25
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_i32 s26, s1, s12
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s26
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_i32 s27, s0, s13
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s27
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_hi_u32 s28, s4, s8
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s28
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s29
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_hi_u32 s30, s2, s10
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s30
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_hi_u32 s31, s1, s11
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s31
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_mul_hi_u32 s33, s0, s12
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s33
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_add_i32 s23, s23, s24
; GFX9-NEXT:    s_add_u32 s21, s21, s22
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_and_b32 s22, s22, 1
; GFX9-NEXT:    s_add_i32 s23, s23, s22
; GFX9-NEXT:    s_mul_i32 s22, s6, s8
; GFX9-NEXT:    s_mul_i32 s24, s5, s9
; GFX9-NEXT:    s_add_u32 s22, s22, s24
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_mul_i32 s25, s4, s10
; GFX9-NEXT:    s_and_b32 s24, s24, 1
; GFX9-NEXT:    s_add_u32 s22, s22, s25
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_i32 s26, s3, s11
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s26
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_i32 s27, s2, s12
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s27
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_i32 s28, s1, s13
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s28
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_i32 s29, s0, s14
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s29
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_hi_u32 s30, s5, s8
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s30
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_hi_u32 s31, s4, s9
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s31
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_hi_u32 s33, s3, s10
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s33
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_hi_u32 s34, s2, s11
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s34
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s12
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s35
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_mul_hi_u32 s36, s0, s13
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s36
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_and_b32 s25, s25, 1
; GFX9-NEXT:    s_add_i32 s24, s24, s25
; GFX9-NEXT:    s_add_u32 s22, s22, s23
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_and_b32 s23, s23, 1
; GFX9-NEXT:    s_add_i32 s24, s24, s23
; GFX9-NEXT:    s_mul_i32 s23, s6, s9
; GFX9-NEXT:    s_mul_i32 s7, s7, s8
; GFX9-NEXT:    s_mul_i32 s25, s5, s10
; GFX9-NEXT:    s_add_i32 s7, s7, s23
; GFX9-NEXT:    s_mul_i32 s26, s4, s11
; GFX9-NEXT:    s_add_i32 s7, s7, s25
; GFX9-NEXT:    s_mul_i32 s27, s3, s12
; GFX9-NEXT:    s_add_i32 s7, s7, s26
; GFX9-NEXT:    s_mul_i32 s28, s2, s13
; GFX9-NEXT:    s_add_i32 s7, s7, s27
; GFX9-NEXT:    s_mul_i32 s29, s1, s14
; GFX9-NEXT:    s_add_i32 s7, s7, s28
; GFX9-NEXT:    s_mul_i32 s15, s0, s15
; GFX9-NEXT:    s_add_i32 s7, s7, s29
; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
; GFX9-NEXT:    s_add_i32 s7, s7, s15
; GFX9-NEXT:    s_mul_hi_u32 s5, s5, s9
; GFX9-NEXT:    s_add_i32 s6, s7, s6
; GFX9-NEXT:    s_add_i32 s5, s6, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s10
; GFX9-NEXT:    s_add_i32 s4, s5, s4
; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s11
; GFX9-NEXT:    s_add_i32 s3, s4, s3
; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s12
; GFX9-NEXT:    s_add_i32 s2, s3, s2
; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s13
; GFX9-NEXT:    s_mul_i32 s16, s0, s8
; GFX9-NEXT:    s_add_i32 s1, s2, s1
; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s14
; GFX9-NEXT:    s_add_i32 s0, s1, s0
; GFX9-NEXT:    s_add_i32 s7, s0, s24
; GFX9-NEXT:    s_mov_b32 s0, s16
; GFX9-NEXT:    s_mov_b32 s1, s17
; GFX9-NEXT:    s_mov_b32 s2, s18
; GFX9-NEXT:    s_mov_b32 s3, s19
; GFX9-NEXT:    s_mov_b32 s4, s20
; GFX9-NEXT:    s_mov_b32 s5, s21
; GFX9-NEXT:    s_mov_b32 s6, s22
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i256 %num, %den
  %cast = bitcast i256 %result to <8 x i32>
  ret <8 x i32> %cast
}

define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mul_lo_u32 v16, v1, v8
; GFX7-NEXT:    v_mul_lo_u32 v17, v0, v9
; GFX7-NEXT:    v_mul_hi_u32 v18, v0, v8
; GFX7-NEXT:    v_mul_lo_u32 v19, v2, v8
; GFX7-NEXT:    v_mul_lo_u32 v20, v1, v9
; GFX7-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
; GFX7-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
; GFX7-NEXT:    v_mul_lo_u32 v18, v0, v10
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT:    v_mul_hi_u32 v21, v1, v8
; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT:    v_mul_hi_u32 v21, v0, v9
; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT:    v_mul_lo_u32 v22, v0, v11
; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
; GFX7-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT:    v_mul_lo_u32 v20, v3, v8
; GFX7-NEXT:    v_mul_lo_u32 v21, v2, v9
; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT:    v_mul_lo_u32 v19, v1, v10
; GFX7-NEXT:    v_mul_lo_u32 v23, v1, v11
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT:    v_mul_hi_u32 v22, v2, v8
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT:    v_mul_hi_u32 v22, v1, v9
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT:    v_mul_lo_u32 v15, v0, v15
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT:    v_mul_hi_u32 v22, v0, v10
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT:    v_mul_lo_u32 v21, v4, v8
; GFX7-NEXT:    v_mul_lo_u32 v22, v3, v9
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT:    v_mul_lo_u32 v20, v2, v10
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT:    v_mul_lo_u32 v23, v0, v12
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v3, v8
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v2, v9
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v1, v10
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v0, v11
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT:    v_mul_lo_u32 v22, v5, v8
; GFX7-NEXT:    v_mul_lo_u32 v23, v4, v9
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT:    v_mul_lo_u32 v21, v3, v10
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v23, v22
; GFX7-NEXT:    v_mul_lo_u32 v23, v2, v11
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_lo_u32 v23, v1, v12
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_lo_u32 v23, v0, v13
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v4, v8
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v3, v9
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v2, v10
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v1, v11
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_hi_u32 v23, v0, v12
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT:    v_mul_lo_u32 v22, v6, v8
; GFX7-NEXT:    v_mul_lo_u32 v23, v5, v9
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_mul_lo_u32 v23, v4, v10
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v24, v23
; GFX7-NEXT:    v_mul_lo_u32 v24, v3, v11
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_lo_u32 v24, v2, v12
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_lo_u32 v24, v1, v13
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_lo_u32 v24, v0, v14
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_hi_u32 v24, v5, v8
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_hi_u32 v24, v4, v9
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_hi_u32 v24, v3, v10
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_hi_u32 v24, v2, v11
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_hi_u32 v24, v1, v12
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_mul_hi_u32 v24, v0, v13
; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v23, v22
; GFX7-NEXT:    v_mul_lo_u32 v22, v0, v8
; GFX7-NEXT:    v_mul_hi_u32 v8, v6, v8
; GFX7-NEXT:    v_mul_lo_u32 v6, v6, v9
; GFX7-NEXT:    v_mul_hi_u32 v9, v5, v9
; GFX7-NEXT:    v_mul_lo_u32 v5, v5, v10
; GFX7-NEXT:    v_mul_hi_u32 v10, v4, v10
; GFX7-NEXT:    v_mul_lo_u32 v4, v4, v11
; GFX7-NEXT:    v_mul_hi_u32 v11, v3, v11
; GFX7-NEXT:    v_mul_lo_u32 v3, v3, v12
; GFX7-NEXT:    v_mul_hi_u32 v12, v2, v12
; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v13
; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
; GFX7-NEXT:    v_mul_hi_u32 v13, v1, v13
; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v14
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, v14
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v0, v23
; GFX7-NEXT:    v_mov_b32_e32 v0, v22
; GFX7-NEXT:    v_mov_b32_e32 v1, v16
; GFX7-NEXT:    v_mov_b32_e32 v2, v17
; GFX7-NEXT:    v_mov_b32_e32 v3, v18
; GFX7-NEXT:    v_mov_b32_e32 v4, v19
; GFX7-NEXT:    v_mov_b32_e32 v5, v20
; GFX7-NEXT:    v_mov_b32_e32 v6, v21
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v16, v1, v8
; GFX8-NEXT:    v_mul_lo_u32 v17, v0, v9
; GFX8-NEXT:    v_mul_hi_u32 v18, v0, v8
; GFX8-NEXT:    v_mul_lo_u32 v19, v2, v8
; GFX8-NEXT:    v_mul_lo_u32 v20, v1, v9
; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v17
; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v16, vcc, v16, v18
; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v17, v18
; GFX8-NEXT:    v_mul_lo_u32 v18, v0, v10
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT:    v_mul_hi_u32 v21, v1, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT:    v_mul_hi_u32 v21, v0, v9
; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT:    v_mul_lo_u32 v22, v0, v11
; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT:    v_add_u32_e32 v17, vcc, v18, v17
; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT:    v_mul_lo_u32 v20, v3, v8
; GFX8-NEXT:    v_mul_lo_u32 v21, v2, v9
; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT:    v_mul_lo_u32 v19, v1, v10
; GFX8-NEXT:    v_mul_lo_u32 v23, v1, v11
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT:    v_mul_hi_u32 v22, v2, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT:    v_mul_hi_u32 v22, v1, v9
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT:    v_mul_lo_u32 v15, v0, v15
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT:    v_mul_hi_u32 v22, v0, v10
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT:    v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT:    v_mul_lo_u32 v21, v4, v8
; GFX8-NEXT:    v_mul_lo_u32 v22, v3, v9
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT:    v_mul_lo_u32 v20, v2, v10
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT:    v_mul_lo_u32 v23, v0, v12
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v3, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v2, v9
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v1, v10
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v0, v11
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT:    v_mul_lo_u32 v22, v5, v8
; GFX8-NEXT:    v_mul_lo_u32 v23, v4, v9
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT:    v_mul_lo_u32 v21, v3, v10
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v23, v22
; GFX8-NEXT:    v_mul_lo_u32 v23, v2, v11
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_lo_u32 v23, v1, v12
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_lo_u32 v23, v0, v13
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v4, v8
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v3, v9
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v2, v10
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v1, v11
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_hi_u32 v23, v0, v12
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT:    v_mul_lo_u32 v22, v6, v8
; GFX8-NEXT:    v_mul_lo_u32 v23, v5, v9
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_mul_lo_u32 v23, v4, v10
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v24, v23
; GFX8-NEXT:    v_mul_lo_u32 v24, v3, v11
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_lo_u32 v24, v2, v12
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_lo_u32 v24, v1, v13
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_lo_u32 v24, v0, v14
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_hi_u32 v24, v5, v8
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_hi_u32 v24, v4, v9
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_hi_u32 v24, v3, v10
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_hi_u32 v24, v2, v11
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_hi_u32 v24, v1, v12
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_mul_hi_u32 v24, v0, v13
; GFX8-NEXT:    v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT:    v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v23, vcc, v23, v22
; GFX8-NEXT:    v_mul_lo_u32 v22, v0, v8
; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
; GFX8-NEXT:    v_mul_lo_u32 v6, v6, v9
; GFX8-NEXT:    v_mul_hi_u32 v9, v5, v9
; GFX8-NEXT:    v_mul_lo_u32 v5, v5, v10
; GFX8-NEXT:    v_mul_hi_u32 v10, v4, v10
; GFX8-NEXT:    v_mul_lo_u32 v4, v4, v11
; GFX8-NEXT:    v_mul_hi_u32 v11, v3, v11
; GFX8-NEXT:    v_mul_lo_u32 v3, v3, v12
; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v12
; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v13
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
; GFX8-NEXT:    v_mul_hi_u32 v13, v1, v13
; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v14
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v15
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v8
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v9
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v10
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, v14
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v11
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v12
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v13
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v0, v23
; GFX8-NEXT:    v_mov_b32_e32 v0, v22
; GFX8-NEXT:    v_mov_b32_e32 v1, v16
; GFX8-NEXT:    v_mov_b32_e32 v2, v17
; GFX8-NEXT:    v_mov_b32_e32 v3, v18
; GFX8-NEXT:    v_mov_b32_e32 v4, v19
; GFX8-NEXT:    v_mov_b32_e32 v5, v20
; GFX8-NEXT:    v_mov_b32_e32 v6, v21
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v16, v2, v8
; GFX9-NEXT:    v_mul_lo_u32 v17, v1, v9
; GFX9-NEXT:    v_mul_lo_u32 v18, v0, v10
; GFX9-NEXT:    v_mul_hi_u32 v19, v1, v8
; GFX9-NEXT:    v_mul_lo_u32 v20, v1, v8
; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v17
; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v18
; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v16, v19
; GFX9-NEXT:    v_mul_lo_u32 v21, v0, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v18, v17, v18, v16
; GFX9-NEXT:    v_mul_hi_u32 v16, v0, v8
; GFX9-NEXT:    v_add_co_u32_e32 v17, vcc, v20, v21
; GFX9-NEXT:    v_mul_hi_u32 v21, v0, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v17, v16
; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT:    v_add_u32_e32 v17, v20, v17
; GFX9-NEXT:    v_mul_lo_u32 v21, v3, v8
; GFX9-NEXT:    v_mul_lo_u32 v22, v2, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v17, vcc, v19, v17
; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v18, v18, v20, v19
; GFX9-NEXT:    v_mul_lo_u32 v19, v1, v10
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v21, v22
; GFX9-NEXT:    v_mul_lo_u32 v22, v0, v11
; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v22
; GFX9-NEXT:    v_mul_hi_u32 v23, v2, v8
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v20, v21, v20, v22
; GFX9-NEXT:    v_mul_hi_u32 v21, v1, v9
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT:    v_mul_hi_u32 v23, v0, v10
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v20, v20, v22, v21
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT:    v_mul_lo_u32 v22, v4, v8
; GFX9-NEXT:    v_mul_lo_u32 v23, v3, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v18, vcc, v19, v18
; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v19, v20, v21, v19
; GFX9-NEXT:    v_mul_lo_u32 v20, v2, v10
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v22, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v1, v11
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v21, v22, v21, v23
; GFX9-NEXT:    v_mul_lo_u32 v22, v0, v12
; GFX9-NEXT:    v_mul_hi_u32 v23, v3, v8
; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT:    v_mul_hi_u32 v22, v2, v9
; GFX9-NEXT:    v_mul_hi_u32 v23, v1, v10
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT:    v_mul_hi_u32 v22, v0, v11
; GFX9-NEXT:    v_mul_lo_u32 v23, v3, v10
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v20, v21, v22, v20
; GFX9-NEXT:    v_mul_lo_u32 v21, v5, v8
; GFX9-NEXT:    v_mul_lo_u32 v22, v4, v9
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v22
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v2, v11
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v1, v12
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v0, v13
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT:    v_mul_hi_u32 v23, v4, v8
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_mul_hi_u32 v23, v3, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT:    v_mul_hi_u32 v23, v2, v10
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_mul_hi_u32 v23, v1, v11
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT:    v_mul_hi_u32 v23, v0, v12
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v21, v22, v23, v21
; GFX9-NEXT:    v_mul_lo_u32 v22, v6, v8
; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v9
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v4, v10
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v3, v11
; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v23, v24, v25, v23
; GFX9-NEXT:    v_mul_lo_u32 v24, v2, v12
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_mul_lo_u32 v24, v1, v13
; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT:    v_mul_lo_u32 v24, v0, v14
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_mul_hi_u32 v24, v5, v8
; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT:    v_mul_hi_u32 v24, v4, v9
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_mul_hi_u32 v24, v3, v10
; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT:    v_mul_hi_u32 v24, v2, v11
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_mul_hi_u32 v24, v1, v12
; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT:    v_mul_hi_u32 v24, v0, v13
; GFX9-NEXT:    v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v21, vcc, v22, v21
; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v22, v23, v24, v22
; GFX9-NEXT:    v_mul_lo_u32 v23, v6, v9
; GFX9-NEXT:    v_mul_lo_u32 v24, v4, v11
; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v10
; GFX9-NEXT:    v_mul_hi_u32 v6, v6, v8
; GFX9-NEXT:    v_add_u32_e32 v7, v7, v23
; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v10
; GFX9-NEXT:    v_mul_hi_u32 v5, v5, v9
; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v11
; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v12
; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v12
; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v13
; GFX9-NEXT:    v_mul_hi_u32 v11, v1, v13
; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v14
; GFX9-NEXT:    v_mul_lo_u32 v13, v0, v15
; GFX9-NEXT:    v_add3_u32 v7, v7, v23, v24
; GFX9-NEXT:    v_add3_u32 v2, v7, v3, v2
; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v8
; GFX9-NEXT:    v_add3_u32 v2, v2, v12, v13
; GFX9-NEXT:    v_mul_hi_u32 v0, v0, v14
; GFX9-NEXT:    v_add3_u32 v2, v2, v6, v5
; GFX9-NEXT:    v_add3_u32 v2, v2, v4, v9
; GFX9-NEXT:    v_add3_u32 v2, v2, v10, v11
; GFX9-NEXT:    v_add3_u32 v7, v2, v0, v22
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v16
; GFX9-NEXT:    v_mov_b32_e32 v2, v17
; GFX9-NEXT:    v_mov_b32_e32 v3, v18
; GFX9-NEXT:    v_mov_b32_e32 v4, v19
; GFX9-NEXT:    v_mov_b32_e32 v5, v20
; GFX9-NEXT:    v_mov_b32_e32 v6, v21
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i256 %num, %den
  ret i256 %result
}
