Commits (138)
......@@ -412,13 +412,33 @@
- changes:
*gallium_core_file_list
when: on_success
- changes:
- changes: &iris_file_list
- src/gallium/drivers/iris/**/*
- src/gallium/winsys/iris/**/*
- src/intel/**/*
when: on_success
- when: never
# Unfortunately YAML doesn't let us concatenate arrays, so we have to do the
# rules duplication manually
.iris-rules-performance:
stage: intel
rules:
- *ignore_scheduled_pipelines
# Run only on pre-merge pipelines from Marge
- if: '$GITLAB_USER_LOGIN != "marge-bot" || $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME != $CI_COMMIT_REF_NAME'
when: never
- changes:
*mesa_core_file_list
when: manual
- changes:
*gallium_core_file_list
when: manual
- changes:
*iris_file_list
when: manual
- when: never
.anv-rules:
stage: intel
rules:
......
......@@ -3561,11 +3561,6 @@ interpolation should be done at, one of ``TGSI_INTERPOLATE_LOC_*``. Note that
when per-sample shading is enabled, the implementation may choose to
interpolate at the sample irrespective of the Location field.
The CylindricalWrap bitfield specifies which register components
should be subject to cylindrical wrapping when interpolating by the
rasteriser. If TGSI_CYLINDRICAL_WRAP_X is set to 1, the X component
should be interpolated according to cylindrical wrapping rules.
Declaration Sampler View
^^^^^^^^^^^^^^^^^^^^^^^^
......
# ACO crash
dEQP-VK.graphicsfuzz.cov-nested-loops-global-loop-counter-do-while-accumulate-float
# ACO crash
dEQP-VK.ssbo.phys.layout.random.8bit.all_per_block_buffers.46
......@@ -7,3 +7,6 @@ dEQP-VK.wsi.*
# Exclude this test, which times out most of the time.
dEQP-VK.memory.pipeline_barrier.transfer_src_transfer_dst.1048576
# ACO hang
dEQP-VK.memory_model.shared.*
......@@ -1421,13 +1421,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
}
case nir_op_inot: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->dest.dest.ssa.bit_size == 1) {
assert(src.regClass() == bld.lm);
assert(dst.regClass() == bld.lm);
/* Don't use s_andn2 here, this allows the optimizer to make a better decision */
Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
} else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
} else if (dst.regClass() == v2) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
......
......@@ -3448,6 +3448,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
bool
to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
/* Check every operand to make sure they are suitable. */
for (Operand& op : instr->operands) {
if (!op.isTemp())
return false;
if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise())
return false;
}
switch (instr->opcode) {
case aco_opcode::s_and_b32:
case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
......
......@@ -545,6 +545,8 @@ radv_is_storage_image_format_supported(struct radv_physical_device *physical_dev
case V_008F14_IMG_DATA_FORMAT_4_4_4_4:
/* TODO: FMASK formats. */
return true;
case V_008F14_IMG_DATA_FORMAT_5_9_9_9:
return physical_device->rad_info.chip_class >= GFX10_3;
default:
return false;
}
......
......@@ -117,9 +117,6 @@ radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
bool optimize_conservatively, bool allow_copies)
{
bool progress;
unsigned lower_flrp = (shader->options->lower_flrp16 ? 16 : 0) |
(shader->options->lower_flrp32 ? 32 : 0) |
(shader->options->lower_flrp64 ? 64 : 0);
do {
progress = false;
......@@ -162,21 +159,6 @@ radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
NIR_PASS(progress, shader, nir_opt_constant_folding);
NIR_PASS(progress, shader, nir_opt_algebraic);
if (lower_flrp != 0) {
bool lower_flrp_progress = false;
NIR_PASS(lower_flrp_progress, shader, nir_lower_flrp, lower_flrp,
false /* always_precise */);
if (lower_flrp_progress) {
NIR_PASS(progress, shader, nir_opt_constant_folding);
progress = true;
}
/* Nothing should rematerialize any flrps, so we only
* need to do this lowering once.
*/
lower_flrp = 0;
}
NIR_PASS(progress, shader, nir_opt_undef);
NIR_PASS(progress, shader, nir_opt_shrink_vectors,
!device->instance->disable_shrink_image_store);
......@@ -311,8 +293,7 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
def = nir_build_load_global(&b, 1, 64, addr, .access = ACCESS_NON_WRITEABLE,
.align_mul = 8, .align_offset = 0);
} else {
def = nir_vec3(&b, nir_channel(&b, intrin->src[0].ssa, 0),
nir_channel(&b, intrin->src[0].ssa, 1), nir_imm_int(&b, 0));
def = nir_vector_insert_imm(&b, intrin->src[0].ssa, nir_imm_int(&b, 0), 2);
}
break;
case nir_intrinsic_vulkan_resource_index: {
......@@ -323,8 +304,6 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
nir_ssa_def *new_res = nir_vulkan_resource_index(
&b, 3, 32, intrin->src[0].ssa, .desc_set = desc_set, .binding = binding,
.desc_type = nir_intrinsic_desc_type(intrin));
nir_ssa_def *set_ptr = nir_channel(&b, new_res, 0);
nir_ssa_def *binding_ptr = nir_channel(&b, new_res, 1);
nir_ssa_def *stride;
if (desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
......@@ -333,15 +312,14 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
} else {
stride = nir_imm_int(&b, desc_layout->binding[binding].size);
}
def = nir_vec3(&b, set_ptr, binding_ptr, stride);
def = nir_vector_insert_imm(&b, new_res, stride, 2);
break;
}
case nir_intrinsic_vulkan_resource_reindex: {
nir_ssa_def *set_ptr = nir_channel(&b, intrin->src[0].ssa, 0);
nir_ssa_def *binding_ptr = nir_channel(&b, intrin->src[0].ssa, 1);
nir_ssa_def *stride = nir_channel(&b, intrin->src[0].ssa, 2);
binding_ptr = nir_iadd(&b, binding_ptr, nir_imul(&b, intrin->src[1].ssa, stride));
def = nir_vec3(&b, set_ptr, binding_ptr, stride);
def = nir_vector_insert_imm(&b, intrin->src[0].ssa, binding_ptr, 1);
break;
}
case nir_intrinsic_is_sparse_texels_resident:
......@@ -700,6 +678,14 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *
*/
nir_lower_var_copies(nir);
unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
(nir->options->lower_flrp32 ? 32 : 0) |
(nir->options->lower_flrp64 ? 64 : 0);
if (lower_flrp != 0) {
if (nir_lower_flrp(nir, lower_flrp, false /* always_precise */))
NIR_PASS_V(nir, nir_opt_constant_folding);
}
const nir_opt_access_options opt_access_options = {
.is_vulkan = true,
.infer_non_readable = true,
......
......@@ -175,7 +175,7 @@ vir_setup_def_use(struct v3d_compile *c)
}
if (inst->qpu.flags.auf != V3D_QPU_UF_NONE ||
inst->qpu.flags.auf != V3D_QPU_UF_NONE) {
inst->qpu.flags.muf != V3D_QPU_UF_NONE) {
flags_inst = NULL;
}
......
......@@ -1355,12 +1355,18 @@ nir_src_is_dynamically_uniform(nir_src src)
if (src.ssa->parent_instr->type == nir_instr_type_load_const)
return true;
/* As are uniform variables */
if (src.ssa->parent_instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src.ssa->parent_instr);
/* As are uniform variables */
if (intr->intrinsic == nir_intrinsic_load_uniform &&
nir_src_is_dynamically_uniform(intr->src[0]))
return true;
/* Push constant loads always use uniform offsets. */
if (intr->intrinsic == nir_intrinsic_load_push_constant)
return true;
if (intr->intrinsic == nir_intrinsic_load_deref &&
nir_deref_mode_is(nir_src_as_deref(intr->src[0]), nir_var_mem_push_const))
return true;
}
/* Operating together dynamically uniform expressions produces a
......
......@@ -2938,6 +2938,21 @@ nir_block_ends_in_jump(nir_block *block)
nir_block_last_instr(block)->type == nir_instr_type_jump;
}
static inline bool
nir_block_ends_in_return_or_halt(nir_block *block)
{
if (exec_list_is_empty(&block->instr_list))
return false;
nir_instr *instr = nir_block_last_instr(block);
if (instr->type != nir_instr_type_jump)
return false;
nir_jump_instr *jump_instr = nir_instr_as_jump(instr);
return jump_instr->type == nir_jump_return ||
jump_instr->type == nir_jump_halt;
}
static inline bool
nir_block_ends_in_break(nir_block *block)
{
......@@ -3688,6 +3703,12 @@ typedef struct nir_shader_compiler_options {
* for rect texture lowering. */
bool has_txs;
/** Backend supports sdot_4x8 and udot_4x8 opcodes. */
bool has_dot_4x8;
/** Backend supports sudot_4x8 opcodes. */
bool has_sudot_4x8;
/* Whether to generate only scoped_barrier intrinsics instead of the set of
* memory and control barrier intrinsics based on GLSL.
*/
......
......@@ -151,6 +151,14 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options)
nir_alu_instr *alu = nir_instr_as_alu(instr);
const nir_op_info *info = &nir_op_infos[alu->op];
unsigned cost = 1;
if (alu->op == nir_op_flrp) {
if ((options->lower_flrp16 && nir_dest_bit_size(alu->dest.dest) == 16) ||
(options->lower_flrp32 && nir_dest_bit_size(alu->dest.dest) == 32) ||
(options->lower_flrp64 && nir_dest_bit_size(alu->dest.dest) == 64))
cost *= 3;
}
/* Assume everything 16 or 32-bit is cheap.
*
......@@ -159,7 +167,7 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options)
*/
if (nir_dest_bit_size(alu->dest.dest) < 64 &&
nir_src_bit_size(alu->src[0].src) < 64)
return 1;
return cost;
bool is_fp64 = nir_dest_bit_size(alu->dest.dest) == 64 &&
nir_alu_type_get_base_type(info->output_type) == nir_type_float;
......@@ -171,7 +179,6 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options)
if (is_fp64) {
/* If it's something lowered normally, it's expensive. */
unsigned cost = 1;
if (options->lower_doubles_options &
nir_lower_doubles_op_to_options_mask(alu->op))
cost *= 20;
......@@ -188,13 +195,13 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options)
if (alu->op == nir_op_idiv || alu->op == nir_op_udiv ||
alu->op == nir_op_imod || alu->op == nir_op_umod ||
alu->op == nir_op_irem)
return 100;
return cost * 100;
/* Other int64 lowering isn't usually all that expensive */
return 5;
return cost * 5;
}
return 1;
return cost;
}
}
......
......@@ -81,6 +81,23 @@ lower_alu_instr(nir_builder *bld, nir_alu_instr *alu, unsigned bit_size)
lowered_dst = nir_ishr_imm(bld, lowered_dst, dst_bit_size);
} else {
lowered_dst = nir_build_alu_src_arr(bld, op, srcs);
/* The add_sat and sub_sat instructions need to clamp the result to the
* range of the original type.
*/
if (op == nir_op_iadd_sat || op == nir_op_isub_sat) {
const int64_t int_max = u_intN_max(dst_bit_size);
const int64_t int_min = u_intN_min(dst_bit_size);
lowered_dst = nir_iclamp(bld, lowered_dst,
nir_imm_intN_t(bld, int_min, bit_size),
nir_imm_intN_t(bld, int_max, bit_size));
} else if (op == nir_op_uadd_sat || op == nir_op_usub_sat) {
const uint64_t uint_max = u_uintN_max(dst_bit_size);
lowered_dst = nir_umin(bld, lowered_dst,
nir_imm_intN_t(bld, uint_max, bit_size));
}
}
......
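For reference, the clamp performed by this lowering can be modelled by a small stand-alone C sketch, assuming an 8-bit iadd_sat evaluated in 32-bit arithmetic; the function name and the fixed 8/32-bit widths here are illustrative, not part of the pass:

#include <stdint.h>

/* Illustrative only: the same clamp as the nir_iclamp() above, for an
 * 8-bit iadd_sat evaluated in 32-bit arithmetic. The widened add cannot
 * overflow, so clamping back to [INT8_MIN, INT8_MAX] yields the saturated
 * 8-bit result. */
static int8_t iadd_sat8_via_i32(int8_t a, int8_t b)
{
   int32_t sum = (int32_t)a + (int32_t)b;
   if (sum > INT8_MAX)
      sum = INT8_MAX;      /* u_intN_max(8) */
   else if (sum < INT8_MIN)
      sum = INT8_MIN;      /* u_intN_min(8) */
   return (int8_t)sum;
}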
......@@ -786,16 +786,12 @@ build_addr_iadd(nir_builder *b, nir_ssa_def *addr,
case nir_address_format_64bit_bounded_global:
assert(addr->num_components == 4);
assert(addr->bit_size == offset->bit_size);
return nir_vec4(b, nir_channel(b, addr, 0),
nir_channel(b, addr, 1),
nir_channel(b, addr, 2),
nir_iadd(b, nir_channel(b, addr, 3), offset));
return nir_vector_insert_imm(b, addr, nir_iadd(b, nir_channel(b, addr, 3), offset), 3);
case nir_address_format_32bit_index_offset:
assert(addr->num_components == 2);
assert(addr->bit_size == offset->bit_size);
return nir_vec2(b, nir_channel(b, addr, 0),
nir_iadd(b, nir_channel(b, addr, 1), offset));
return nir_vector_insert_imm(b, addr, nir_iadd(b, nir_channel(b, addr, 1), offset), 1);
case nir_address_format_32bit_index_offset_pack64:
assert(addr->num_components == 1);
......@@ -807,8 +803,7 @@ build_addr_iadd(nir_builder *b, nir_ssa_def *addr,
case nir_address_format_vec2_index_32bit_offset:
assert(addr->num_components == 3);
assert(offset->bit_size == 32);
return nir_vec3(b, nir_channel(b, addr, 0), nir_channel(b, addr, 1),
nir_iadd(b, nir_channel(b, addr, 2), offset));
return nir_vector_insert_imm(b, addr, nir_iadd(b, nir_channel(b, addr, 2), offset), 2);
case nir_address_format_62bit_generic:
assert(addr->num_components == 1);
......
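As a rough conceptual model (plain C, not the real NIR builder API) of what nir_vector_insert_imm(b, vec, scalar, comp) produces in the rewrites above, the struct and helper below are purely illustrative: the result is simply the input vector with one component replaced.

#include <stdint.h>

/* Conceptual model only: nir_vector_insert_imm(b, vec, scalar, comp)
 * yields `vec` with component `comp` replaced by `scalar`, which is what
 * the hand-written nir_vec2/nir_vec3/nir_vec4 chains above spelled out. */
typedef struct {
   uint64_t comp[4];
   unsigned num_components;
} fake_vec;

static fake_vec vector_insert_imm(fake_vec vec, uint64_t scalar, unsigned c)
{
   vec.comp[c] = scalar;   /* e.g. c == 3: the offset lane of a
                              64bit_bounded_global address */
   return vec;
}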
......@@ -1314,3 +1314,110 @@ unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
"dst.x = src0.x | ((uint64_t)src0.y << 32);")
unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
"dst.x = src0.x; dst.y = src0.x >> 32;")
# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32. The int8
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors. src2 is added to the result of the dot-product.
opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
False, _2src_commutative, """
const int32_t v0x = (int8_t)(src0 );
const int32_t v0y = (int8_t)(src0 >> 8);
const int32_t v0z = (int8_t)(src0 >> 16);
const int32_t v0w = (int8_t)(src0 >> 24);
const int32_t v1x = (int8_t)(src1 );
const int32_t v1y = (int8_t)(src1 >> 8);
const int32_t v1z = (int8_t)(src1 >> 16);
const int32_t v1w = (int8_t)(src1 >> 24);
dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
# Like sdot_4x8_iadd, but unsigned.
opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
False, _2src_commutative, """
const uint32_t v0x = (uint8_t)(src0 );
const uint32_t v0y = (uint8_t)(src0 >> 8);
const uint32_t v0z = (uint8_t)(src0 >> 16);
const uint32_t v0w = (uint8_t)(src0 >> 24);
const uint32_t v1x = (uint8_t)(src1 );
const uint32_t v1y = (uint8_t)(src1 >> 8);
const uint32_t v1z = (uint8_t)(src1 >> 16);
const uint32_t v1w = (uint8_t)(src1 >> 24);
dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
# src2 is an int32. The 8-bit components are extended to 32-bits, and a
# dot-product is performed on the resulting vectors. src2 is added to the
# result of the dot-product.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative
opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
False, "", """
const int32_t v0x = (int8_t)(src0 );
const int32_t v0y = (int8_t)(src0 >> 8);
const int32_t v0z = (int8_t)(src0 >> 16);
const int32_t v0w = (int8_t)(src0 >> 24);
const uint32_t v1x = (uint8_t)(src1 );
const uint32_t v1y = (uint8_t)(src1 >> 8);
const uint32_t v1z = (uint8_t)(src1 >> 16);
const uint32_t v1w = (uint8_t)(src1 >> 24);
dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
# Like sdot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
False, _2src_commutative, """
const int64_t v0x = (int8_t)(src0 );
const int64_t v0y = (int8_t)(src0 >> 8);
const int64_t v0z = (int8_t)(src0 >> 16);
const int64_t v0w = (int8_t)(src0 >> 24);
const int64_t v1x = (int8_t)(src1 );
const int64_t v1y = (int8_t)(src1 >> 8);
const int64_t v1z = (int8_t)(src1 >> 16);
const int64_t v1w = (int8_t)(src1 >> 24);
const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
False, _2src_commutative, """
const uint64_t v0x = (uint8_t)(src0 );
const uint64_t v0y = (uint8_t)(src0 >> 8);
const uint64_t v0z = (uint8_t)(src0 >> 16);
const uint64_t v0w = (uint8_t)(src0 >> 24);
const uint64_t v1x = (uint8_t)(src1 );
const uint64_t v1y = (uint8_t)(src1 >> 8);
const uint64_t v1z = (uint8_t)(src1 >> 16);
const uint64_t v1w = (uint8_t)(src1 >> 24);
const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
# Like sudot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative
opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
False, "", """
const int64_t v0x = (int8_t)(src0 );
const int64_t v0y = (int8_t)(src0 >> 8);
const int64_t v0z = (int8_t)(src0 >> 16);
const int64_t v0w = (int8_t)(src0 >> 24);
const uint64_t v1x = (uint8_t)(src1 );
const uint64_t v1y = (uint8_t)(src1 >> 8);
const uint64_t v1z = (uint8_t)(src1 >> 16);
const uint64_t v1w = (uint8_t)(src1 >> 24);
const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
......@@ -192,15 +192,80 @@ optimizations = [
# flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
(('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
(('sdot_4x8_iadd', a, 0, b), b),
(('udot_4x8_uadd', a, 0, b), b),
(('sdot_4x8_iadd_sat', a, 0, b), b),
(('udot_4x8_uadd_sat', a, 0, b), b),
# sudot_4x8_iadd is not commutative at all, so the patterns must be
# duplicated with a zero in each of the first two source positions.
(('sudot_4x8_iadd', a, 0, b), b),
(('sudot_4x8_iadd', 0, a, b), b),
(('sudot_4x8_iadd_sat', a, 0, b), b),
(('sudot_4x8_iadd_sat', 0, a, b), b),
(('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
(('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
(('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
# Try to let constant folding eliminate the dot-product part. These are
# safe because the dot product cannot overflow 32 bits.
(('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
(('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
(('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
(('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
(('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
(('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
(('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
(('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_add_sat'),
(('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_add_sat'),
(('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_add_sat'),
]
# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
optimizations.extend([
(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
(('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_dot_4x8'),
(('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
# For the unsigned dot-product, the largest possible value is 4*(255*255) =
# 0x3f804, so we don't have to worry about that intermediate result
# overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant
# less than 0xfffc07fc, the result can never overflow.
(('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
(('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_dot_4x8'),
# For the signed dot-product, the largest positive value is 4*(-128*-128) =
# 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We
# don't have to worry about that intermediate result overflowing or
# underflowing.
(('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
(('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
])
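The 0xfffc07fc bound used by is_ult_0xfffc07fc above can be double-checked with a quick stand-alone C program (illustrative only, not part of the tree):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Largest possible unsigned 4x8 dot product: four lanes of 255 * 255. */
   const uint64_t max_dot = 4u * 255u * 255u;          /* 0x3f804 */
   /* Any accumulator c with c + max_dot < 2^32 can never overflow, i.e.
    * c < 0x100000000 - 0x3f804 = 0xfffc07fc. */
   const uint64_t bound = (1ull << 32) - max_dot;
   assert(max_dot == 0x3f804);
   assert(bound == 0xfffc07fcull);
   assert(0xfffc07fbull + max_dot <= UINT32_MAX);       /* largest safe c */
   return 0;
}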
# Float sizes
for s in [16, 32, 64]:
optimizations.extend([
(('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
(('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
(('~flrp@{}'.format(s), ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
(('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
(('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),
(('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
......@@ -652,10 +717,10 @@ optimizations.extend([
# fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
(('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
# max(-min(b, a), b) -> max(b, -a)
# min(-max(b, a), b) -> min(-b, -a)
(('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', b, ('fneg', a))),
(('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', b), ('fneg', a))),
# max(-min(b, a), b) -> max(abs(b), -a)
# min(-max(b, a), b) -> min(-abs(b), -a)
(('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
(('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),
# If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) =
# fsat(b-a).
......@@ -1362,6 +1427,15 @@ optimizations.extend([
(('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
(('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
(('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
(('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
])
# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
......@@ -2471,6 +2545,11 @@ late_optimizations = [
(('ishr', a, 0), a),
(('ishr', a, -32), a),
(('ushr', a, 0), a),
(('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
(('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
(('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
(('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
]
# A few more extract cases we'd rather leave late
......
......@@ -222,6 +222,74 @@ is_src_scalarizable(nir_src *src)
}
}
static bool
is_binding_dynamically_uniform(nir_src src)
{
nir_binding binding = nir_chase_binding(src);
if (!binding.success)
return false;
for (unsigned i = 0; i < binding.num_indices; i++) {
if (!nir_src_is_dynamically_uniform(binding.indices[i]))
return false;
}
return true;
}
static void
pin_intrinsic(nir_intrinsic_instr *intrin)
{
nir_instr *instr = &intrin->instr;
if (!nir_intrinsic_can_reorder(intrin)) {
instr->pass_flags = GCM_INSTR_PINNED;
return;
}
instr->pass_flags = 0;
/* If the intrinsic requires a uniform source, we can't safely move it across non-uniform
* control flow if it's not uniform at the point it's defined.
* Stores and atomics can never be re-ordered, so we don't have to consider them here.
*/
bool non_uniform = nir_intrinsic_has_access(intrin) &&
(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM);
if (!non_uniform &&
(intrin->intrinsic == nir_intrinsic_load_ubo ||
intrin->intrinsic == nir_intrinsic_load_ssbo ||
intrin->intrinsic == nir_intrinsic_get_ubo_size ||
intrin->intrinsic == nir_intrinsic_get_ssbo_size ||
nir_intrinsic_has_image_dim(intrin) ||
((intrin->intrinsic == nir_intrinsic_load_deref ||
intrin->intrinsic == nir_intrinsic_deref_buffer_array_length) &&
nir_deref_mode_may_be(nir_src_as_deref(intrin->src[0]),
nir_var_mem_ubo | nir_var_mem_ssbo)))) {
if (!is_binding_dynamically_uniform(intrin->src[0]))
instr->pass_flags = GCM_INSTR_PINNED;
} else if (intrin->intrinsic == nir_intrinsic_load_push_constant) {
if (!nir_src_is_dynamically_uniform(intrin->src[0]))
instr->pass_flags = GCM_INSTR_PINNED;
} else if (intrin->intrinsic == nir_intrinsic_load_deref &&
nir_deref_mode_is(nir_src_as_deref(intrin->src[0]),
nir_var_mem_push_const)) {
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
while (deref->deref_type != nir_deref_type_var) {
if ((deref->deref_type == nir_deref_type_array ||
deref->deref_type == nir_deref_type_ptr_as_array) &&
!nir_src_is_dynamically_uniform(deref->arr.index)) {
instr->pass_flags = GCM_INSTR_PINNED;
return;
}
deref = nir_deref_instr_parent(deref);
if (!deref) {
instr->pass_flags = GCM_INSTR_PINNED;
return;
}
}
}
}
/* Walks the instruction list and marks immovable instructions as pinned or
* placed.
*
......@@ -265,24 +333,47 @@ gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
}
break;
case nir_instr_type_tex:
if (nir_tex_instr_has_implicit_derivative(nir_instr_as_tex(instr)))
case nir_instr_type_tex: {
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (nir_tex_instr_has_implicit_derivative(tex))
instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;
for (unsigned i = 0; i < tex->num_srcs; i++) {
nir_tex_src *src = &tex->src[i];
switch (src->src_type) {
case nir_tex_src_texture_deref:
if (!tex->texture_non_uniform && !is_binding_dynamically_uniform(src->src))
instr->pass_flags = GCM_INSTR_PINNED;
break;
case nir_tex_src_sampler_deref:
if (!tex->sampler_non_uniform && !is_binding_dynamically_uniform(src->src))
instr->pass_flags = GCM_INSTR_PINNED;
break;
case nir_tex_src_texture_offset:
case nir_tex_src_texture_handle:
if (!tex->texture_non_uniform && !nir_src_is_dynamically_uniform(src->src))
instr->pass_flags = GCM_INSTR_PINNED;
break;
case nir_tex_src_sampler_offset:
case nir_tex_src_sampler_handle:
if (!tex->sampler_non_uniform && !nir_src_is_dynamically_uniform(src->src))
instr->pass_flags = GCM_INSTR_PINNED;
break;
default:
break;
}
}
break;
}
case nir_instr_type_deref:
case nir_instr_type_load_const:
instr->pass_flags = 0;
break;
case nir_instr_type_intrinsic: {
if (nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr))) {
instr->pass_flags = 0;
} else {
instr->pass_flags = GCM_INSTR_PINNED;
}
case nir_instr_type_intrinsic:
pin_intrinsic(nir_instr_as_intrinsic(instr));
break;
}
case nir_instr_type_jump:
case nir_instr_type_ssa_undef:
......
......@@ -381,6 +381,17 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
if (prev_node->type != nir_cf_node_if)
return false;
nir_block *prev_block = nir_cf_node_as_block(nir_cf_node_prev(prev_node));
/* If the last instruction before this if/else block is a jump, we can't
* append stuff after it because it would break a bunch of assumptions about
* control flow (nir_validate expects the successor of a return/halt jump
* to be the end of the function, which might not match the successor of
* the if/else blocks).
*/
if (nir_block_ends_in_return_or_halt(prev_block))
return false;
nir_if *if_stmt = nir_cf_node_as_if(prev_node);
/* first, try to collapse the if */
......@@ -422,8 +433,6 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
* selects.
*/
nir_block *prev_block = nir_cf_node_as_block(nir_cf_node_prev(prev_node));
/* First, we move the remaining instructions from the blocks to the
* block before. We have already guaranteed that this is safe by
* calling block_check_for_allowed_instrs()
......
......@@ -205,6 +205,27 @@ is_not_const_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
return true;
}
/** Is value unsigned less than 0xfffc07fc? */
static inline bool
is_ult_0xfffc07fc(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
unsigned src, unsigned num_components,
const uint8_t *swizzle)
{
/* only constant srcs: */
if (!nir_src_is_const(instr->src[src].src))
return false;
for (unsigned i = 0; i < num_components; i++) {
const unsigned val =
nir_src_comp_as_uint(instr->src[src].src, swizzle[i]);
if (val >= 0xfffc07fcU)
return false;
}
return true;
}
static inline bool
is_not_const(UNUSED struct hash_table *ht, const nir_alu_instr *instr,
unsigned src, UNUSED unsigned num_components,
......