......
@@ -60,19 +60,25 @@ apt-get install -y --no-remove \
xserver-xorg-video-amdgpu \
xserver-xorg-video-ati
# We need multiarch for Wine
dpkg --add-architecture i386
# Install a more recent version of Wine than exists in Debian.
apt-key add .gitlab-ci/container/debian/winehq.gpg.key
apt-add-repository https://dl.winehq.org/wine-builds/debian/
-apt update -qyy
+apt-get update -q
# Needed for Valve's tracing jobs to collect information about the graphics
# hardware on the test devices.
pip3 install gfxinfo-mupuf==0.0.9
-apt install -y --no-remove --install-recommends winehq-stable
+# workaround wine needing 32-bit
+# https://bugs.winehq.org/show_bug.cgi?id=53393
+apt-get install -y --no-remove wine-stable-amd64 # a requirement for wine-stable
+WINE_PKG="wine-stable"
+WINE_PKG_DROP="wine-stable-i386"
+apt-get download "${WINE_PKG}"
+dpkg --ignore-depends="${WINE_PKG_DROP}" -i "${WINE_PKG}"*.deb
+rm "${WINE_PKG}"*.deb
+sed -i "/${WINE_PKG_DROP}/d" /var/lib/dpkg/status
+apt-get install -y --no-remove winehq-stable # symlinks-only, depends on wine-stable
############### Install DXVK
......
@@ -83,7 +89,7 @@ apt install -y --no-remove --install-recommends winehq-stable
. .gitlab-ci/container/install-wine-apitrace.sh
# Add the apitrace path to the registry
-wine \
+wine64 \
reg add "HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\Session Manager\Environment" \
/v Path \
/t REG_EXPAND_SZ \
......
......
@@ -13,7 +13,7 @@ Windows Registry Editor Version 5.00
EOF
# Set the wine prefix and disable the crash dialog
-wine regedit crashdialog.reg
+wine64 regedit crashdialog.reg
rm crashdialog.reg
# An immediate wine command may fail with: "${WINEPREFIX}: Not a
......
......
@@ -12,7 +12,7 @@ variables:
DEBIAN_X86_TEST_IMAGE_PATH: "debian/x86_test-gl"
DEBIAN_X86_TEST_GL_TAG: "2022-08-04-deqp-runner"
DEBIAN_X86_TEST_VK_TAG: "2022-08-04-deqp-runner"
DEBIAN_X86_TEST_VK_TAG: "2022-08-15-drop-wine-i386"
FEDORA_X86_BUILD_TAG: "2022-04-24-spirv-tools-5"
KERNEL_ROOTFS_TAG: "2022-08-08-skqp"
......
......
@@ -27,6 +27,8 @@
- results/
exclude:
- results/*.shader_cache
+reports:
+  junit: results/junit.xml
tags:
- $RUNNER_TAG
after_script:
......
......
@@ -48,7 +48,7 @@ sleep 1
# when asked to load PE executables.
# TODO: Have boot2container mount this filesystem for all jobs?
mount -t binfmt_misc none /proc/sys/fs/binfmt_misc
-echo ':DOSWin:M::MZ::/usr/bin/wine:' > /proc/sys/fs/binfmt_misc/register
+echo ':DOSWin:M::MZ::/usr/bin/wine64:' > /proc/sys/fs/binfmt_misc/register
# Set environment for DXVK.
export DXVK_LOG_LEVEL="info"
......
@@ -68,7 +68,7 @@ if [ ${TEST_START_XORG:-0} -eq 1 ]; then
export DISPLAY=:0
fi
-wine --version
+wine64 --version
SANITY_MESA_VERSION_CMD="$SANITY_MESA_VERSION_CMD | tee /tmp/version.txt | grep \"Mesa $MESA_VERSION\(\s\|$\)\""
......
......
@@ -423,35 +423,35 @@ Vulkan 1.1 -- all DONE: anv, lvp, radv, tu, vn
VK_KHR_16bit_storage DONE (anv/gen8+, lvp, radv, tu/a650, v3dv, vn)
VK_KHR_bind_memory2 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_dedicated_allocation DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_descriptor_update_template DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_device_group DONE (lvp, tu, v3dv, vn)
-VK_KHR_device_group_creation DONE (lvp, tu, v3dv, vn)
+VK_KHR_descriptor_update_template DONE (anv, dzn, lvp, radv, tu, v3dv, vn)
+VK_KHR_device_group DONE (anv, lvp, tu, v3dv, vn)
+VK_KHR_device_group_creation DONE (anv, lvp, tu, v3dv, vn)
VK_KHR_external_fence DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_external_fence_capabilities DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_external_memory DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_external_memory_capabilities DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_external_memory DONE (anv, lvp, pvr, radv, tu, v3dv, vn)
+VK_KHR_external_memory_capabilities DONE (anv, lvp, pvr, radv, tu, v3dv, vn)
VK_KHR_external_semaphore DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_external_semaphore_capabilities DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_get_memory_requirements2 DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_get_physical_device_properties2 DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_get_physical_device_properties2 DONE (anv, dzn, lvp, panvk, radv, tu, v3dv, vn)
VK_KHR_maintenance1 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_maintenance2 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_maintenance3 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_multiview DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_relaxed_block_layout DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_sampler_ycbcr_conversion DONE (anv, radv, tu, vn)
-VK_KHR_shader_draw_parameters DONE (anv, lvp, radv, tu, vn)
-VK_KHR_storage_buffer_storage_class DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_variable_pointers DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_shader_draw_parameters DONE (anv, dzn, lvp, radv, tu, vn)
+VK_KHR_storage_buffer_storage_class DONE (anv, lvp, panvk, radv, tu, v3dv, vn)
+VK_KHR_variable_pointers DONE (anv, lvp, panvk, radv, tu, v3dv, vn)
Vulkan 1.2 -- all DONE: anv, vn
VK_KHR_8bit_storage DONE (anv/gen8+, lvp, radv, v3dv, vn)
VK_KHR_buffer_device_address DONE (anv/gen8+, lvp, radv, tu, v3dv, vn)
-VK_KHR_create_renderpass2 DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_depth_stencil_resolve DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_draw_indirect_count DONE (anv, lvp, radv, tu, vn)
-VK_KHR_driver_properties DONE (anv, lvp, radv, v3dv, vn)
+VK_KHR_create_renderpass2 DONE (anv, dzn, lvp, radv, tu, v3dv, vn)
+VK_KHR_depth_stencil_resolve DONE (anv, dzn, lvp, radv, tu, v3dv, vn)
+VK_KHR_draw_indirect_count DONE (anv, dzn, lvp, radv, tu, vn)
+VK_KHR_driver_properties DONE (anv, dzn, lvp, radv, tu, v3dv, vn)
VK_KHR_image_format_list DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_imageless_framebuffer DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_sampler_mirror_clamp_to_edge DONE (anv, lvp, radv, tu, v3dv, vn)
......
@@ -473,111 +473,112 @@ Vulkan 1.2 -- all DONE: anv, vn
Vulkan 1.3 -- all DONE: anv, radv, lvp
-VK_KHR_copy_commands2 DONE (anv, lvp, radv, tu, v3dv)
-VK_KHR_dynamic_rendering DONE (anv, lvp, radv, tu)
-VK_KHR_format_feature_flags2 DONE (anv, radv, tu, v3dv)
-VK_KHR_maintenance4 DONE (anv, radv, tu)
+VK_KHR_copy_commands2 DONE (anv, lvp, panvk, radv, tu, v3dv, vn)
+VK_KHR_dynamic_rendering DONE (anv, dzn, lvp, radv, tu, vn)
+VK_KHR_format_feature_flags2 DONE (anv, lvp, radv, tu, v3dv)
+VK_KHR_maintenance4 DONE (anv, lvp, radv, tu, vn)
VK_KHR_shader_non_semantic_info DONE (anv, radv, tu, v3dv)
-VK_KHR_shader_terminate_invocation DONE (anv, radv, tu)
-VK_KHR_synchronization2 DONE (anv, radv, tu)
-VK_KHR_zero_initialize_workgroup_memory DONE (anv, radv, tu)
-VK_EXT_4444_formats DONE (anv, lvp, radv, tu, v3dv)
-VK_EXT_extended_dynamic_state DONE (anv, lvp, radv, tu)
-VK_EXT_extended_dynamic_state2 DONE (anv, lvp, radv, tu)
-VK_EXT_inline_uniform_block DONE (anv, radv, v3dv)
-VK_EXT_pipeline_creation_cache_control DONE (anv, radv, tu, v3dv)
-VK_EXT_pipeline_creation_feedback DONE (anv, radv, tu, v3dv)
-VK_EXT_private_data DONE (anv, lvp, radv, tu, v3dv)
-VK_EXT_image_robustness DONE (anv, radv, tu)
-VK_EXT_shader_demote_to_helper_invocation DONE (anv, radv, tu)
-VK_EXT_subgroup_size_control DONE (anv, radv, tu)
-VK_EXT_texel_buffer_alignment DONE (anv, radv, tu)
+VK_KHR_shader_terminate_invocation DONE (anv, lvp, radv, tu)
+VK_KHR_synchronization2 DONE (anv, lvp, panvk, radv, tu)
+VK_KHR_zero_initialize_workgroup_memory DONE (anv, lvp, radv, tu)
+VK_EXT_4444_formats DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_EXT_extended_dynamic_state DONE (anv, lvp, radv, tu, vn)
+VK_EXT_extended_dynamic_state2 DONE (anv, lvp, radv, tu, vn)
+VK_EXT_inline_uniform_block DONE (anv, lvp, radv, v3dv, vn)
+VK_EXT_pipeline_creation_cache_control DONE (anv, lvp, radv, tu, v3dv)
+VK_EXT_pipeline_creation_feedback DONE (anv, lvp, radv, tu, v3dv)
+VK_EXT_private_data DONE (anv, lvp, pvr, radv, tu, v3dv)
+VK_EXT_image_robustness DONE (anv, lvp, radv, tu, vn)
+VK_EXT_shader_demote_to_helper_invocation DONE (anv, lvp, radv, tu, vn)
+VK_EXT_subgroup_size_control DONE (anv, lvp, radv, tu)
+VK_EXT_texel_buffer_alignment DONE (anv, lvp, radv, tu)
Khronos extensions that are not part of any Vulkan version:
-VK_KHR_acceleration_structure in progress
+VK_KHR_acceleration_structure DONE (radv/gfx10.3+)
VK_KHR_android_surface not started
VK_KHR_deferred_host_operations DONE (anv, radv)
-VK_KHR_display DONE (anv, lvp, radv, tu, v3dv)
+VK_KHR_display DONE (anv, pvr, radv, tu, v3dv)
VK_KHR_display_swapchain not started
VK_KHR_external_fence_fd DONE (anv, radv, tu, v3dv, vn)
VK_KHR_external_fence_win32 not started
-VK_KHR_external_memory_fd DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_external_memory_fd DONE (anv, lvp, pvr, radv, tu, v3dv, vn)
VK_KHR_external_memory_win32 not started
VK_KHR_external_semaphore_fd DONE (anv, radv, tu, v3dv, vn)
VK_KHR_external_semaphore_win32 not started
-VK_KHR_fragment_shading_rate DONE (radv/gfx10.3+)
-VK_KHR_get_display_properties2 DONE (anv, lvp, radv, tu, v3dv)
+VK_KHR_fragment_shading_rate DONE (anv/gen11+, radv/gfx10.3+)
+VK_KHR_get_display_properties2 DONE (anv, radv, tu, v3dv)
VK_KHR_get_surface_capabilities2 DONE (anv, lvp, radv, tu, v3dv, vn)
VK_KHR_incremental_present DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_performance_query DONE (anv/gen8+, tu, v3dv)
+VK_KHR_performance_query DONE (anv/gen8+, radv/gfx10.3+, tu, v3dv)
VK_KHR_pipeline_executable_properties DONE (anv, radv, tu, v3dv)
VK_KHR_pipeline_library DONE (lvp, radv)
VK_KHR_push_descriptor DONE (anv, lvp, radv, tu)
-VK_KHR_ray_query in progress
+VK_KHR_ray_query DONE (radv/gfx10.3+)
VK_KHR_ray_tracing_maintenance1 DONE (radv/gfx10.3+)
VK_KHR_ray_tracing_pipeline in progress
VK_KHR_shader_clock DONE (anv, lvp, radv)
-VK_KHR_shader_integer_dot_product DONE (anv, radv, tu)
+VK_KHR_shader_integer_dot_product DONE (anv, lvp, radv, tu)
VK_KHR_shader_subgroup_uniform_control_flow DONE (anv, radv)
VK_KHR_shared_presentable_image not started
-VK_KHR_surface DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_surface DONE (anv, dzn, lvp, panvk, pvr, radv, tu, v3dv, vn)
VK_KHR_surface_protected_capabilities DONE (anv, lvp, radv, v3dv, vn)
-VK_KHR_swapchain DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_swapchain DONE (anv, dzn, lvp, panvk, pvr, radv, tu, v3dv, vn)
VK_KHR_swapchain_mutable_format DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_wayland_surface DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_wayland_surface DONE (anv, dzn, lvp, panvk, radv, tu, v3dv, vn)
VK_KHR_workgroup_memory_explicit_layout DONE (anv, radv)
VK_KHR_win32_keyed_mutex not started
-VK_KHR_win32_surface DONE (lvp)
-VK_KHR_xcb_surface DONE (anv, lvp, radv, tu, v3dv, vn)
-VK_KHR_xlib_surface DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_KHR_win32_surface DONE (dzn, lvp)
+VK_KHR_xcb_surface DONE (anv, dzn, lvp, radv, tu, v3dv, vn)
+VK_KHR_xlib_surface DONE (anv, dzn, lvp, radv, tu, v3dv, vn)
VK_EXT_border_color_swizzle DONE (anv, lvp, tu, radv/gfx10+)
-VK_EXT_buffer_device_address DONE (radv)
-VK_EXT_calibrated_timestamps DONE (anv, lvp, radv)
+VK_EXT_buffer_device_address DONE (anv/gen8+, radv)
+VK_EXT_calibrated_timestamps DONE (anv, lvp, radv, vn)
VK_EXT_color_write_enable DONE (anv, lvp, radv, tu, v3dv)
-VK_EXT_conditional_rendering DONE (anv, lvp, radv, tu)
-VK_EXT_conservative_rasterization DONE (anv/gen9+, radv)
-VK_EXT_custom_border_color DONE (anv, lvp, radv, tu, v3dv)
+VK_EXT_conditional_rendering DONE (anv, lvp, radv, tu, vn)
+VK_EXT_conservative_rasterization DONE (anv/gen9+, radv, vn)
+VK_EXT_custom_border_color DONE (anv, lvp, panvk, radv, tu, v3dv, vn)
VK_EXT_debug_marker DONE (radv)
-VK_EXT_depth_clip_enable DONE (anv, lvp, radv, tu)
-VK_EXT_depth_clip_control DONE (lvp, radv, tu)
-VK_EXT_depth_range_unrestricted DONE (radv)
+VK_EXT_depth_clip_enable DONE (anv, lvp, radv, tu, vn)
+VK_EXT_depth_clip_control DONE (anv, lvp, radv, tu)
+VK_EXT_depth_range_unrestricted DONE (radv, lvp)
VK_EXT_discard_rectangles DONE (radv)
-VK_EXT_display_control DONE (anv, tu)
-VK_EXT_external_memory_dma_buf DONE (anv, radv, tu, v3dv, vn)
+VK_EXT_display_control DONE (anv, radv, tu)
+VK_EXT_external_memory_dma_buf DONE (anv, pvr, radv, tu, v3dv, vn)
VK_EXT_external_memory_host DONE (anv, lvp, radv)
VK_EXT_filter_cubic DONE (tu/a650)
VK_EXT_fragment_shader_interlock DONE (anv/gen9+)
VK_EXT_global_priority DONE (anv, radv)
-VK_EXT_global_priority_query DONE (radv)
+VK_EXT_global_priority_query DONE (anv, radv)
VK_EXT_graphics_pipeline_library DONE (lvp)
-VK_EXT_image_2d_view_of_3d DONE (anv, tu, lvp)
+VK_EXT_image_2d_view_of_3d DONE (anv, radv, tu, lvp)
VK_EXT_image_drm_format_modifier DONE (anv, radv/gfx9+, tu, v3dv, vn)
-VK_EXT_image_view_min_lod DONE (radv, tu)
-VK_EXT_index_type_uint8 DONE (anv, lvp, radv/gfx8+, v3dv, tu)
-VK_EXT_line_rasterization DONE (anv, lvp, radv, tu, v3dv)
+VK_EXT_image_view_min_lod DONE (anv, radv, tu, vn)
+VK_EXT_index_type_uint8 DONE (anv, lvp, panvk, radv/gfx8+, v3dv, tu, vn)
+VK_EXT_line_rasterization DONE (anv, lvp, radv, tu, v3dv, vn)
VK_EXT_memory_budget DONE (anv, radv, tu)
VK_EXT_memory_priority DONE (radv)
VK_EXT_multi_draw DONE (anv, lvp, radv)
VK_EXT_multisampled_render_to_single_sampled DONE (lvp)
VK_EXT_non_seamless_cube_map DONE (anv, lvp, radv)
VK_EXT_pci_bus_info DONE (anv, radv)
-VK_EXT_physical_device_drm DONE (anv, radv, tu, v3dv)
+VK_EXT_physical_device_drm DONE (anv, radv, tu, v3dv, vn)
VK_EXT_post_depth_coverage DONE (anv/gfx10+, lvp, radv/gfx10+)
VK_EXT_primitive_topology_list_restart DONE (anv, lvp, radv, tu)
-VK_EXT_primitives_generated_query DONE (lvp, radv, tu)
-VK_EXT_provoking_vertex DONE (anv, lvp, radv, tu, v3dv)
-VK_EXT_queue_family_foreign DONE (anv, radv, vn)
-VK_EXT_robustness2 DONE (anv, lvp, radv, tu)
+VK_EXT_primitives_generated_query DONE (anv, lvp, radv, tu)
+VK_EXT_provoking_vertex DONE (anv, lvp, radv, tu, v3dv, vn)
+VK_EXT_queue_family_foreign DONE (anv, radv, tu, vn)
+VK_EXT_robustness2 DONE (anv, lvp, radv, tu, vn)
VK_EXT_sample_locations DONE (anv, radv/gfx9-, tu/a650)
VK_EXT_shader_atomic_float DONE (anv, radv)
VK_EXT_shader_atomic_float2 DONE (anv/gen9+, radv)
VK_EXT_shader_image_atomic_int64 DONE (radv)
-VK_EXT_shader_stencil_export DONE (anv/gen9+, lvp, radv, tu)
+VK_EXT_shader_stencil_export DONE (anv/gen9+, lvp, radv, tu, vn)
VK_EXT_shader_subgroup_ballot DONE (anv, lvp, radv)
VK_EXT_shader_subgroup_vote DONE (anv, lvp, radv)
VK_EXT_shader_module_identifier DONE (anv, radv, tu)
VK_EXT_transform_feedback DONE (anv, lvp, radv, tu, vn)
-VK_EXT_vertex_attribute_divisor DONE (anv, radv, lvp, tu, v3dv)
+VK_EXT_vertex_attribute_divisor DONE (anv, dzn, panvk, radv, lvp, tu, v3dv, vn)
VK_EXT_vertex_input_dynamic_state DONE (lvp, radv, tu)
VK_EXT_ycbcr_image_arrays DONE (anv, radv)
VK_ANDROID_external_memory_android_hardware_buffer DONE (anv, radv, vn)
......
......
@@ -623,7 +623,12 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
encoding |= vop3.opsel << 11;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.abs[i] << (8 + i);
-if (instr->definitions.size() == 2)
+/* On GFX9 and older, v_cmpx implicitly writes exec besides writing an SGPR pair.
+ * On GFX10 and newer, v_cmpx always writes just exec.
+ */
+if (instr->definitions.size() == 2 && instr->isVOPC())
+assert(ctx.gfx_level <= GFX9 && instr->definitions[1].physReg() == exec);
+else if (instr->definitions.size() == 2)
encoding |= instr->definitions[1].physReg() << 8;
encoding |= (0xFF & instr->definitions[0].physReg());
out.push_back(encoding);
......
@@ -720,7 +725,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
uint32_t encoding = 0;
if (instr->isVOPC()) {
-if (instr->definitions[0].physReg() != vcc) {
+if (instr->definitions[0].physReg() != vcc && instr->definitions[0].physReg() != exec) {
encoding |= instr->definitions[0].physReg() << 8;
encoding |= 1 << 15;
}
......
......
@@ -4825,7 +4825,12 @@ store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmas
 
wrmask = util_widen_mask(wrmask, elem_size_bytes);
 
+const unsigned wrmask_bitcnt = util_bitcount(wrmask);
uint32_t todo = u_bit_consecutive(0, data.bytes());
+if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
+todo = MIN2(todo, wrmask);
while (todo) {
int offset, byte;
if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
......
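The clamp added above relies on a simple identity: `u_bit_consecutive(0, n)` builds a mask of `n` consecutive low bits, so `wrmask` matches `u_bit_consecutive(0, util_bitcount(wrmask))` exactly when it is contiguous from bit 0; only then is clamping `todo` to it safe. A self-contained sketch of that test, with a hypothetical stand-in for Mesa's `u_bit_consecutive` (assumed semantics, not the real util headers; `__builtin_popcount` is the GCC/Clang builtin):

#include <cassert>
#include <cstdint>

// Stand-in for u_bit_consecutive(): a mask of `count` consecutive bits
// starting at `start`.
static uint32_t bit_consecutive(unsigned start, unsigned count) {
    return (count == 32 ? ~0u : (1u << count) - 1u) << start;
}

int main() {
    // Contiguous mask covering bytes 0..5: popcount is 6, and rebuilding a
    // 6-bit mask reproduces it exactly, so the write loop may clamp `todo`.
    uint32_t wrmask = 0b111111;
    assert(bit_consecutive(0, __builtin_popcount(wrmask)) == wrmask);

    // Sparse mask (hole at byte 1): the identity fails, and clamping would
    // wrongly drop bytes, so the code keeps the full `todo` range.
    uint32_t sparse = 0b101;
    assert(bit_consecutive(0, __builtin_popcount(sparse)) != sparse);
    return 0;
}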
......
@@ -296,6 +296,18 @@ instr_writes_exec(Instruction* instr)
return false;
}
+template <typename T, typename U>
+bool
+regs_intersect(const T& a, const U& b)
+{
+const unsigned a_lo = a.physReg();
+const unsigned a_hi = a_lo + a.size();
+const unsigned b_lo = b.physReg();
+const unsigned b_hi = b_lo + b.size();
+return a_hi > b_lo && b_hi > a_lo;
+}
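The helper added above is the usual half-open interval overlap test: two register ranges [lo, lo + size) intersect iff each one starts before the other ends. A standalone model, with `Range` as a hypothetical stand-in for ACO's `Operand`/`Definition` (which expose the range via `physReg()` and `size()`):

#include <cassert>

struct Range { unsigned lo, size; };

// Same predicate as regs_intersect(): overlap of [a.lo, a.lo + a.size)
// and [b.lo, b.lo + b.size).
static bool ranges_intersect(Range a, Range b) {
    return a.lo + a.size > b.lo && b.lo + b.size > a.lo;
}

int main() {
    assert(ranges_intersect({0, 2}, {1, 2}));   // s[0:1] and s[1:2] share s1
    assert(!ranges_intersect({0, 2}, {2, 2}));  // s[0:1] and s[2:3] are disjoint
    return 0;
}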
void
try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const int exec_val_idx,
const int exec_copy_idx)
......
@@ -343,21 +355,23 @@ try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const in
if (exec_val->definitions.size() > 1)
return;
+/* Check if a suitable v_cmpx opcode exists. */
+const aco_opcode v_cmpx_op =
+exec_val->isVOPC() ? get_vcmpx(exec_val->opcode) : aco_opcode::num_opcodes;
+const bool vopc = v_cmpx_op != aco_opcode::num_opcodes;
/* If s_and_saveexec is used, we'll need to insert a new instruction to save the old exec. */
const bool save_original_exec = exec_copy->opcode == aco_opcode::s_and_saveexec_b32 ||
exec_copy->opcode == aco_opcode::s_and_saveexec_b64;
/* Position where the original exec mask copy should be inserted. */
const int save_original_exec_idx = exec_val_idx;
-/* The copy can be removed when it kills its operand. */
-const bool can_remove_copy = exec_copy->operands[0].isKill();
-/* Only use v_cmpx on GFX10+ where it doesn't always clobber the VCC.
- * Also check if a suitable v_cmpx opcode exists.
- */
-const aco_opcode v_cmpx_op =
-exec_val->isVOPC() ? get_vcmpx(exec_val->opcode) : aco_opcode::num_opcodes;
-const bool usable_vcmpx = ctx.program->gfx_level >= GFX10 && v_cmpx_op != aco_opcode::num_opcodes;
-const bool vopc = exec_val->isVOPC() && usable_vcmpx;
+/* The copy can be removed when it kills its operand.
+ * v_cmpx also writes the original destination pre GFX10.
+ */
+const bool can_remove_copy =
+exec_copy->operands[0].isKill() || (vopc && ctx.program->gfx_level < GFX10);
+/* Whether exec_val and exec_copy are adjacent (with p_logical_end in between). */
+const bool val_and_copy_adjacent = exec_val_idx == exec_copy_idx - 2;
+/* Always allow reassigning when the value is written by (usable) VOPC.
+ * Note, VOPC implicitly contains "& exec" because it yields zero on inactive lanes.
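The (truncated) note above is the invariant this optimization leans on: a VOPC compare produces 0 for every inactive lane, so its SGPR result already carries an implicit `& exec`. A lanewise model of that behavior (an illustrative sketch of the hardware semantics, not Mesa code):

#include <cassert>
#include <cstdint>

// Model of a 64-lane VOPC "less than" compare: inactive lanes contribute 0,
// so the result can never have bits set outside `exec`.
static uint64_t vopc_cmp_lt(const float* a, const float* b, uint64_t exec) {
    uint64_t result = 0;
    for (unsigned lane = 0; lane < 64; ++lane)
        if (((exec >> lane) & 1) && a[lane] < b[lane])
            result |= uint64_t(1) << lane;
    return result;
}

int main() {
    float a[64] = {}, b[64];
    for (unsigned i = 0; i < 64; ++i) b[i] = 1.0f; // compare true in every lane
    uint64_t exec = 0x00000000ffffffffull;         // only lanes 0..31 active
    // The compare is true everywhere, yet only active lanes set result bits:
    assert(vopc_cmp_lt(a, b, exec) == exec);
    return 0;
}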
......
@@ -376,17 +390,88 @@ try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const in
const Definition exec_wr_def = exec_val->definitions[0];
const Definition exec_copy_def = exec_copy->definitions[0];
-/* Reassign the instruction to write exec directly. */
-exec_val->definitions[0] = Definition(exec, ctx.program->lane_mask);
+if (!val_and_copy_adjacent) {
+/* When exec_val and exec_copy are non-adjacent, check whether there are any
+ * instructions in between (besides p_logical_end) which may inhibit the optimization.
+ */
+for (int idx = exec_val_idx + 1; idx < exec_copy_idx; ++idx) {
+aco_ptr<Instruction>& instr = block.instructions[idx];
+if (save_original_exec) {
+/* Check if the instruction uses the exec_copy_def register, in which case we can't
+ * optimize. */
+for (const Operand& op : instr->operands)
+if (regs_intersect(exec_copy_def, op))
+return;
+for (const Definition& def : instr->definitions)
+if (regs_intersect(exec_copy_def, def))
+return;
+}
+/* Check if the instruction may implicitly read VCC, e.g. v_cndmask or add with carry.
+ * Some of these may be fine to convert to VOP3 but there are edge cases, e.g. SDWA.
+ * Better leave these instructions alone.
+ */
+if (instr->isVALU() && instr->operands.size() >= 3 && !instr->isVOP3())
+return;
+}
+}
+if (save_original_exec) {
+/* We insert the exec copy before exec_val, so exec_val can't use those registers. */
+for (const Operand& op : exec_val->operands)
+if (regs_intersect(exec_copy_def, op))
+return;
+/* We would write over the saved exec value in this case. */
+if (((vopc && ctx.program->gfx_level < GFX10) || !can_remove_copy) &&
+regs_intersect(exec_copy_def, exec_wr_def))
+return;
+}
+if (vopc) {
+/* Add one extra definition for exec and copy the VOP3-specific fields if present. */
+if (ctx.program->gfx_level < GFX10) {
+if (exec_val->isSDWA() || exec_val->isDPP()) {
+/* This might work but it needs testing and more code to copy the instruction. */
+return;
+} else if (!exec_val->isVOP3()) {
+aco_ptr<Instruction> tmp = std::move(exec_val);
+exec_val.reset(create_instruction<VOPC_instruction>(
+tmp->opcode, tmp->format, tmp->operands.size(), tmp->definitions.size() + 1));
+std::copy(tmp->operands.cbegin(), tmp->operands.cend(), exec_val->operands.begin());
+std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(),
+exec_val->definitions.begin());
+} else {
+aco_ptr<Instruction> tmp = std::move(exec_val);
+exec_val.reset(create_instruction<VOP3_instruction>(
+tmp->opcode, tmp->format, tmp->operands.size(), tmp->definitions.size() + 1));
+std::copy(tmp->operands.cbegin(), tmp->operands.cend(), exec_val->operands.begin());
+std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(),
+exec_val->definitions.begin());
+VOP3_instruction& src = tmp->vop3();
+VOP3_instruction& dst = exec_val->vop3();
+dst.opsel = src.opsel;
+dst.omod = src.omod;
+dst.clamp = src.clamp;
+std::copy(std::cbegin(src.abs), std::cend(src.abs), std::begin(dst.abs));
+std::copy(std::cbegin(src.neg), std::cend(src.neg), std::begin(dst.neg));
+}
+}
+/* Set v_cmpx opcode. */
+exec_val->opcode = v_cmpx_op;
+*exec_val->definitions.rbegin() = Definition(exec, ctx.program->lane_mask);
+/* TODO: change instruction from VOP3 to plain VOPC when possible. */
+} else {
+/* Reassign the instruction to write exec directly. */
+exec_val->definitions[0] = Definition(exec, ctx.program->lane_mask);
+}
-if (exec_val_idx != exec_copy_idx - 2) {
+if (!val_and_copy_adjacent) {
/* If there are other instructions (besides p_logical_end) between
* writing the value and copying it to exec, reassign uses
* of the old definition.
......
@@ -411,8 +496,19 @@ try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const in
exec_copy->operands[0] = Operand(exec, ctx.program->lane_mask);
}
+if (exec_val->opcode == aco_opcode::p_parallelcopy && exec_val->operands[0].isConstant() &&
+exec_val->operands[0].constantValue()) {
+/* Remove the branch instruction when exec is constant non-zero. */
+aco_ptr<Instruction>& branch = block.instructions.back();
+if (branch->isBranch() && branch->operands.size() && branch->operands[0].physReg() == exec)
+block.instructions.back().reset();
+}
if (save_original_exec) {
-/* Insert a new instruction that saves the original exec before it is overwritten. */
+/* Insert a new instruction that saves the original exec before it is overwritten.
+ * Do this last, because inserting in the instructions vector may invalidate the exec_val
+ * reference.
+ */
const auto it = std::next(block.instructions.begin(), save_original_exec_idx);
aco_ptr<Instruction> copy(
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1));
......
@@ -420,14 +516,6 @@ try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const in
copy->operands[0] = Operand(exec, ctx.program->lane_mask);
block.instructions.insert(it, std::move(copy));
}
-if (exec_val->opcode == aco_opcode::p_parallelcopy && exec_val->operands[0].isConstant() &&
-exec_val->operands[0].constantValue()) {
-/* Remove the branch instruction when exec is constant non-zero. */
-aco_ptr<Instruction>& branch = block.instructions.back();
-if (branch->isBranch() && branch->operands.size() && branch->operands[0].physReg() == exec)
-block.instructions.back().reset();
-}
}
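The "Do this last" comment added in this hunk guards against a classic `std::vector` pitfall: `insert()` may reallocate the storage, after which any reference previously taken into `block.instructions` (such as `exec_val`) dangles. A minimal reproduction of the hazard:

#include <vector>

int main() {
    std::vector<int> instructions{10, 20, 30};
    int& exec_val = instructions[2];   // reference into the vector's storage

    // insert() can reallocate; afterwards exec_val may dangle, and reading
    // it is undefined behavior, hence the "do this last" rule in the pass.
    instructions.insert(instructions.begin(), 0);

    // int x = exec_val;               // potential use-after-free
    (void)exec_val;
    return 0;
}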
void
......
@@ -493,7 +581,7 @@ eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
/* For a newly encountered exec write, clear the used flag. */
if (writes_exec) {
-if (!logical_end_found && branch_reads_exec && instr->operands.size() == 1) {
+if (!logical_end_found && branch_reads_exec && instr->operands.size()) {
/* We are in a branch that jumps according to exec.
* We just found the instruction that copies to exec before the branch.
*/
......
@@ -509,13 +597,16 @@ eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
}
exec_write_used = false;
}
-if (branch_exec_tempid && !exec_write_used && instr->definitions.size() &&
-instr->definitions[0].tempId() == branch_exec_tempid) {
+} else if (branch_exec_tempid && instr->definitions.size() &&
+instr->definitions[0].tempId() == branch_exec_tempid) {
/* We just found the instruction that produces the exec mask that is copied. */
assert(branch_exec_val_idx == -1);
branch_exec_val_idx = i;
+} else if (branch_exec_tempid && branch_exec_val_idx == -1 && needs_exec) {
+/* There is an instruction that needs the original exec mask before
+ * branch_exec_val_idx was found, so we can't optimize the branching sequence. */
+branch_exec_copy_idx = -1;
+branch_exec_tempid = 0;
+}
}
/* If the current instruction needs exec, mark it as used. */
......
......
@@ -1673,13 +1673,6 @@ agx_compile_shader_nir(nir_shader *nir,
NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
glsl_get_natural_size_align_bytes);
NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
-if (ctx->stage == MESA_SHADER_VERTEX) {
-/* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
-if (!key->vs.clip_halfz)
-NIR_PASS_V(nir, nir_lower_clip_halfz);
-}
NIR_PASS_V(nir, nir_split_var_copies);
NIR_PASS_V(nir, nir_lower_global_vars_to_local);
NIR_PASS_V(nir, nir_lower_var_copies);
......
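For context on the lowering deleted above: `nir_lower_clip_halfz` rewrites the vertex position so that OpenGL's [-w, w] clip-space z lands in the [0, w] range; the series drops the NIR pass in favor of the hardware's "OpenGL depth clipping" bit added in the XML change below. A sketch of the transform outside NIR, where `Vec4` and the standalone `lower_clip_halfz` are illustrative names (the z' = (z + w) / 2 mapping itself is the standard one):

#include <cassert>

struct Vec4 { float x, y, z, w; };

// Map OpenGL clip z in [-w, w] onto [0, w], leaving x/y/w untouched.
static Vec4 lower_clip_halfz(Vec4 pos) {
    return {pos.x, pos.y, (pos.z + pos.w) * 0.5f, pos.w};
}

int main() {
    assert(lower_clip_halfz({0, 0, -2, 2}).z == 0.0f); // near plane -> 0
    assert(lower_clip_halfz({0, 0, 2, 2}).z == 2.0f);  // far plane  -> w
    return 0;
}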
......
@@ -236,9 +236,6 @@ struct agx_vs_shader_key {
unsigned vbuf_strides[AGX_MAX_VBUFS];
struct agx_attribute attributes[AGX_MAX_ATTRIBS];
-/* Set to true for clip coordinates to range [0, 1] instead of [-1, 1] */
-bool clip_halfz : 1;
};
struct agx_fs_shader_key {
......
......
@@ -581,6 +581,7 @@
<field name="Deflake 3" start="110:0" size="64" type="address"/>
<field name="Unk 112" start="112:0" size="32" default="0x1" type="hex"/>
<field name="Unk 114" start="114:0" size="32" default="0x1c" type="hex"/>
<field name="OpenGL depth clipping" start="116:24" size="1" type="bool"/>
<field name="Unk 118" start="118:0" size="32" default="0xffffffff" type="hex"/>
<field name="Unk 119" start="119:0" size="32" default="0xffffffff" type="hex"/>
<field name="Unk 120" start="120:0" size="32" default="0xffffffff" type="hex"/>
......
......
@@ -324,6 +324,7 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
c->tmu.flush[c->tmu.flush_count].dest = dest;
c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
c->tmu.flush_count++;
+c->tmu.total_count++;
if (c->disable_tmu_pipelining)
ntq_flush_tmu(c);
......
......
@@ -645,6 +645,7 @@ struct v3d_compile {
uint8_t component_mask;
} flush[MAX_TMU_QUEUE_SIZE];
uint32_t flush_count;
+uint32_t total_count;
} tmu;
/**
......
@@ -919,6 +920,7 @@ struct v3d_prog_data {
uint32_t spill_size;
uint32_t tmu_spills;
uint32_t tmu_fills;
+uint32_t tmu_count;
uint32_t qpu_read_stalls;
......
......
@@ -855,6 +855,7 @@ v3d_set_prog_data(struct v3d_compile *c,
prog_data->spill_size = c->spill_size;
prog_data->tmu_spills = c->spills;
prog_data->tmu_fills = c->fills;
+prog_data->tmu_count = c->tmu.total_count;
prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
prog_data->compile_strategy_idx = c->compile_strategy_idx;
prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
......
......
@@ -119,8 +119,8 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
if (!job)
return true;
-v3dv_job_start_frame(job, width, height, max_layer, false,
-1, internal_bpp,
+v3dv_job_start_frame(job, width, height, max_layer,
+false, true, 1, internal_bpp,
image->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
......
......
@@ -407,7 +407,7 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
-v3dv_job_start_frame(job, width, height, num_layers, false,
+v3dv_job_start_frame(job, width, height, num_layers, false, true,
1, internal_bpp, false);
struct v3dv_meta_framebuffer framebuffer;
......
@@ -948,7 +948,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
-v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp,
+v3dv_job_start_frame(job, width, height, num_layers,
+false, true, 1, internal_bpp,
src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
......
@@ -1448,7 +1449,7 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
-v3dv_job_start_frame(job, width, height, num_layers, false,
+v3dv_job_start_frame(job, width, height, num_layers, false, true,
1, internal_bpp, false);
struct v3dv_meta_framebuffer framebuffer;
......
@@ -4288,7 +4289,7 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
(fb_format, region->srcSubresource.aspectMask,
&internal_type, &internal_bpp);
-v3dv_job_start_frame(job, width, height, num_layers, false,
+v3dv_job_start_frame(job, width, height, num_layers, false, true,
1, internal_bpp, true);
struct v3dv_meta_framebuffer framebuffer;
......
......
@@ -341,10 +341,12 @@ subpass_get_granularity(struct v3dv_device *device,
msaa = true;
}
+/* If requested, double-buffer may or may not be enabled depending on
+ * heuristics, so we choose a conservative granularity here, with it disabled.
+ */
uint32_t width, height;
-bool double_buffer = (V3D_DEBUG & V3D_DEBUG_DOUBLE_BUFFER) && !msaa;
v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
-double_buffer, &width, &height);
+false /* double-buffer */, &width, &height);
*granularity = (VkExtent2D) {
.width = width,
.height = height
......
......
@@ -1080,6 +1080,37 @@ struct v3dv_job {
*/
bool uses_buffer_device_address;
+/* True if we have not identified anything that would be incompatible
+ * with double-buffer (like MSAA) or that would make double-buffer mode
+ * not efficient (like tile loads or not having any stores).
+ */
+bool can_use_double_buffer;
+/* This structure keeps track of various scores to inform a heuristic
+ * for double-buffer mode.
+ */
+struct {
+/* Cost of geometry shading */
+uint32_t geom;
+/* Cost of shader rendering */
+uint32_t render;
+} double_buffer_score;
+/* We only need to allocate tile state for all layers if the binner
+ * writes primitives to layers other than the first. This can only be
+ * done using layered rendering (writing gl_Layer from a geometry shader),
+ * so for other cases of multilayered framebuffers (typically with
+ * meta copy/clear operations) that won't use layered rendering, we only
+ * need one layer's worth of tile state for the binner.
+ */
+bool allocate_tile_state_for_all_layers;
+/* A pointer to the location of the TILE_BINNING_MODE_CFG packet so we can
+ * rewrite it to enable double-buffer mode by the time we have enough info
+ * about the job to make that decision.
+ */
+struct v3dv_cl_out *bcl_tile_binning_mode_ptr;
enum v3dv_job_type type;
struct v3dv_device *device;
......
@@ -1188,6 +1219,7 @@ void v3dv_job_start_frame(struct v3dv_job *job,
uint32_t height,
uint32_t layers,
bool allocate_tile_state_for_all_layers,
+bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
bool msaa);
......
@@ -1211,7 +1243,10 @@ v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
void **ptr);
void v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
-bool indexed, bool indirect);
+bool indexed, bool indirect,
+uint32_t vertex_count);
+bool v3dv_job_allocate_tile_state(struct v3dv_job *job);
/* FIXME: only used on v3dv_cmd_buffer and v3dvx_cmd_buffer, perhaps move to a
* cmd_buffer specific header?
......
@@ -1583,6 +1618,16 @@ void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
void v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
struct v3dv_barrier_state *src);
+bool v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state,
+VkImageAspectFlags aspect,
+uint32_t first_subpass_idx,
+VkAttachmentLoadOp load_op);
+bool v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state,
+VkImageAspectFlags aspect,
+uint32_t last_subpass_idx,
+VkAttachmentStoreOp store_op);
struct v3dv_event {
struct vk_object_base base;
int state;
......