......@@ -525,7 +525,7 @@ check mr:
script:
- ci-fairy check-merge-request --require-allow-collaboration --junit-xml=check-merge-request.xml
check commits:
.check commits:
extends: .sanity-check
rules:
- if: *is-pre-merge
......
......@@ -13,7 +13,6 @@ dEQP-VK.spirv_assembly.instruction.graphics.opquantize.spec_const_positive_round
dEQP-VK.tessellation.invariance.outer_edge_index_independence.quads_fractional_even_spacing_ccw_point_mode
dEQP-VK.tessellation.invariance.outer_edge_symmetry.triangles_fractional_odd_spacing_cw_point_mode
KHR-GL30.clip_distance.functional
KHR-GL30.transform_feedback.api_errors_test
KHR-GL30.transform_feedback.capture_vertex_interleaved_test
KHR-GL30.transform_feedback.capture_vertex_separate_test
......
......@@ -23,3 +23,5 @@ dEQP-GLES31.functional.primitive_bounding_box.wide_points.global_state.vertex_ge
# This one is really slow and can time out (~56 seconds locally)
KHR-GL33.texture_swizzle.smoke
# Another slow one post NIR transition
KHR-GL33.texture_swizzle.functional
......@@ -236,7 +236,7 @@ traces:
- path: neverball/neverball.trace
expectations:
- device: freedreno-a630
checksum: e67cdf15590f1729201eb82393f5513e
checksum: 3e0a972c2a2180b349cb1c529d3ceca5
- path: pathfinder/canvas_moire.trace
expectations:
- device: freedreno-a630
......
......@@ -27,7 +27,6 @@
lists
bugs
Mesa/DRI Wiki <https://dri.freedesktop.org/>
.. toctree::
:maxdepth: 1
......@@ -77,6 +76,7 @@
dispatch
gallium/index
android
Linux Kernel Drivers <https://www.kernel.org/doc/html/latest/gpu/>
.. toctree::
:maxdepth: 1
......@@ -93,7 +93,6 @@
OpenGL Website <https://www.opengl.org>
DRI Website <https://dri.freedesktop.org>
freedesktop.org <https://www.freedesktop.org>
Developer Blogs <https://planet.freedesktop.org>
.. toctree::
......@@ -101,4 +100,4 @@
:caption: Hosted by:
:hidden:
freedesktop.org <https://planet.freedesktop.org>
freedesktop.org <https://www.freedesktop.org>
......@@ -554,6 +554,9 @@ RADV driver environment variables
``forcecompress``
Enables DCC, FMASK, CMASK and HTILE in situations where the driver supports it
but normally does not deem it beneficial.
``hang``
enable GPU hang detection and dump a report to $HOME/radv_dumps_<pid>
if a GPU hang is detected
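For example, running an application as ``RADV_DEBUG=hang <app>`` saves
the report to that directory and aborts on the first detected hang
(usage sketch; ``<app>`` stands for any Vulkan application).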
``info``
show GPU-related information
``metashaders``
......@@ -624,8 +627,6 @@ RADV driver environment variables
``RADV_TEX_ANISO``
force anisotropy filter (up to 16)
``RADV_TRACE_FILE``
generate cmdbuffer tracefiles when a GPU hang is detected
``ACO_DEBUG``
a comma-separated list of named flags, which do various things:
......
......@@ -11,3 +11,5 @@ VK_KHR_copy_commands2 on RADV
VK_KHR_shader_terminate_invocation on RADV
NGG GS support in ACO
VK_KHR_shader_terminate_invocation on ANV
driconf: add glx_extension_override
driconf: add indirect_gl_extension_override
......@@ -1795,7 +1795,7 @@ struct __DRIimageLookupExtensionRec {
* This extension allows for common DRI2 options
*/
#define __DRI2_CONFIG_QUERY "DRI_CONFIG_QUERY"
#define __DRI2_CONFIG_QUERY_VERSION 1
#define __DRI2_CONFIG_QUERY_VERSION 2
typedef struct __DRI2configQueryExtensionRec __DRI2configQueryExtension;
struct __DRI2configQueryExtensionRec {
......@@ -1804,6 +1804,7 @@ struct __DRI2configQueryExtensionRec {
int (*configQueryb)(__DRIscreen *screen, const char *var, unsigned char *val);
int (*configQueryi)(__DRIscreen *screen, const char *var, int *val);
int (*configQueryf)(__DRIscreen *screen, const char *var, float *val);
int (*configQuerys)(__DRIscreen *screen, const char *var, char **val);
};
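A hedged usage sketch of the new string query (loader side; the helper
name and the option queried are illustrative, and the 0-on-success
return convention is assumed from the sibling configQuery* hooks):
static void
loader_query_string_option(__DRIscreen *screen,
                           const __DRI2configQueryExtension *ext)
{
   char *val = NULL;
   /* configQuerys was added in version 2 of the extension. */
   if (ext->base.version >= 2 &&
       ext->configQuerys(screen, "glx_extension_override", &val) == 0) {
      /* val now points at the option string; ownership rules are not
       * covered by this sketch. */
   }
}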
/**
......
......@@ -31,6 +31,7 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
......@@ -205,7 +206,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
void ac_compute_driver_uuid(char *uuid, size_t size);
void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size);
void ac_print_gpu_info(struct radeon_info *info);
void ac_print_gpu_info(struct radeon_info *info, FILE *f);
int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family);
void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p,
uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p);
......
......@@ -8609,9 +8609,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
has_bias = true;
break;
case nir_tex_src_lod: {
nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
if (val && val->f32 <= 0.0) {
if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
level_zero = true;
} else {
lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
......
......@@ -3776,10 +3776,14 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
break;
}
case nir_intrinsic_set_vertex_and_primitive_count:
/* Currently ignored. */
break;
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
fprintf(stderr, "\n");
abort();
break;
}
if (result) {
......
......@@ -28,6 +28,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <sys/utsname.h>
#include <sys/stat.h>
#include "util/mesa-sha1.h"
#include "sid.h"
......@@ -44,6 +45,8 @@
#define COLOR_YELLOW "\033[1;33m"
#define COLOR_CYAN "\033[1;36m"
#define RADV_DUMP_DIR "radv_dumps"
/* Trace BO layout (offsets are 4 bytes):
*
* [0]: primary trace ID
......@@ -80,19 +83,10 @@ radv_init_trace(struct radv_device *device)
}
static void
radv_dump_trace(struct radv_device *device, struct radeon_cmdbuf *cs)
radv_dump_trace(struct radv_device *device, struct radeon_cmdbuf *cs, FILE *f)
{
const char *filename = getenv("RADV_TRACE_FILE");
FILE *f = fopen(filename, "w");
if (!f) {
fprintf(stderr, "Failed to write trace dump to %s\n", filename);
return;
}
fprintf(f, "Trace ID: %x\n", *device->trace_id_ptr);
device->ws->cs_dump(cs, f, (const int*)device->trace_id_ptr, 2);
fclose(f);
}
static void
......@@ -485,21 +479,25 @@ radv_dump_queue_state(struct radv_queue *queue, FILE *f)
}
static void
radv_dump_dmesg(FILE *f)
radv_dump_cmd(const char *cmd, FILE *f)
{
char line[2000];
char line[2048];
FILE *p;
p = popen("dmesg | tail -n60", "r");
if (!p)
return;
p = popen(cmd, "r");
if (p) {
while (fgets(line, sizeof(line), p))
fputs(line, f);
fprintf(f, "\n");
pclose(p);
}
}
static void
radv_dump_dmesg(FILE *f)
{
fprintf(f, "\nLast 60 lines of dmesg:\n\n");
while (fgets(line, sizeof(line), p))
fputs(line, f);
fprintf(f, "\n");
pclose(p);
radv_dump_cmd("dmesg | tail -n60", f);
}
void
......@@ -550,6 +548,42 @@ radv_dump_device_name(struct radv_device *device, FILE *f)
kernel_version);
}
static void
radv_dump_umr_ring(struct radv_queue *queue, FILE *f)
{
enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index);
struct radv_device *device = queue->device;
char cmd[128];
/* TODO: Dump compute ring. */
if (ring != RING_GFX)
return;
sprintf(cmd, "umr -R %s 2>&1",
device->physical_device->rad_info.chip_class >= GFX10 ? "gfx_0.0.0" : "gfx");
fprintf(f, "\nUMR GFX ring:\n\n");
radv_dump_cmd(cmd, f);
}
static void
radv_dump_umr_waves(struct radv_queue *queue, FILE *f)
{
enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index);
struct radv_device *device = queue->device;
char cmd[128];
/* TODO: Dump compute ring. */
if (ring != RING_GFX)
return;
sprintf(cmd, "umr -O bits,halt_waves -wa %s 2>&1",
device->physical_device->rad_info.chip_class >= GFX10 ? "gfx_0.0.0" : "gfx");
fprintf(f, "\nUMR GFX waves:\n\n");
radv_dump_cmd(cmd, f);
}
static bool
radv_gpu_hang_occured(struct radv_queue *queue, enum ring_type ring)
{
......@@ -565,8 +599,10 @@ void
radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs)
{
struct radv_device *device = queue->device;
char dump_dir[256], dump_path[512];
enum ring_type ring;
uint64_t addr;
FILE *f;
ring = radv_queue_family_to_ring(queue->queue_family_index);
......@@ -578,23 +614,96 @@ radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs)
if (!hang_occurred && !vm_fault_occurred)
return;
radv_dump_trace(queue->device, cs);
fprintf(stderr, "radv: GPU hang detected...\n");
/* Create a directory in $HOME/radv_dumps_<pid> to save various
* debugging info about the GPU hang.
*/
snprintf(dump_dir, sizeof(dump_dir), "%s/"RADV_DUMP_DIR"_%d",
debug_get_option("HOME", "."), getpid());
if (mkdir(dump_dir, 0774) && errno != EEXIST) {
fprintf(stderr, "radv: can't create directory '%s' (%i).\n",
dump_dir, errno);
abort();
}
/* Dump trace file. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "trace.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_trace(queue->device, cs, f);
fclose(f);
}
/* Dump pipeline state. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "pipeline.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_queue_state(queue, f);
fclose(f);
}
fprintf(stderr, "GPU hang report:\n\n");
radv_dump_device_name(device, stderr);
ac_print_gpu_info(&device->physical_device->rad_info);
/* Dump UMR ring. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "umr_ring.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_umr_ring(queue, f);
fclose(f);
}
radv_dump_enabled_options(device, stderr);
radv_dump_dmesg(stderr);
/* Dump UMR waves. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "umr_waves.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_umr_waves(queue, f);
fclose(f);
}
/* Dump debug registers. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "registers.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_debug_registers(device, f);
fclose(f);
}
/* Dump VM fault info. */
if (vm_fault_occurred) {
fprintf(stderr, "VM fault report.\n\n");
fprintf(stderr, "Failing VM page: 0x%08"PRIx64"\n\n", addr);
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "vm_fault.log");
f = fopen(dump_path, "w+");
if (f) {
fprintf(f, "VM fault report.\n\n");
fprintf(f, "Failing VM page: 0x%08"PRIx64"\n\n", addr);
fclose(f);
}
}
/* Dump enabled debug/perftest options. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "options.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_enabled_options(device, f);
fclose(f);
}
radv_dump_debug_registers(device, stderr);
radv_dump_queue_state(queue, stderr);
/* Dump GPU info. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "gpu_info.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_device_name(device, f);
ac_print_gpu_info(&device->physical_device->rad_info, f);
fclose(f);
}
/* Dump dmesg. */
snprintf(dump_path, sizeof(dump_path), "%s/%s", dump_dir, "dmesg.log");
f = fopen(dump_path, "w+");
if (f) {
radv_dump_dmesg(f);
fclose(f);
}
fprintf(stderr, "radv: GPU hang report saved to '%s'!\n", dump_dir);
abort();
}
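The dump sequence above repeats the same snprintf/fopen/fclose pattern
for every log file; a minimal helper sketch that would factor it out
(my suggestion, not part of the patch):
static void
radv_dump_to_file(const char *dump_dir, const char *name,
                  void (*dump_cb)(void *ctx, FILE *f), void *ctx)
{
   char path[512];
   FILE *f;
   snprintf(path, sizeof(path), "%s/%s", dump_dir, name);
   f = fopen(path, "w+");
   if (f) {
      dump_cb(ctx, f);
      fclose(f);
   }
}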
......@@ -602,8 +711,7 @@ void
radv_print_spirv(const char *data, uint32_t size, FILE *fp)
{
char path[] = "/tmp/fileXXXXXX";
char line[2048], command[128];
FILE *p;
char command[128];
int fd;
/* Dump the binary into a temporary file. */
......@@ -614,15 +722,9 @@ radv_print_spirv(const char *data, uint32_t size, FILE *fp)
if (write(fd, data, size) == -1)
goto fail;
sprintf(command, "spirv-dis %s", path);
/* Disassemble using spirv-dis if installed. */
p = popen(command, "r");
if (p) {
while (fgets(line, sizeof(line), p))
fprintf(fp, "%s", line);
pclose(p);
}
sprintf(command, "spirv-dis %s", path);
radv_dump_cmd(command, fp);
fail:
close(fd);
......
......@@ -57,6 +57,7 @@ enum {
RADV_DEBUG_DISCARD_TO_DEMOTE = 1 << 26,
RADV_DEBUG_LLVM = 1 << 27,
RADV_DEBUG_FORCE_COMPRESS = 1 << 28,
RADV_DEBUG_HANG = 1 << 29,
};
enum {
......
......@@ -430,7 +430,7 @@ radv_physical_device_try_create(struct radv_instance *instance,
device->bus_info = *drm_device->businfo.pci;
if ((device->instance->debug_flags & RADV_DEBUG_INFO))
ac_print_gpu_info(&device->rad_info);
ac_print_gpu_info(&device->rad_info, stdout);
/* The WSI is structured as a layer on top of the driver, so this has
* to be the last part of initialization (at least until we get other
......@@ -530,6 +530,7 @@ static const struct debug_control radv_debug_options[] = {
{"discardtodemote", RADV_DEBUG_DISCARD_TO_DEMOTE},
{"llvm", RADV_DEBUG_LLVM},
{"forcecompress", RADV_DEBUG_FORCE_COMPRESS},
{"hang", RADV_DEBUG_HANG},
{NULL, 0}
};
......@@ -2794,19 +2795,25 @@ VkResult radv_CreateDevice(
device->physical_device->rad_info.family == CHIP_HAWAII ? 4096 : 8192;
if (getenv("RADV_TRACE_FILE")) {
const char *filename = getenv("RADV_TRACE_FILE");
fprintf(stderr, "***********************************************************************************\n");
fprintf(stderr, "* WARNING: RADV_TRACE_FILE=<file> is deprecated and replaced by RADV_DEBUG=hang *\n");
fprintf(stderr, "***********************************************************************************\n");
abort();
}
if (device->instance->debug_flags & RADV_DEBUG_HANG) {
/* Enable GPU hang detection and dump logs if a GPU hang is
* detected.
*/
keep_shader_info = true;
if (!radv_init_trace(device))
goto fail;
fprintf(stderr, "*****************************************************************************\n");
fprintf(stderr, "* WARNING: RADV_TRACE_FILE is costly and should only be used for debugging! *\n");
fprintf(stderr, "* WARNING: RADV_DEBUG=hang is costly and should only be used for debugging! *\n");
fprintf(stderr, "*****************************************************************************\n");
fprintf(stderr, "Trace file will be dumped to %s\n", filename);
/* Wait for idle after every draw/dispatch to identify the
* first bad call.
*/
......
......@@ -877,7 +877,7 @@ v3d_field_iterator_next(struct clif_dump *clif, struct v3d_field_iterator *iter)
if (iter->field->minus_one)
value++;
if (strcmp(iter->field->name, "Vec size") == 0 && value == 0)
value = 1 << (e - s);
value = 1 << (e - s + 1);
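/* My reading of the computation (not from the patch): a field spanning
 * bits [s, e] is (e - s + 1) bits wide, so an encoded "Vec size" of 0
 * wraps to the maximum count 1 << (e - s + 1), e.g. 16 for a 4-bit
 * field. */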
snprintf(iter->value, sizeof(iter->value), "%u", value);
enum_name = v3d_get_enum_name(&iter->field->inline_enum, value);
break;
......
......@@ -1595,6 +1595,10 @@ ntq_setup_vs_inputs(struct v3d_compile *c)
* components and unused ones DCEed. The vertex fetcher will load
* from the start of the attribute to the number of components we
* declare we need in c->vattr_sizes[].
*
* BGRA vertex attributes are a bit special: since we implement these
* as RGBA with the R/B components swapped, we always need at least
* 3 components if component 0 is read.
*/
nir_foreach_shader_in_variable(var, c->s) {
/* No VS attribute array support. */
......@@ -1606,6 +1610,16 @@ ntq_setup_vs_inputs(struct v3d_compile *c)
c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
start_component + num_components);
/* Handle BGRA user inputs */
if (start_component == 0 &&
var->data.location >= VERT_ATTRIB_GENERIC0) {
int32_t idx = var->data.location - VERT_ATTRIB_GENERIC0;
if (c->vs_key->va_swap_rb_mask & (1 << idx)) {
c->vattr_sizes[loc] =
MAX2(3, c->vattr_sizes[loc]);
}
}
}
unsigned num_components = 0;
......
......@@ -1181,11 +1181,6 @@ emit_clip_window(struct v3dv_job *job, const VkRect2D *rect)
}
}
/* Checks whether the render area rectangle covers a region that is aligned to
* tile boundaries, which means that for all tiles covered by the render area
* region, there are no uncovered pixels (unless they are also outside the
* framebuffer).
*/
static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
......@@ -1200,24 +1195,11 @@ cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
* always have framebuffer information available.
*/
assert(cmd_buffer->state.framebuffer);
const VkExtent2D fb_extent = {
.width = cmd_buffer->state.framebuffer->width,
.height = cmd_buffer->state.framebuffer->height
};
VkExtent2D granularity;
v3dv_subpass_get_granularity(cmd_buffer->state.pass,
cmd_buffer->state.subpass_idx,
&granularity);
cmd_buffer->state.tile_aligned_render_area =
rect->offset.x % granularity.width == 0 &&
rect->offset.y % granularity.height == 0 &&
(rect->extent.width % granularity.width == 0 ||
rect->offset.x + rect->extent.width >= fb_extent.width) &&
(rect->extent.height % granularity.height == 0 ||
rect->offset.y + rect->extent.height >= fb_extent.height);
v3dv_subpass_area_is_tile_aligned(rect,
cmd_buffer->state.framebuffer,
cmd_buffer->state.pass,
cmd_buffer->state.subpass_idx);
if (!cmd_buffer->state.tile_aligned_render_area) {
perf_debug("Render area for subpass %d of render pass %p doesn't "
......@@ -2023,7 +2005,6 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
assert(state->subpass_idx < state->pass->subpass_count);
const struct v3dv_render_pass *pass = state->pass;
const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
struct v3dv_cl *rcl = &job->rcl;
/* Common config must be the first TILE_RENDERING_MODE_CFG and
......@@ -2031,7 +2012,6 @@ cmd_buffer_emit_render_pass_rcl(struct v3dv_cmd_buffer *cmd_buffer)
* updates to the previous HW state.
*/
const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
config.image_width_pixels = framebuffer->width;
config.image_height_pixels = framebuffer->height;
......
......@@ -1977,6 +1977,8 @@ v3dv_CreateFramebuffer(VkDevice _device,
framebuffer->width = pCreateInfo->width;
framebuffer->height = pCreateInfo->height;
framebuffer->layers = pCreateInfo->layers;
framebuffer->has_edge_padding = true;
framebuffer->attachment_count = pCreateInfo->attachmentCount;
framebuffer->color_attachment_count = 0;
for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
......
......@@ -658,7 +658,6 @@ v3dv_CreateImageView(VkDevice _device,
iview->vk_format = format;
iview->format = v3dv_get_format(format);
assert(iview->format && iview->format->supported);
iview->swap_rb = iview->format->swizzle[0] == PIPE_SWIZZLE_Z;
if (vk_format_is_depth_or_stencil(iview->vk_format)) {
iview->internal_type = v3dv_get_internal_depth_type(iview->vk_format);
......@@ -671,6 +670,7 @@ v3dv_CreateImageView(VkDevice _device,
const uint8_t *format_swizzle = v3dv_get_format_swizzle(format);
util_format_compose_swizzles(format_swizzle, image_view_swizzle,
iview->swizzle);
iview->swap_rb = iview->swizzle[0] == PIPE_SWIZZLE_Z;
pack_texture_shader_state(device, iview);
......
......@@ -61,6 +61,7 @@ v3dv_meta_blit_finish(struct v3dv_device *device)
struct v3dv_meta_blit_pipeline *item = entry->data;
v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc);
vk_free(&device->alloc, item);
}
_mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
......@@ -771,7 +772,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
const VkImageBlit *region,
VkFilter filter);
VkFilter filter,
bool dst_is_padded_image);
/**
* Returns true if the implementation supports the requested operation (even if
......@@ -908,6 +910,12 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
else
buf_height = region->bufferImageHeight;
/* If the image is compressed, the bpp refers to blocks, not pixels */
uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
uint32_t block_height = vk_format_get_blockheight(image->vk_format);
buf_width = buf_width / block_width;
buf_height = buf_height / block_height;
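/* Worked example (my note, assuming a 4x4 block format such as ETC2):
 * a 256x128 pixel buffer region becomes 64x32 blocks, so from here on
 * the copy addresses blocks rather than pixels. */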
/* Compute layers to copy */
uint32_t num_layers;
if (image->type != VK_IMAGE_TYPE_3D)
......@@ -916,9 +924,51 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
num_layers = region->imageExtent.depth;
assert(num_layers > 0);
/* Copy requested layers */
/* Our blit interface can see the real format of the images to detect
* copies between compressed and uncompressed images and adapt the
* blit region accordingly. Here we are just doing a raw copy of
* compressed data, but we are passing an uncompressed view of the
* buffer for the blit destination image (since compressed formats are
* not renderable), so we also want to provide an uncompressed view of
* the source image.
*/
VkResult result;
struct v3dv_device *device = cmd_buffer->device;
VkDevice _device = v3dv_device_to_handle(device);
if (vk_format_is_compressed(image->vk_format)) {
VkImage uiview;
VkImageCreateInfo uiview_info = {
.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
.imageType = VK_IMAGE_TYPE_3D,
.format = dst_format,
.extent = { buf_width, buf_height, image->extent.depth },
.mipLevels = image->levels,
.arrayLayers = image->array_size,
.samples = image->samples,
.tiling = image->tiling,
.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.initialLayout = VK_IMAGE_LAYOUT_GENERAL,
};
result = v3dv_CreateImage(_device, &uiview_info, &device->alloc, &uiview);
if (result != VK_SUCCESS)
return handled;
v3dv_cmd_buffer_add_private_obj(
cmd_buffer, (uintptr_t)uiview,
(v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
result = v3dv_BindImageMemory(_device, uiview,
v3dv_device_memory_to_handle(image->mem),
image->mem_offset);
if (result != VK_SUCCESS)
return handled;
image = v3dv_image_from_handle(uiview);
}
/* Copy requested layers */
for (uint32_t i = 0; i < num_layers; i++) {
/* Create the destination blit image from the destination buffer */
VkImageCreateInfo image_info = {
......@@ -937,7 +987,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
};
VkImage buffer_image;
VkResult result =
result =
v3dv_CreateImage(_device, &image_info, &device->alloc, &buffer_image);
if (result != VK_SUCCESS)
return handled;
......@@ -972,13 +1022,15 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
},
.srcOffsets = {
{
region->imageOffset.x,
region->imageOffset.y,
DIV_ROUND_UP(region->imageOffset.x, block_width),
DIV_ROUND_UP(region->imageOffset.y, block_height),
region->imageOffset.z + i,
},
{
region->imageOffset.x + region->imageExtent.width,
region->imageOffset.y + region->imageExtent.height,
DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
block_width),
DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
block_height),
region->imageOffset.z + i + 1,
},
},
......@@ -990,7 +1042,11 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
},
.dstOffsets = {
{ 0, 0, 0 },
{ region->imageExtent.width, region->imageExtent.height, 1 },
{
DIV_ROUND_UP(region->imageExtent.width, block_width),
DIV_ROUND_UP(region->imageExtent.height, block_height),
1
},
},
};
......@@ -998,7 +1054,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_image_from_handle(buffer_image), dst_format,
image, src_format,
cmask, &cswizzle,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, false);
if (!handled) {
/* This is unexpected, we should have a supported blit spec */
unreachable("Unable to blit buffer to destination image");
......@@ -1454,7 +1510,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
dst, format,
src, format,
0, NULL,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, true);
/* We should have selected formats that we can blit */
assert(handled);
......@@ -2491,8 +2547,18 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
if (image->tiling != VK_IMAGE_TILING_LINEAR) {
src_format = image->vk_format;
dst_format = src_format;
} else {
src_format = VK_FORMAT_R8G8B8A8_UINT;
aspect = VK_IMAGE_ASPECT_COLOR_BIT;
if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
cmask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT;
}
}
dst_format = src_format;
break;
case VK_IMAGE_ASPECT_STENCIL_BIT:
/* Since we don't support separate stencil this is always a stencil
......@@ -2515,8 +2581,8 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
};
break;
case 2:
src_format = (aspect == VK_IMAGE_ASPECT_COLOR_BIT) ?
VK_FORMAT_R16_UINT : image->vk_format;
aspect = VK_IMAGE_ASPECT_COLOR_BIT;
src_format = VK_FORMAT_R16_UINT;
dst_format = src_format;
break;
case 1:
......@@ -2672,18 +2738,20 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
.dstSubresource = {
.aspectMask = aspect,
.mipLevel = region->imageSubresource.mipLevel,
.baseArrayLayer = region->imageSubresource.baseArrayLayer,
.layerCount = region->imageSubresource.layerCount,
.baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
.layerCount = 1,
},
.dstOffsets = {
{
region->imageOffset.x / block_width,
region->imageOffset.y / block_height,
DIV_ROUND_UP(region->imageOffset.x, block_width),
DIV_ROUND_UP(region->imageOffset.y, block_height),
region->imageOffset.z + i,
},
{
(region->imageOffset.x + region->imageExtent.width) / block_width,
(region->imageOffset.y + region->imageExtent.height) / block_height,
DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
block_width),
DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
block_height),
region->imageOffset.z + i + 1,
},
},
......@@ -2693,7 +2761,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
image, dst_format,
v3dv_image_from_handle(buffer_image), src_format,
cmask, NULL,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, true);
if (!handled) {
/* This is unexpected, we should have a supported blit spec */
unreachable("Unable to blit buffer to destination image");
......@@ -3101,20 +3169,15 @@ static bool
create_blit_render_pass(struct v3dv_device *device,
VkFormat dst_format,
VkFormat src_format,
VkRenderPass *pass)
VkRenderPass *pass_load,
VkRenderPass *pass_no_load)
{
const bool is_color_blit = vk_format_is_color(dst_format);
/* FIXME: if blitting to tile boundaries or to the whole image, we could
* use LOAD_DONT_CARE, but then we would have to include that in the
* pipeline hash key. Or maybe we should just create both render passes and
* use one or the other at draw time since they would both be compatible
* with the pipeline anyway
*/
/* Attachment load operation is specified below */
VkAttachmentDescription att = {
.format = dst_format,
.samples = VK_SAMPLE_COUNT_1_BIT,
.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
.storeOp = VK_ATTACHMENT_STORE_OP_STORE,
.initialLayout = VK_IMAGE_LAYOUT_GENERAL,
.finalLayout = VK_IMAGE_LAYOUT_GENERAL,
......@@ -3146,8 +3209,16 @@ create_blit_render_pass(struct v3dv_device *device,
.pDependencies = NULL,
};
VkResult result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
&info, &device->alloc, pass);
VkResult result;
att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
&info, &device->alloc, pass_load);
if (result != VK_SUCCESS)
return false;
att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
&info, &device->alloc, pass_no_load);
return result == VK_SUCCESS;
}
......@@ -3763,10 +3834,14 @@ get_blit_pipeline(struct v3dv_device *device,
goto fail;
ok = create_blit_render_pass(device, dst_format, src_format,
&(*pipeline)->pass);
&(*pipeline)->pass,
&(*pipeline)->pass_no_load);
if (!ok)
goto fail;
/* Create the pipeline using one of the render passes; they are both
* compatible, so we don't care which one we use here.
*/
ok = create_blit_pipeline(device,
dst_format,
src_format,
......@@ -3794,6 +3869,8 @@ fail:
if (*pipeline) {
if ((*pipeline)->pass)
v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
if ((*pipeline)->pass_no_load)
v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->alloc);
if ((*pipeline)->pipeline)
v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
vk_free(&device->alloc, *pipeline);
......@@ -3896,7 +3973,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
VkColorComponentFlags cmask,
VkComponentMapping *cswizzle,
const VkImageBlit *_region,
VkFilter filter)
VkFilter filter,
bool dst_is_padded_image)
{
bool handled = true;
......@@ -3906,8 +3984,11 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
assert(dst->tiling != VK_IMAGE_TILING_LINEAR ||
!vk_format_is_depth_or_stencil(dst_format));
VkImageBlit region = *_region;
/* Can't sample from linear images */
if (src->tiling == VK_IMAGE_TILING_LINEAR && src->type != VK_IMAGE_TYPE_1D)
return false;
VkImageBlit region = *_region;
/* Rewrite combined D/S blits to compatible color blits */
if (vk_format_is_depth_or_stencil(dst_format)) {
assert(src_format == dst_format);
......@@ -3940,12 +4021,12 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
}
if (cmask == 0) {
cmask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
VK_COLOR_COMPONENT_A_BIT;
}
const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
VK_COLOR_COMPONENT_G_BIT |
VK_COLOR_COMPONENT_B_BIT |
VK_COLOR_COMPONENT_A_BIT;
if (cmask == 0)
cmask = full_cmask;
VkComponentMapping ident_swizzle = {
.r = VK_COMPONENT_SWIZZLE_IDENTITY,
......@@ -4072,7 +4153,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
&pipeline);
if (!ok)
return handled;
assert(pipeline && pipeline->pipeline && pipeline->pass);
assert(pipeline && pipeline->pipeline &&
pipeline->pass && pipeline->pass_no_load);
struct v3dv_device *device = cmd_buffer->device;
assert(cmd_buffer->meta.blit.dspool);
......@@ -4128,6 +4210,11 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
if (result != VK_SUCCESS)
goto fail;
struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
fb_info.height == dst_level_h &&
dst_is_padded_image;
v3dv_cmd_buffer_add_private_obj(
cmd_buffer, (uintptr_t)fb,
(v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
......@@ -4208,15 +4295,30 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
};
v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
/* If the region we are about to blit is tile-aligned, then we can
* use the render pass version that won't pre-load the tile buffer
* with the dst image contents before the blit. The exception is when we
* don't have a full color mask, since in that case we need to preserve
* the original value of some of the color components.
*/
const VkRect2D render_area = {
.offset = { dst_x, dst_y },
.extent = { dst_w, dst_h },
};
struct v3dv_render_pass *pipeline_pass =
v3dv_render_pass_from_handle(pipeline->pass);
bool can_skip_tlb_load =
cmask == full_cmask &&
v3dv_subpass_area_is_tile_aligned(&render_area, framebuffer,
pipeline_pass, 0);
/* Record blit */
VkRenderPassBeginInfo rp_info = {
.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
.renderPass = pipeline->pass,
.renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
pipeline->pass,
.framebuffer = fb,
.renderArea = {
.offset = { dst_x, dst_y },
.extent = { dst_w, dst_h }
},
.renderArea = render_area,
.clearValueCount = 0,
};
......@@ -4234,7 +4336,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
tex_coords[4] =
!mirror_z ?
(min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
(max_dst_layer - (i + 0.5f) * src_z_step) / (float)src_level_d ;
(max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
}
v3dv_CmdPushConstants(_cmd_buffer,
......@@ -4308,7 +4410,7 @@ v3dv_CmdBlitImage(VkCommandBuffer commandBuffer,
dst, dst->vk_format,
src, src->vk_format,
0, NULL,
&pRegions[i], filter)) {
&pRegions[i], filter, true)) {
continue;
}
unreachable("Unsupported blit operation");
......@@ -4469,7 +4571,7 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
dst, dst->vk_format,
src, src->vk_format,
0, NULL,
&blit_region, VK_FILTER_NEAREST);
&blit_region, VK_FILTER_NEAREST, true);
}
void
......
......@@ -255,10 +255,10 @@ v3dv_DestroyRenderPass(VkDevice _device,
vk_free2(&device->alloc, pAllocator, pass);
}
void
v3dv_subpass_get_granularity(struct v3dv_render_pass *pass,
uint32_t subpass_idx,
VkExtent2D *granularity)
static void
subpass_get_granularity(struct v3dv_render_pass *pass,
uint32_t subpass_idx,
VkExtent2D *granularity)
{
static const uint8_t tile_sizes[] = {
64, 64,
......@@ -321,8 +321,50 @@ v3dv_GetRenderAreaGranularity(VkDevice device,
for (uint32_t i = 0; i < pass->subpass_count; i++) {
VkExtent2D sg;
v3dv_subpass_get_granularity(pass, i, &sg);
subpass_get_granularity(pass, i, &sg);
pGranularity->width = MIN2(pGranularity->width, sg.width);
pGranularity->height = MIN2(pGranularity->height, sg.height);
}
}
/* Checks whether the render area rectangle covers a region that is aligned to
* tile boundaries. This means that we are writing to all pixels covered by
* all tiles in that area (except for pixels on edge tiles that are outside
* the framebuffer dimensions).
*
* When our framebuffer is aligned to tile boundaries we know we are writing
* valid data to all pixels in each tile and we can apply certain
* optimizations, like avoiding tile loads, since we know that none of the
* original pixel values in each tile for that area need to be preserved.
* We also use this to decide if we can use TLB clears, as these clear whole
* tiles so we can't use them if the render area is not aligned.
*
* Note that when an image is created it may include padding blocks
* depending on its tiling layout. When the framebuffer dimensions are not
* aligned to tile boundaries then edge tiles are only partially covered by the
* framebuffer pixels, but tile stores still seem to store full tiles,
* writing to the padded sections. This is important when the framebuffer
* is aliasing a smaller section of a larger image, as in that case the edge
* tiles of the framebuffer would overwrite valid pixels in the larger image.
* In that case, we can't flag the area as being aligned.
*/
bool
v3dv_subpass_area_is_tile_aligned(const VkRect2D *area,
struct v3dv_framebuffer *fb,
struct v3dv_render_pass *pass,
uint32_t subpass_idx)
{
assert(subpass_idx >= 0 && subpass_idx < pass->subpass_count);
VkExtent2D granularity;
subpass_get_granularity(pass, subpass_idx, &granularity);
return area->offset.x % granularity.width == 0 &&
area->offset.y % granularity.height == 0 &&
(area->extent.width % granularity.width == 0 ||
(fb->has_edge_padding &&
area->offset.x + area->extent.width >= fb->width)) &&
(area->extent.height % granularity.height == 0 ||
(fb->has_edge_padding &&
area->offset.y + area->extent.height >= fb->height));
}
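/* Worked example (mine, not from the patch): with 64x64 tile
 * granularity, a 1920x1080 render area at offset (0, 0) has
 * 1080 % 64 != 0, yet offset.y + extent.height >= fb->height, so the
 * area still counts as aligned provided the framebuffer has edge
 * padding. */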
......@@ -1154,10 +1154,10 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
key->alpha_test = false;
key->alpha_test_func = COMPARE_FUNC_NEVER;
/* FIXME: placeholder. Final value for swap_color_rb depends on the format
* of the surface to be used.
/* This is intended for V3D versions before 4.1, otherwise we just use the
* tile buffer load/store swap R/B bit.
*/
key->swap_color_rb = false;
key->swap_color_rb = 0;
const struct v3dv_render_pass *pass =
v3dv_render_pass_from_handle(pCreateInfo->renderPass);
......
......@@ -257,6 +257,7 @@ struct v3dv_meta_depth_clear_pipeline {
struct v3dv_meta_blit_pipeline {
VkPipeline pipeline;
VkRenderPass pass;
VkRenderPass pass_no_load;
uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
};
......@@ -555,15 +556,22 @@ struct v3dv_render_pass {
struct v3dv_subpass_attachment *subpass_attachments;
};
void v3dv_subpass_get_granularity(struct v3dv_render_pass *pass,
uint32_t subpass_idx,
VkExtent2D *granularity);
struct v3dv_framebuffer {
uint32_t width;
uint32_t height;
uint32_t layers;
/* Typically, edge tiles in the framebuffer have padding depending on the
* underlying tiling layout. One consequence of this is that when the
* framebuffer dimensions are not aligned to tile boundaries, tile stores
* would still write full tiles on the edges and write to the padded area.
* If the framebuffer is aliasing a smaller region of a larger image, we
* need to be careful with this, as we won't have padding on the
* edge tiles (which typically means that we need to load the tile buffer
* before we store).
*/
bool has_edge_padding;
uint32_t attachment_count;
uint32_t color_attachment_count;
struct v3dv_image_view *attachments[0];
......@@ -590,6 +598,10 @@ void v3dv_framebuffer_compute_internal_bpp_msaa(const struct v3dv_framebuffer *f
const struct v3dv_subpass *subpass,
uint8_t *max_bpp, bool *msaa);
bool v3dv_subpass_area_is_tile_aligned(const VkRect2D *area,
struct v3dv_framebuffer *fb,
struct v3dv_render_pass *pass,
uint32_t subpass_idx);
struct v3dv_cmd_pool {
VkAllocationCallbacks alloc;
struct list_head cmd_buffers;
......
......@@ -54,7 +54,9 @@ class lower_shared_reference_visitor :
public:
lower_shared_reference_visitor(struct gl_linked_shader *shader)
: list_ctx(ralloc_context(NULL)), shader(shader), shared_size(0u)
: buffer_access_type(shared_load_access),
list_ctx(ralloc_context(NULL)), shader(shader), shared_size(0u),
progress(false)
{
list_inithead(&var_offsets);
}
......
......@@ -3193,7 +3193,7 @@ typedef struct nir_shader_compiler_options {
/* Does the native fdot instruction replicate its result for four
* components? If so, then opt_algebraic_late will turn all fdotN
* instructions into fdot_replicatedN instructions.
* instructions into fdotN_replicated instructions.
*/
bool fdot_replicates;
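A minimal sketch of how a backend opts in (illustrative; the variable
name is mine):
static const nir_shader_compiler_options backend_options = {
   /* Ask the late algebraic pass to emit fdotN_replicated. */
   .fdot_replicates = true,
};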
......
......@@ -1111,7 +1111,9 @@ force_unroll_array_access(loop_info_state *state, nir_deref_instr *deref)
{
unsigned array_size = find_array_access_via_induction(state, deref, NULL);
if (array_size) {
if (array_size == state->loop->info->max_trip_count)
if ((array_size == state->loop->info->max_trip_count) &&
(deref->mode & (nir_var_shader_in | nir_var_shader_out |
nir_var_shader_temp | nir_var_function_temp)))
return true;
if (deref->mode & state->indirect_mask)
......
......@@ -127,8 +127,10 @@ nir_lower_clip_cull_distance_arrays(nir_shader *nir)
if (nir->info.stage <= MESA_SHADER_GEOMETRY)
progress |= combine_clip_cull(nir, nir_var_shader_out, true);
if (nir->info.stage > MESA_SHADER_VERTEX)
progress |= combine_clip_cull(nir, nir_var_shader_in, false);
if (nir->info.stage > MESA_SHADER_VERTEX) {
progress |= combine_clip_cull(nir, nir_var_shader_in,
nir->info.stage == MESA_SHADER_FRAGMENT);
}
nir_foreach_function(function, nir) {
if (!function->impl)
......
......@@ -115,6 +115,18 @@ lower_array(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var,
{
b->cursor = nir_before_instr(&intr->instr);
if (nir_deref_instr_is_known_out_of_bounds(nir_src_as_deref(intr->src[0]))) {
/* See Section 5.11 (Out-of-Bounds Accesses) of the GLSL 4.60 spec:
* out-of-bounds reads are replaced with zero and out-of-bounds writes
* are removed below. */
if (intr->intrinsic != nir_intrinsic_store_deref) {
nir_ssa_def *zero = nir_imm_zero(b, intr->dest.ssa.num_components,
intr->dest.ssa.bit_size);
nir_ssa_def_rewrite_uses(&intr->dest.ssa,
nir_src_for_ssa(zero));
}
nir_instr_remove(&intr->instr);
return;
}
nir_variable **elements =
get_array_elements(varyings, var, b->shader->info.stage);
......
......@@ -107,9 +107,9 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
static bool
has_replicated_dest(nir_alu_instr *alu)
{
return alu->op == nir_op_fdot_replicated2 ||
alu->op == nir_op_fdot_replicated3 ||
alu->op == nir_op_fdot_replicated4 ||
return alu->op == nir_op_fdot2_replicated ||
alu->op == nir_op_fdot3_replicated ||
alu->op == nir_op_fdot4_replicated ||
alu->op == nir_op_fdph_replicated;
}
......
......@@ -541,7 +541,7 @@ def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
False, "", const_expr)
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
reduce_expr, final_expr):
reduce_expr, final_expr, suffix=""):
def final(src):
return final_expr.format(src= "(" + src + ")")
def reduce_(src0, src1):
......@@ -554,10 +554,10 @@ def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
return srcs[start]
return reduce_(pairwise_reduce(start, size // 2), pairwise_reduce(start + size // 2, size // 2))
for size in [2, 4, 8, 16]:
opcode(name + str(size), output_size, output_type,
opcode(name + str(size) + suffix, output_size, output_type,
[size, size], [src_type, src_type], False, _2src_commutative,
final(pairwise_reduce(0, size)))
opcode(name + "3", output_size, output_type,
opcode(name + "3" + suffix, output_size, output_type,
[3, 3], [src_type, src_type], False, _2src_commutative,
final(reduce_(reduce_(srcs[0], srcs[1]), srcs[2])))
......@@ -825,8 +825,9 @@ binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
"{src}")
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
"{src0} * {src1}", "{src0} + {src1}", "{src}")
binop_reduce("fdot", 4, tfloat, tfloat,
"{src0} * {src1}", "{src0} + {src1}", "{src}",
suffix="_replicated")
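# With suffix="_replicated" this generates fdot2_replicated,
# fdot3_replicated and fdot4_replicated (plus the 8- and 16-wide
# variants), matching the opcode names used in the nir_algebraic rules.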
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
"src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
......
......@@ -2074,9 +2074,9 @@ late_optimizations = [
(('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
(('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
(('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
(('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
(('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
(('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
(('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
(('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
......@@ -2252,9 +2252,9 @@ distribute_src_mods = [
# Try to remove some spurious negations rather than pushing them down.
(('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
(('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
(('fdot_replicated2', ('fneg', a), ('fneg', b)), ('fdot_replicated2', a, b)),
(('fdot_replicated3', ('fneg', a), ('fneg', b)), ('fdot_replicated3', a, b)),
(('fdot_replicated4', ('fneg', a), ('fneg', b)), ('fdot_replicated4', a, b)),
(('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
(('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
((<