Skip to content
Snippets Groups Projects
Commit 2134ea38 authored by Matt Turner
Browse files

intel/compiler/fs: Implement ddy without using align16 for Gen11+


Align16 is no more. We previously generated an align16 ADD instruction
to calculate DDY:

   add(16) g25<1>F  -g23<4>.xyxyF   g23<4>.zwzwF   { align16 1H };

Without align16, we now implement it as:

   add(4) g25<1>F   -g23<0,2,1>F    g23.2<0,2,1>F  { align1 1N };
   add(4) g25.4<1>F -g23.4<0,2,1>F  g23.6<0,2,1>F  { align1 1N };
   add(4) g26<1>F   -g24<0,2,1>F    g24.2<0,2,1>F  { align1 1N };
   add(4) g26.4<1>F -g24.4<0,2,1>F  g24.6<0,2,1>F  { align1 1N };

where only the first two instructions are needed in SIMD8 mode.

Note: an earlier version of the patch implemented this in two
instructions in SIMD16:

   add(8) g25<2>F   -g23<4,2,0>F    g23.2<4,2,0>F  { align1 1N };
   add(8) g25.1<2>F -g23.1<4,2,0>F  g23.3<4,2,0>F  { align1 1N };

but I realized that the channel enable bits will not be correct. However,
if we knew we were under uniform control flow, we could emit only those
two instructions.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
parent 62cfd4c6
No related branches found
No related tags found
Loading
......@@ -1187,15 +1187,45 @@ fs_generator::generate_ddy(const fs_inst *inst,
{
if (inst->opcode == FS_OPCODE_DDY_FINE) {
/* produce accurate derivatives */
struct brw_reg src0 = stride(src, 4, 4, 1);
struct brw_reg src1 = stride(src, 4, 4, 1);
src0.swizzle = BRW_SWIZZLE_XYXY;
src1.swizzle = BRW_SWIZZLE_ZWZW;
if (devinfo->gen >= 11) {
src = stride(src, 0, 2, 1);
struct brw_reg src_0 = byte_offset(src, 0 * sizeof(float));
struct brw_reg src_2 = byte_offset(src, 2 * sizeof(float));
struct brw_reg src_4 = byte_offset(src, 4 * sizeof(float));
struct brw_reg src_6 = byte_offset(src, 6 * sizeof(float));
struct brw_reg src_8 = byte_offset(src, 8 * sizeof(float));
struct brw_reg src_10 = byte_offset(src, 10 * sizeof(float));
struct brw_reg src_12 = byte_offset(src, 12 * sizeof(float));
struct brw_reg src_14 = byte_offset(src, 14 * sizeof(float));
struct brw_reg dst_0 = byte_offset(dst, 0 * sizeof(float));
struct brw_reg dst_4 = byte_offset(dst, 4 * sizeof(float));
struct brw_reg dst_8 = byte_offset(dst, 8 * sizeof(float));
struct brw_reg dst_12 = byte_offset(dst, 12 * sizeof(float));
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_ADD(p, dst, negate(src0), src1);
brw_pop_insn_state(p);
brw_push_insn_state(p);
brw_set_default_exec_size(p, BRW_EXECUTE_4);
brw_ADD(p, dst_0, negate(src_0), src_2);
brw_ADD(p, dst_4, negate(src_4), src_6);
if (inst->exec_size == 16) {
brw_ADD(p, dst_8, negate(src_8), src_10);
brw_ADD(p, dst_12, negate(src_12), src_14);
}
brw_pop_insn_state(p);
} else {
struct brw_reg src0 = stride(src, 4, 4, 1);
struct brw_reg src1 = stride(src, 4, 4, 1);
src0.swizzle = BRW_SWIZZLE_XYXY;
src1.swizzle = BRW_SWIZZLE_ZWZW;
brw_push_insn_state(p);
brw_set_default_access_mode(p, BRW_ALIGN_16);
brw_ADD(p, dst, negate(src0), src1);
brw_pop_insn_state(p);
}
} else {
/* replicate the derivative at the top-left pixel to other pixels */
struct brw_reg src0 = stride(src, 4, 4, 0);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment