Commit 10c6ad38 authored by Zhenyu Wang's avatar Zhenyu Wang

lib: Add GPGPU fill

This adds a fill operation using the GPGPU pipeline, similar to the
current media fill. It can be used to verify the GPGPU pipeline simply
and to help enable it on newer hardware. Currently it works on Gen7
only; support for later platforms will be added.

For now this sets up a very simple thread group dispatch: one thread
per thread group with SIMD16 dispatch, so the fill shader just uses the
thread group ID for the buffer offset.

v2: No new fill func typedef but adapt to igt_fillfunc_t.
Signed-off-by: default avatarZhenyu Wang <zhenyuw@linux.intel.com>
parent 106f0bf9
......@@ -179,6 +179,7 @@
#define GEN7_PIPELINE_SELECT GFXPIPE(1, 1, 4)
# define PIPELINE_SELECT_3D (0 << 0)
# define PIPELINE_SELECT_MEDIA (1 << 0)
# define PIPELINE_SELECT_GPGPU (2 << 0)
#define GEN7_STATE_BASE_ADDRESS GFXPIPE(0, 1, 1)
# define BASE_ADDRESS_MODIFY (1 << 0)
......@@ -187,6 +188,7 @@
#define GEN7_MEDIA_CURBE_LOAD GFXPIPE(2, 0, 1)
#define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD GFXPIPE(2, 0, 2)
#define GEN7_MEDIA_OBJECT GFXPIPE(2, 1, 0)
#define GEN7_GPGPU_WALKER GFXPIPE(2, 1, 5)
struct gen7_interface_descriptor_data
{
......
......@@ -511,3 +511,22 @@ igt_fillfunc_t igt_get_media_fillfunc(int devid)
return fill;
}
/**
 * igt_get_gpgpu_fillfunc:
 * @devid: pci device id
 *
 * Returns:
 *
 * The platform-specific gpgpu fill function pointer for the device
 * specified with @devid. Will return NULL when no gpgpu fill function is
 * implemented for that platform.
 */
igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
{
	/* Only Gen7 has a GPGPU fill implementation so far. */
	if (IS_GEN7(devid))
		return gen7_gpgpu_fillfunc;

	return NULL;
}
......@@ -250,11 +250,11 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
* @color: fill color to use
*
* This is the type of the per-platform fill functions using media
* pipeline. The platform-specific implementation can be obtained
* by calling igt_get_media_fillfunc().
* or gpgpu pipeline. The platform-specific implementation can be obtained
* by calling igt_get_media_fillfunc() or igt_get_gpgpu_fillfunc().
*
* A fill function will emit a batchbuffer to the kernel which executes
* the specified blit fill operation using the media engine.
* the specified blit fill operation using the media/gpgpu engine.
*/
typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
struct igt_buf *dst,
......@@ -263,5 +263,6 @@ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
uint8_t color);
igt_fillfunc_t igt_get_media_fillfunc(int devid);
igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
#endif
......@@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
unsigned width, unsigned height,
uint8_t color);
void
gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
struct igt_buf *dst,
unsigned x, unsigned y,
unsigned width, unsigned height,
uint8_t color);
#endif /* RENDE_MEDIA_FILL_H */
......@@ -8,7 +8,6 @@
#include <assert.h>
static const uint32_t media_kernel[][4] = {
{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
......@@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
};
/* shaders/gpgpu/gpgpu_fill.gxa */
/*
 * Pre-assembled Gen7 EU binary for the GPGPU fill shader; each row is one
 * 128-bit instruction, produced from gpgpu_fill.gxa with intel-gen4asm
 * (see the generation commands in the shader source).  The shader computes
 * a buffer offset from the thread group ID, replicates the CURBE fill
 * color into the write payload and issues a media block write.
 *
 * NOTE(review): the opcodes are opaque here — regenerate from the .gxa
 * source instead of editing these words by hand.
 */
static const uint32_t gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
	{ 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
	{ 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
	{ 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
	{ 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
};
static uint32_t
batch_used(struct intel_batchbuffer *batch)
{
......@@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
}
static uint32_t
gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
const uint32_t kernel[][4], size_t size)
{
struct gen7_interface_descriptor_data *idd;
uint32_t offset;
uint32_t binding_table_offset, kernel_offset;
binding_table_offset = gen7_fill_binding_table(batch, dst);
kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
idd = batch_alloc(batch, sizeof(*idd), 64);
offset = batch_offset(batch, idd);
......@@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
media_kernel,
sizeof(media_kernel));
igt_assert(batch->ptr < &batch->buffer[4095]);
/* media pipeline */
......@@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
gen7_render_flush(batch, batch_end);
intel_batchbuffer_reset(batch);
}
/*
 * Emit MEDIA_VFE_STATE configured for GPGPU mode: a single hardware
 * thread, no URB entries, and one 256-bit CURBE entry (which carries the
 * fill color; see gen7_fill_curbe_buffer_data).  The dword layout is
 * fixed by the hardware command format — do not reorder the OUT_BATCH
 * calls.
 */
static void
gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
{
	/* 8-dword command; length field encodes total dwords minus 2 */
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));

	/* scratch buffer: none — the fill shader needs no spill space */
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(1 << 16 |	/* max num of threads */
		  0 << 8 |	/* num of URB entry */
		  1 << 2);	/* GPGPU mode */

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(0 << 16 |	/* URB entry size in 256 bits unit */
		  1);		/* CURBE entry size in 256 bits unit */

	/* scoreboard: disabled, all zero */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}
/*
 * Emit GPGPU_WALKER to dispatch the fill shader over a width x height
 * region, one SIMD16 thread per 16x1-pixel thread group.
 *
 * NOTE(review): the @x and @y offset parameters are accepted but never
 * used — the walk always starts at the buffer origin.  Confirm callers
 * pass x == y == 0, or extend this to program a real start offset.
 */
static void
gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned x, unsigned y,
		     unsigned width, unsigned height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Simply do SIMD16 based dispatch, so every thread uses
	 * SIMD16 channels.
	 *
	 * Define our own thread group size, e.g 16x1 for every group, then
	 * will have 1 thread each group in SIMD16 dispatch. So thread
	 * width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (aligned to 16)
	 * thread group Y = height;
	 */
	x_dim = (width + 15) / 16;
	y_dim = height;

	/*
	 * Right execution mask: enable only the channels covering the
	 * remainder columns in the last (partial) thread group; a full
	 * group enables all 16 channels.
	 */
	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;

	/* 11-dword command; length field encodes total dwords minus 2 */
	OUT_BATCH(GEN7_GPGPU_WALKER | 9);

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 |	/* SIMD16 */
		  0 << 16 |	/* depth:1 */
		  0 << 8 |	/* height:1 */
		  0);		/* width:1 */

	/* thread group X: start and dimension */
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y: start and dimension */
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z: start and dimension (2D walk, so depth 1) */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}
/*
 * Fill a width x height region of @dst with @color using the Gen7 GPGPU
 * pipeline.  This mirrors gen7_media_fillfunc but selects the GPGPU pipe,
 * programs the VFE in GPGPU mode and dispatches with GPGPU_WALKER instead
 * of MEDIA_OBJECT.  The batch is flushed and reset, so the fill has
 * completed by the time this returns.
 *
 * NOTE(review): @x/@y are forwarded to gen7_emit_gpgpu_walk, which
 * ignores them — the fill effectively starts at the buffer origin.
 */
void
gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* Build indirect state after BATCH_STATE_SPLIT... */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant (CURBE) buffer must be set up for every thread, but
	 * as we have only one thread per thread group, a single CURBE
	 * entry holding the fill color suffices.
	 *
	 * Each thread derives its buffer offset from its thread group ID.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
							      gpgpu_kernel,
							      sizeof(gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	/* ...then emit the commands from the start of the batch. */
	batch->ptr = batch->buffer;

	/* GPGPU pipeline: select pipe, base addresses, VFE, CURBE, IDs, walk */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen7_emit_state_base_address(batch);
	gen7_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
	gen7_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	/* Commands must not spill into the indirect state area. */
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}
Commands used to generate the shader on gen7
$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
/*
 * GPGPU fill shader source (m4 input for intel-gen4asm).
 *
 * Register usage:
 *  g0      -- thread payload header (thread group IDs)
 *  g1      -- constant buffer (CURBE: fill color in byte 0)
 *  g2      -- calculated X/Y write offset
 *  g4-g12  -- payload for the write message
 */
define(`ORIG', `g2.0<2,2,1>UD')
define(`ORIG_X', `g2.0<1>UD')
define(`ORIG_Y', `g2.4<1>UD')
define(`COLOR', `g1.0')
define(`COLORUB', `COLOR<0,1,0>UB')
define(`COLORUD', `COLOR<0,1,0>UD')
define(`X', `g0.4<0,1,0>UD')
define(`Y', `g0.24<0,1,0>UD')

/* replicate the fill color byte across the low dword of COLOR */
mov(4) COLOR<1>UB COLORUB {align1};

/* WRITE */
/* compute X/Y offset from the thread group ID (X * 16 bytes, Y rows) */
mul(1) ORIG_X X 0x10UD {align1};
mov(1) ORIG_Y Y {align1};
mov(8) g4.0<1>UD g0.0<8,8,1>UD {align1};
mov(2) g4.0<1>UD ORIG {align1};
/* Normal mode: for block height 1 row and block width 16 bytes */
mov(1) g4.8<1>UD 0x0000000fUD {align1};
/* fill the 64-byte write payload (g5..g12) with the color dword */
mov(16) g5.0<1>UD COLORUD {align1 compr};
mov(16) g7.0<1>UD COLORUD {align1 compr};
mov(16) g9.0<1>UD COLORUD {align1 compr};
mov(16) g11.0<1>UD COLORUD {align1 compr};
/*
 * comment out the following instruction on Gen7
 * write(0, 0, 10, 12)
 * 10: media_block_write
 * 12: data cache data port 1
 */
send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
/*
 * uncomment the following instruction on Gen7
 * write(0, 0, 10, 0)
 * 10: media_block_write
 * 0: render cache data port
 */
/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */

/* EOT: copy the header and notify the thread spawner we are done */
mov(8) g112.0<1>UD g0.0<8,8,1>UD {align1};
send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment