Commit f8f77126 authored by Rhys Perry, committed by Marge Bot
Browse files

aco: implement GS copy shaders



v5: rebase on float_controls changes
v7: rebase after shader args MR and load/store vectorizer MR
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <mesa/mesa!2421>
parent de4ce66f
......@@ -25,6 +25,7 @@
#include <algorithm>
#include <array>
#include <stack>
#include <map>
#include "ac_shader_util.h"
......@@ -8534,7 +8535,7 @@ static void create_vs_exports(isel_context *ctx)
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
if (ctx->options->key.vs_common_out.export_clip_dists) {
if (ctx->export_clip_dists) {
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
......@@ -8568,7 +8569,7 @@ static void emit_stream_output(isel_context *ctx,
Temp out[4];
bool all_undef = true;
assert(ctx->stage == vertex_vs);
assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs);
for (unsigned i = 0; i < num_comps; i++) {
out[i] = ctx->vsgs_output.outputs[loc][start + i];
all_undef = all_undef && !out[i].id();
......@@ -8804,13 +8805,24 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader)
ctx->block->fp_mode = program->next_fp_mode;
}
void cleanup_cfg(Program *program)
{
/* create linear_succs/logical_succs */
for (Block& BB : program->blocks) {
for (unsigned idx : BB.linear_preds)
program->blocks[idx].linear_succs.emplace_back(BB.index);
for (unsigned idx : BB.logical_preds)
program->blocks[idx].logical_succs.emplace_back(BB.index);
}
}
void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args)
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
......@@ -8879,12 +8891,162 @@ void select_program(Program *program,
bld.smem(aco_opcode::s_dcache_wb, false);
bld.sopp(aco_opcode::s_endpgm);
/* cleanup CFG */
for (Block& BB : program->blocks) {
for (unsigned idx : BB.linear_preds)
program->blocks[idx].linear_succs.emplace_back(BB.index);
for (unsigned idx : BB.logical_preds)
program->blocks[idx].logical_succs.emplace_back(BB.index);
cleanup_cfg(program);
}
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
ac_shader_config* config,
struct radv_shader_args *args)
{
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
program->next_fp_mode.must_flush_denorms32 = false;
program->next_fp_mode.must_flush_denorms16_64 = false;
program->next_fp_mode.care_about_round32 = false;
program->next_fp_mode.care_about_round16_64 = false;
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
program->next_fp_mode.denorm32 = 0;
program->next_fp_mode.round32 = fp_round_ne;
program->next_fp_mode.round16_64 = fp_round_ne;
ctx.block->fp_mode = program->next_fp_mode;
add_startpgm(&ctx);
append_logical_start(ctx.block);
Builder bld(ctx.program, ctx.block);
Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
Operand stream_id(0u);
if (args->shader_info->so.num_outputs)
stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
std::stack<Block> endif_blocks;
for (unsigned stream = 0; stream < 4; stream++) {
if (stream_id.isConstant() && stream != stream_id.constantValue())
continue;
unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
continue;
memset(ctx.vsgs_output.mask, 0, sizeof(ctx.vsgs_output.mask));
unsigned BB_if_idx = ctx.block->index;
Block BB_endif = Block();
if (!stream_id.isConstant()) {
/* begin IF */
Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
bld.branch(aco_opcode::p_cbranch_z, cond);
BB_endif.kind |= ctx.block->kind & block_kind_top_level;
ctx.block = ctx.program->create_and_insert_block();
add_edge(BB_if_idx, ctx.block);
bld.reset(ctx.block);
append_logical_start(ctx.block);
}
unsigned offset = 0;
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
if (args->shader_info->gs.output_streams[i] != stream)
continue;
unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
unsigned length = util_last_bit(output_usage_mask);
for (unsigned j = 0; j < length; ++j) {
if (!(output_usage_mask & (1 << j)))
continue;
unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
Temp voffset = vtx_offset;
if (const_offset >= 4096u) {
voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
const_offset %= 4096u;
}
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
mubuf->definitions[0] = bld.def(v1);
mubuf->operands[0] = Operand(voffset);
mubuf->operands[1] = Operand(gsvs_ring);
mubuf->operands[2] = Operand(0u);
mubuf->offen = true;
mubuf->offset = const_offset;
mubuf->glc = true;
mubuf->slc = true;
mubuf->dlc = args->options->chip_class >= GFX10;
mubuf->barrier = barrier_none;
mubuf->can_reorder = true;
ctx.vsgs_output.mask[i] |= 1 << j;
ctx.vsgs_output.outputs[i][j] = mubuf->definitions[0].getTemp();
bld.insert(std::move(mubuf));
offset++;
}
}
if (args->shader_info->so.num_outputs) {
emit_streamout(&ctx, stream);
bld.reset(ctx.block);
}
if (stream == 0) {
create_vs_exports(&ctx);
ctx.block->kind |= block_kind_export_end;
}
if (!stream_id.isConstant()) {
append_logical_end(ctx.block);
/* branch from then block to endif block */
bld.branch(aco_opcode::p_branch);
add_edge(ctx.block->index, &BB_endif);
ctx.block->kind |= block_kind_uniform;
/* emit else block */
ctx.block = ctx.program->create_and_insert_block();
add_edge(BB_if_idx, ctx.block);
bld.reset(ctx.block);
append_logical_start(ctx.block);
endif_blocks.push(std::move(BB_endif));
}
}
while (!endif_blocks.empty()) {
Block BB_endif = std::move(endif_blocks.top());
endif_blocks.pop();
Block *BB_else = ctx.block;
append_logical_end(BB_else);
/* branch from else block to endif block */
bld.branch(aco_opcode::p_branch);
add_edge(BB_else->index, &BB_endif);
BB_else->kind |= block_kind_uniform;
/** emit endif merge block */
ctx.block = program->insert_block(std::move(BB_endif));
bld.reset(ctx.block);
append_logical_start(ctx.block);
}
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
}
}
......@@ -85,6 +85,7 @@ struct isel_context {
uint64_t output_masks[MESA_SHADER_COMPUTE];
/* VS output information */
bool export_clip_dists;
unsigned num_clip_distances;
unsigned num_cull_distances;
......@@ -661,6 +662,54 @@ mem_vectorize_callback(unsigned align, unsigned bit_size,
return false;
}
void
setup_vs_output_info(isel_context *ctx, nir_shader *nir,
bool export_prim_id, bool export_clip_dists,
radv_vs_output_info *outinfo)
{
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(outinfo->vs_output_param_offset));
outinfo->param_exports = 0;
int pos_written = 0x1;
if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
pos_written |= 1 << 1;
uint64_t mask = ctx->output_masks[nir->info.stage];
while (mask) {
int idx = u_bit_scan64(&mask);
if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
}
}
if (outinfo->writes_layer &&
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
/* when ctx->options->key.has_multiview_view_index = true, the layer
* variable isn't declared in NIR and it's isel's job to get the layer */
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
}
if (export_prim_id) {
assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
}
ctx->export_clip_dists = export_clip_dists;
ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
pos_written |= 1 << 2;
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
pos_written |= 1 << 3;
outinfo->pos_exports = util_bitcount(pos_written);
}
void
setup_vs_variables(isel_context *ctx, nir_shader *nir)
{
......@@ -681,49 +730,8 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
if (ctx->stage == vertex_vs) {
radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(outinfo->vs_output_param_offset));
bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
outinfo->param_exports = 0;
int pos_written = 0x1;
if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
pos_written |= 1 << 1;
uint64_t mask = ctx->output_masks[nir->info.stage];
while (mask) {
int idx = u_bit_scan64(&mask);
if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
}
}
if (outinfo->writes_layer &&
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
/* when ctx->options->key.has_multiview_view_index = true, the layer
* variable isn't declared in NIR and it's isel's job to get the layer */
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
}
if (outinfo->export_prim_id) {
assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
}
ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
pos_written |= 1 << 2;
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
pos_written |= 1 << 3;
outinfo->pos_exports = util_bitcount(pos_written);
setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
ctx->options->key.vs_common_out.export_clip_dists, outinfo);
} else if (ctx->stage == vertex_geometry_gs || ctx->stage == vertex_es) {
/* TODO: radv_nir_shader_info_pass() already sets this but it's larger
* than it needs to be in order to set it better, we have to improve
......@@ -824,12 +832,80 @@ get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const
}
}
void
setup_nir(isel_context *ctx, nir_shader *nir)
{
Program *program = ctx->program;
/* align and copy constant data */
while (program->constant_data.size() % 4u)
program->constant_data.push_back(0);
ctx->constant_data_offset = program->constant_data.size();
program->constant_data.insert(program->constant_data.end(),
(uint8_t*)nir->constant_data,
(uint8_t*)nir->constant_data + nir->constant_data_size);
/* the variable setup has to be done before lower_io / CSE */
setup_variables(ctx, nir);
/* optimize and lower memory operations */
bool lower_to_scalar = false;
bool lower_pack = false;
if (nir_opt_load_store_vectorize(nir,
(nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
nir_var_mem_push_const | nir_var_mem_shared),
mem_vectorize_callback)) {
lower_to_scalar = true;
lower_pack = true;
}
if (nir->info.stage != MESA_SHADER_COMPUTE)
nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
if (lower_to_scalar)
nir_lower_alu_to_scalar(nir, NULL, NULL);
if (lower_pack)
nir_lower_pack(nir);
/* lower ALU operations */
// TODO: implement logic64 in aco, it's more effective for sgprs
nir_lower_int64(nir, nir->options->lower_int64_options);
nir_opt_idiv_const(nir, 32);
nir_lower_idiv(nir, nir_lower_idiv_precise);
/* optimize the lowered ALU operations */
bool more_algebraic = true;
while (more_algebraic) {
more_algebraic = false;
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
}
/* cleanup passes */
nir_lower_load_const_to_scalar(nir);
nir_opt_shrink_load(nir);
nir_move_options move_opts = (nir_move_options)(
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
nir_opt_sink(nir, move_opts);
nir_opt_move(nir, move_opts);
nir_convert_to_lcssa(nir, true, false);
nir_lower_phis_to_scalar(nir);
nir_function_impl *func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
nir_metadata_require(func, nir_metadata_block_index);
}
isel_context
setup_isel_context(Program* program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args)
struct radv_shader_args *args,
bool is_gs_copy_shader)
{
program->stage = 0;
for (unsigned i = 0; i < shader_count; i++) {
......@@ -844,7 +920,7 @@ setup_isel_context(Program* program,
program->stage |= sw_tes;
break;
case MESA_SHADER_GEOMETRY:
program->stage |= sw_gs;
program->stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
break;
case MESA_SHADER_FRAGMENT:
program->stage |= sw_fs;
......@@ -868,6 +944,8 @@ setup_isel_context(Program* program,
program->stage |= hw_fs;
else if (program->stage == sw_cs)
program->stage |= hw_cs;
else if (program->stage == sw_gs_copy)
program->stage |= hw_vs;
else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
program->stage |= hw_gs;
else
......@@ -918,94 +996,25 @@ setup_isel_context(Program* program,
get_io_masks(&ctx, shader_count, shaders);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
/* align and copy constant data */
while (program->constant_data.size() % 4u)
program->constant_data.push_back(0);
ctx.constant_data_offset = program->constant_data.size();
program->constant_data.insert(program->constant_data.end(),
(uint8_t*)nir->constant_data,
(uint8_t*)nir->constant_data + nir->constant_data_size);
/* the variable setup has to be done before lower_io / CSE */
setup_variables(&ctx, nir);
/* optimize and lower memory operations */
bool lower_to_scalar = false;
bool lower_pack = false;
if (nir_opt_load_store_vectorize(nir,
(nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
nir_var_mem_push_const | nir_var_mem_shared),
mem_vectorize_callback)) {
lower_to_scalar = true;
lower_pack = true;
}
if (nir->info.stage != MESA_SHADER_COMPUTE)
nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
if (lower_to_scalar)
nir_lower_alu_to_scalar(nir, NULL, NULL);
if (lower_pack)
nir_lower_pack(nir);
/* lower ALU operations */
// TODO: implement logic64 in aco, it's more effective for sgprs
nir_lower_int64(nir, nir->options->lower_int64_options);
nir_opt_idiv_const(nir, 32);
nir_lower_idiv(nir, nir_lower_idiv_precise);
/* optimize the lowered ALU operations */
bool more_algebraic = true;
while (more_algebraic) {
more_algebraic = false;
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
}
unsigned scratch_size = 0;
if (program->stage == gs_copy_vs) {
assert(shader_count == 1);
setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
} else {
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
setup_nir(&ctx, nir);
/* Do late algebraic optimization to turn add(a, neg(b)) back into
* subs, then the mandatory cleanup after algebraic. Note that it may
* produce fnegs, and if so then we need to keep running to squash
* fneg(fneg(a)).
*/
bool more_late_algebraic = true;
while (more_late_algebraic) {
more_late_algebraic = false;
NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_cse);
if (args->options->dump_preoptir) {
fprintf(stderr, "NIR shader before instruction selection:\n");
nir_print_shader(nir, stderr);
}
}
/* cleanup passes */
nir_lower_load_const_to_scalar(nir);
nir_opt_shrink_load(nir);
nir_move_options move_opts = (nir_move_options)(
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
nir_opt_sink(nir, move_opts);
nir_opt_move(nir, move_opts);
nir_convert_to_lcssa(nir, true, false);
nir_lower_phis_to_scalar(nir);
nir_function_impl *func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
nir_metadata_require(func, nir_metadata_block_index);
if (args->options->dump_preoptir) {
fprintf(stderr, "NIR shader before instruction selection:\n");
nir_print_shader(nir, stderr);
}
for (unsigned i = 0; i < shader_count; i++)
scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
}
unsigned scratch_size = 0;
for (unsigned i = 0; i < shader_count; i++)
scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);
ctx.block = ctx.program->create_and_insert_block();
......
......@@ -65,7 +65,10 @@ void aco_compile_shader(unsigned shader_count,
std::unique_ptr<aco::Program> program{new aco::Program};
/* Instruction Selection */
aco::select_program(program.get(), shader_count, shaders, &config, args);
if (args->is_gs_copy_shader)
aco::select_gs_copy_shader(program.get(), shaders[0], &config, args);
else
aco::select_program(program.get(), shader_count, shaders, &config, args);
if (args->options->dump_preoptir) {
std::cerr << "After Instruction Selection:\n";
aco_print_program(program.get(), stderr);
......@@ -162,7 +165,7 @@ void aco_compile_shader(unsigned shader_count,
legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY;
legacy_binary->base.stage = shaders[shader_count-1]->info.stage;
legacy_binary->base.is_gs_copy_shader = false;
legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
legacy_binary->base.total_size = size;
memcpy(legacy_binary->data, code.data(), code.size() * sizeof(uint32_t));
......
......@@ -1106,23 +1106,25 @@ static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;
/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9;
static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 11;
static constexpr Stage hw_fs = 1 << 12;
static constexpr Stage hw_cs = 1 << 13;
static constexpr Stage hw_mask = 0x7f << 7;
/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;