From f81eb2a8276ea7ab7143e7e5de8457d4894b2901 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Sat, 10 Jul 2021 12:17:37 +0200 Subject: [PATCH 01/18] aco/spill: Avoid unneeded copies when iterating over maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 205a687b7e4c..dc1d3bc34f77 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -217,7 +217,7 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work } /* all remaining live vars must be live-out at the predecessors */ - for (std::pair> pair : next_uses) { + for (std::pair>& pair : next_uses) { Temp temp = pair.first; uint32_t distance = pair.second.second; uint32_t dom = pair.second.first; @@ -360,7 +360,7 @@ local_next_uses(spill_ctx& ctx, Block* block) std::vector> local_next_uses(block->instructions.size()); std::map next_uses; - for (std::pair> pair : + for (std::pair>& pair : ctx.next_use_distances_end[block->index]) next_uses[pair.first] = pair.second.second + block->instructions.size(); @@ -489,7 +489,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) unsigned distance = 0; Temp to_spill; - for (std::pair> pair : next_use_distances) { + for (std::pair>& pair : next_use_distances) { if (pair.first.type() == type && (pair.second.first >= loop_end || (ctx.remat.count(pair.first) && type == RegType::sgpr)) && @@ -533,7 +533,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) Temp to_spill; type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr; - for (std::pair> pair : next_use_distances) { + for (std::pair>& pair : next_use_distances) { if (pair.first.type() == type && pair.second.second > distance && ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { to_spill = pair.first; @@ -604,7 +604,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) std::set partial_spills; /* keep variables spilled on all incoming paths */ - for (std::pair> pair : next_use_distances) { + for (std::pair>& pair : next_use_distances) { std::vector& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload @@ -722,7 +722,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) unsigned insert_idx = 0; RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0); - for (std::pair> live : + for (std::pair>& live : ctx.next_use_distances_start[block_idx]) { const unsigned pred_idx = block->linear_preds[0]; @@ -758,7 +758,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } while (instructions.back()->opcode != aco_opcode::p_logical_start); unsigned pred_idx = block->logical_preds[0]; - for (std::pair> live : + for (std::pair>& live : ctx.next_use_distances_start[block_idx]) { if (live.first.is_linear()) continue; @@ -993,7 +993,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* iterate live variables for which to reload */ // TODO: reload at current block if variable is spilled on all predecessors - for (std::pair> pair : + for (std::pair>& pair : ctx.next_use_distances_start[block_idx]) { /* skip spilled variables */ if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end()) @@ -1189,7 +1189,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, /* add interferences with currently spilled variables */ for (std::pair pair : current_spills) ctx.add_interference(spill_id, pair.second); - for (std::pair> pair : reloads) + for (std::pair>& pair : reloads) ctx.add_interference(spill_id, pair.second.second); current_spills[to_spill] = spill_id; @@ -1210,7 +1210,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, } /* add reloads and instruction to new instructions */ - for (std::pair> pair : reloads) { + for (std::pair>& pair : reloads) { aco_ptr reload = do_reload(ctx, pair.second.first, pair.first, pair.second.second); instructions.emplace_back(std::move(reload)); -- GitLab From 0812d440c7f6adbfe215f0373e241a7b65469415 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Sat, 10 Jul 2021 12:20:56 +0200 Subject: [PATCH 02/18] aco: Use std::vector for the underlying container of std::stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By default, std::stack uses std::deque to allocate its elements, which has poor cache efficiency. std::vector makes appending elements more expensive (due to potential reallocations), but in the changed contexts the element count should always be low anyway. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_NOPs.cpp | 2 +- src/amd/compiler/aco_insert_waitcnt.cpp | 2 +- src/amd/compiler/aco_instruction_selection.cpp | 2 +- src/amd/compiler/aco_spill.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 361411d05e5d..5dd1c7183e83 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -847,7 +847,7 @@ void mitigate_hazards(Program* program) { std::vector all_ctx(program->blocks.size()); - std::stack loop_header_indices; + std::stack> loop_header_indices; for (unsigned i = 0; i < program->blocks.size(); i++) { Block& block = program->blocks[i]; diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index d7fc87c126d6..cb6c2a60804c 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -767,7 +767,7 @@ insert_wait_states(Program* program) std::vector in_ctx(program->blocks.size(), wait_ctx(program)); std::vector out_ctx(program->blocks.size(), wait_ctx(program)); - std::stack loop_header_indices; + std::stack> loop_header_indices; unsigned loop_progress = 0; for (unsigned i = 0; i < program->blocks.size();) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index bd81c50083c9..d4fd9794fdc9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11811,7 +11811,7 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_ Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), get_arg(&ctx, ctx.args->ac.vertex_id)); - std::stack if_contexts; + std::stack> if_contexts; for (unsigned stream = 0; stream < 4; stream++) { if (stream_id.isConstant() && stream != stream_id.constantValue()) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index dc1d3bc34f77..7dfbc03006b9 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -56,7 +56,7 @@ struct spill_ctx { std::vector> spills_entry; std::vector> spills_exit; std::vector processed; - std::stack loop_header; + std::stack> loop_header; std::vector>> next_use_distances_start; std::vector>> next_use_distances_end; std::vector>> interferences; -- GitLab From b183bdeabf0234027611cbd2563de479a2be99f6 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Sat, 10 Jul 2021 12:25:45 +0200 Subject: [PATCH 03/18] aco/spill: Remove unused container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 7dfbc03006b9..499512150f33 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1128,7 +1128,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, aco_ptr& instr = block->instructions[idx]; std::map> reloads; - std::map spills; + /* rename and reload operands */ for (Operand& op : instr->operands) { if (!op.isTemp()) -- GitLab From 276da301e6803023560164679c48fc0e07f2b8d2 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Thu, 15 Jul 2021 16:13:58 +0200 Subject: [PATCH 04/18] aco/spill: Replace map[] with map::insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 499512150f33..646b492fbc94 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -362,7 +362,7 @@ local_next_uses(spill_ctx& ctx, Block* block) std::map next_uses; for (std::pair>& pair : ctx.next_use_distances_end[block->index]) - next_uses[pair.first] = pair.second.second + block->instructions.size(); + next_uses.insert({pair.first, pair.second.second + block->instructions.size()}); for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { aco_ptr& instr = block->instructions[idx]; -- GitLab From 387b315ea047779db4a0657c1280bc3bd5531eaa Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Sat, 10 Jul 2021 13:59:42 +0200 Subject: [PATCH 05/18] aco/spill: Avoid copying next_use maps more often than needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 646b492fbc94..fcb0eafbb2ba 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -359,10 +359,11 @@ local_next_uses(spill_ctx& ctx, Block* block) { std::vector> local_next_uses(block->instructions.size()); - std::map next_uses; for (std::pair>& pair : - ctx.next_use_distances_end[block->index]) - next_uses.insert({pair.first, pair.second.second + block->instructions.size()}); + ctx.next_use_distances_end[block->index]) { + local_next_uses[block->instructions.size() - 1].insert( + {pair.first, pair.second.second + block->instructions.size()}); + } for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { aco_ptr& instr = block->instructions[idx]; @@ -371,19 +372,24 @@ local_next_uses(spill_ctx& ctx, Block* block) if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) break; + if (idx != (int)block->instructions.size() - 1) { + local_next_uses[idx] = local_next_uses[idx + 1]; + } + for (const Operand& op : instr->operands) { if (op.isFixed() && op.physReg() == exec) continue; if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) continue; - if (op.isTemp()) - next_uses[op.getTemp()] = idx; + if (op.isTemp()) { + local_next_uses[idx][op.getTemp()] = idx; + } } for (const Definition& def : instr->definitions) { - if (def.isTemp()) - next_uses.erase(def.getTemp()); + if (def.isTemp()) { + local_next_uses[idx].erase(def.getTemp()); + } } - local_next_uses[idx] = next_uses; } return local_next_uses; } -- GitLab From df6c395095e02ebf91a93b3174e4ce0137ad0b3e Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Mon, 19 Jul 2021 19:32:38 +0200 Subject: [PATCH 06/18] aco/spill: Persist memory allocations of local next use maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function local_next_uses allocated one next-use map per instruction in the current block. Since the function is called once per block, this caused an excessive amount of memory allocations to take place. With this change, the memory for next-use maps is allocated only once and then reused for later blocks. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index fcb0eafbb2ba..1965b53f4f70 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -55,10 +55,12 @@ struct spill_ctx { std::vector> renames; std::vector> spills_entry; std::vector> spills_exit; + std::vector processed; std::stack> loop_header; std::vector>> next_use_distances_start; std::vector>> next_use_distances_end; + std::vector> local_next_use_distance; /* Working buffer */ std::vector>> interferences; std::vector> affinities; std::vector is_reloaded; @@ -354,11 +356,13 @@ get_rematerialize_info(spill_ctx& ctx) } } -std::vector> -local_next_uses(spill_ctx& ctx, Block* block) +void +update_local_next_uses(spill_ctx& ctx, Block* block, + std::vector>& local_next_uses) { - std::vector> local_next_uses(block->instructions.size()); + local_next_uses.resize(block->instructions.size()); + local_next_uses[block->instructions.size() - 1].clear(); for (std::pair>& pair : ctx.next_use_distances_end[block->index]) { local_next_uses[block->instructions.size() - 1].insert( @@ -391,7 +395,6 @@ local_next_uses(spill_ctx& ctx, Block* block) } } } - return local_next_uses; } RegisterDemand @@ -1117,7 +1120,6 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, { assert(!ctx.processed[block_idx]); - std::vector> local_next_use_distance; std::vector> instructions; unsigned idx = 0; @@ -1127,8 +1129,11 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, instructions.emplace_back(std::move(block->instructions[idx++])); } - if (block->register_demand.exceeds(ctx.target_pressure)) - local_next_use_distance = local_next_uses(ctx, block); + if (block->register_demand.exceeds(ctx.target_pressure)) { + update_local_next_uses(ctx, block, ctx.local_next_use_distance); + } else { + /* We won't use local_next_use_distance, so no initialization needed */ + } while (idx < block->instructions.size()) { aco_ptr& instr = block->instructions[idx]; @@ -1163,7 +1168,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand new_demand = ctx.register_demand[block_idx][idx]; new_demand.update(get_demand_before(ctx, block_idx, idx)); - assert(!local_next_use_distance.empty()); + assert(!ctx.local_next_use_distance.empty()); /* if reg pressure is too high, spill variable with furthest next use */ while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) { @@ -1174,7 +1179,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) type = RegType::vgpr; - for (std::pair pair : local_next_use_distance[idx]) { + for (std::pair pair : ctx.local_next_use_distance[idx]) { if (pair.first.type() != type) continue; bool can_rematerialize = ctx.remat.count(pair.first); -- GitLab From 92d7a6ab1c2f812f1b00ecad313ea1edc4fe7e53 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Mon, 19 Jul 2021 19:43:54 +0200 Subject: [PATCH 07/18] aco/spill: Avoid destroying local next use maps over-eagerly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recreating these maps in a later block requires allocating fresh memory. Instead, by never shrinking the containing vector in the first place, previously allocated map memory is now re-used. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 1965b53f4f70..c9b758eb0bdc 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -360,7 +360,11 @@ void update_local_next_uses(spill_ctx& ctx, Block* block, std::vector>& local_next_uses) { - local_next_uses.resize(block->instructions.size()); + if (local_next_uses.size() < block->instructions.size()) { + /* Allocate more next-use-maps. Note that by never reducing the vector size, we enable + * future calls to this function to re-use already allocated map memory. */ + local_next_uses.resize(block->instructions.size()); + } local_next_uses[block->instructions.size() - 1].clear(); for (std::pair>& pair : -- GitLab From 4453bce77095d3bf4659cdaabda86b7da8bdc9f6 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Tue, 20 Jul 2021 11:48:15 +0200 Subject: [PATCH 08/18] aco/spill: Replace vector with vector for local_next_use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While adding/removing elements is faster with std::map, the cost of container copies (and the involved memory allocations) vastly outweigh that benefit in this usage pattern. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index c9b758eb0bdc..9367db9e160c 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -28,6 +28,7 @@ #include "common/sid.h" +#include #include #include #include @@ -60,7 +61,7 @@ struct spill_ctx { std::stack> loop_header; std::vector>> next_use_distances_start; std::vector>> next_use_distances_end; - std::vector> local_next_use_distance; /* Working buffer */ + std::vector>> local_next_use_distance; /* Working buffer */ std::vector>> interferences; std::vector> affinities; std::vector is_reloaded; @@ -358,7 +359,7 @@ get_rematerialize_info(spill_ctx& ctx) void update_local_next_uses(spill_ctx& ctx, Block* block, - std::vector>& local_next_uses) + std::vector>>& local_next_uses) { if (local_next_uses.size() < block->instructions.size()) { /* Allocate more next-use-maps. Note that by never reducing the vector size, we enable @@ -369,8 +370,8 @@ update_local_next_uses(spill_ctx& ctx, Block* block, local_next_uses[block->instructions.size() - 1].clear(); for (std::pair>& pair : ctx.next_use_distances_end[block->index]) { - local_next_uses[block->instructions.size() - 1].insert( - {pair.first, pair.second.second + block->instructions.size()}); + local_next_uses[block->instructions.size() - 1].push_back(std::make_pair( + (Temp)pair.first, pair.second.second + block->instructions.size())); } for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { @@ -390,12 +391,22 @@ update_local_next_uses(spill_ctx& ctx, Block* block, if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) continue; if (op.isTemp()) { - local_next_uses[idx][op.getTemp()] = idx; + auto it = std::find_if(local_next_uses[idx].begin(), local_next_uses[idx].end(), + [op](auto& pair) { return pair.first == op.getTemp(); }); + if (it == local_next_uses[idx].end()) { + local_next_uses[idx].push_back(std::make_pair(op.getTemp(), idx)); + } else { + it->second = idx; + } } } for (const Definition& def : instr->definitions) { if (def.isTemp()) { - local_next_uses[idx].erase(def.getTemp()); + auto it = std::find_if(local_next_uses[idx].begin(), local_next_uses[idx].end(), + [def](auto& pair) { return pair.first == def.getTemp(); }); + if (it != local_next_uses[idx].end()) { + local_next_uses[idx].erase(it); + } } } } -- GitLab From 6650799ea5c45272a59a0b764e20b8d7a6ac44fc Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Tue, 13 Jul 2021 12:31:12 +0200 Subject: [PATCH 09/18] aco/spill: Prefer unordered_map over map for next use distances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes the iteration order of next use distances, so some "random" changes to shader metrics are expected. fossil-db for Navi14: Totals from 1239 (0.82% of 150305) affected shaders: SpillSGPRs: 10559 -> 10562 (+0.03%); split: -0.05%, +0.08% SpillVGPRs: 1632 -> 1863 (+14.15%) CodeSize: 19321468 -> 19304164 (-0.09%); split: -0.09%, +0.01% Instrs: 3593957 -> 3591647 (-0.06%); split: -0.07%, +0.01% Latency: 103120695 -> 102475647 (-0.63%); split: -0.63%, +0.01% InvThroughput: 23897614 -> 23575320 (-1.35%); split: -1.36%, +0.02% VClause: 66406 -> 66943 (+0.81%); split: -0.01%, +0.81% SClause: 118559 -> 118548 (-0.01%) Copies: 310871 -> 308950 (-0.62%); split: -0.69%, +0.08% Branches: 123386 -> 123413 (+0.02%); split: -0.00%, +0.03% These numbers mostly come from parallel-rdp ubershaders. Small changes are also found in the rdr2 and rage2 shader metrics, whereas others are not significantly affected. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 9367db9e160c..35a532dea728 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -29,12 +29,25 @@ #include "common/sid.h" #include +#include #include #include #include +#include #include #include +namespace std { +template <> struct hash { + size_t operator()(aco::Temp temp) const noexcept + { + uint32_t v; + std::memcpy(&v, &temp, sizeof(temp)); + return std::hash{}(v); + } +}; +} // namespace std + /* * Implements the spilling algorithm on SSA-form from * "Register Spilling and Live-Range Splitting for SSA-Form Programs" @@ -59,8 +72,8 @@ struct spill_ctx { std::vector processed; std::stack> loop_header; - std::vector>> next_use_distances_start; - std::vector>> next_use_distances_end; + std::vector>> next_use_distances_start; + std::vector>> next_use_distances_end; std::vector>> local_next_use_distance; /* Working buffer */ std::vector>> interferences; std::vector> affinities; @@ -158,11 +171,12 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) { Block* block = &ctx.program->blocks[block_idx]; - std::map> next_uses = ctx.next_use_distances_end[block_idx]; + std::unordered_map> next_uses = + ctx.next_use_distances_end[block_idx]; /* to compute the next use distance at the beginning of the block, we have to add the block's * size */ - for (std::map>::iterator it = next_uses.begin(); + for (std::unordered_map>::iterator it = next_uses.begin(); it != next_uses.end(); ++it) it->second.second = it->second.second + block->instructions.size(); -- GitLab From 05163fd4f4b0144a01ac0e55e75d053ad48f2514 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Sat, 10 Jul 2021 14:06:50 +0200 Subject: [PATCH 10/18] aco/spill: Avoid copying current_spills when not needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 35a532dea728..cc4b9a54fd35 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1144,8 +1144,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } void -process_block(spill_ctx& ctx, unsigned block_idx, Block* block, - std::map& current_spills, RegisterDemand spilled_registers) +process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand spilled_registers) { assert(!ctx.processed[block_idx]); @@ -1164,6 +1163,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, /* We won't use local_next_use_distance, so no initialization needed */ } + auto& current_spills = ctx.spills_exit[block_idx]; + while (idx < block->instructions.size()) { aco_ptr& instr = block->instructions[idx]; @@ -1214,9 +1215,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, bool can_rematerialize = ctx.remat.count(pair.first); if (((pair.second > distance && can_rematerialize == do_rematerialize) || (can_rematerialize && !do_rematerialize && pair.second > idx)) && - current_spills.find(pair.first) == current_spills.end() && - ctx.spills_exit[block_idx].find(pair.first) == - ctx.spills_exit[block_idx].end()) { + current_spills.find(pair.first) == current_spills.end()) { to_spill = pair.first; distance = pair.second; do_rematerialize = can_rematerialize; @@ -1260,7 +1259,6 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, } block->instructions = std::move(instructions); - ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); } void @@ -1284,7 +1282,7 @@ spill_block(spill_ctx& ctx, unsigned block_idx) add_coupling_code(ctx, block, block_idx); } - std::map current_spills = ctx.spills_entry[block_idx]; + const std::map& current_spills = ctx.spills_entry[block_idx]; /* check conditions to process this block */ bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || @@ -1295,10 +1293,11 @@ spill_block(spill_ctx& ctx, unsigned block_idx) process = true; } - if (process) - process_block(ctx, block_idx, block, current_spills, spilled_registers); - else - ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); + assert(ctx.spills_exit[block_idx].empty()); + ctx.spills_exit[block_idx] = current_spills; + if (process) { + process_block(ctx, block_idx, block, spilled_registers); + } ctx.processed[block_idx] = true; -- GitLab From 112babc6978cb53235fc828d009a977d9778d7b4 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Tue, 13 Jul 2021 12:59:58 +0200 Subject: [PATCH 11/18] aco/spill: Reduce redundant std::map lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous code checked for element containment first and then performed another map lookup upon element access. Instead, map::find can be used to retrieve an iterator usable for element access with no extra lookup needed. Furthermore, pure containment checks have been simplified using map::count. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 142 ++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 66 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index cc4b9a54fd35..5514d0e29500 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -223,11 +223,13 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work unsigned pred_idx = instr->opcode == aco_opcode::p_phi ? block->logical_preds[i] : block->linear_preds[i]; if (instr->operands[i].isTemp()) { - if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == - ctx.next_use_distances_end[pred_idx].end() || - ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance) + auto insert_result = ctx.next_use_distances_end[pred_idx].insert( + std::make_pair(instr->operands[i].getTemp(), distance)); + const bool inserted = insert_result.second; + std::pair& entry_distance = insert_result.first->second; + if (inserted || entry_distance != distance) worklist.insert(pred_idx); - ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance; + entry_distance = distance; } } idx--; @@ -242,16 +244,19 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work for (unsigned pred_idx : preds) { if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth) distance += 0xFFFF; - if (ctx.next_use_distances_end[pred_idx].find(temp) != - ctx.next_use_distances_end[pred_idx].end()) { - dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, - temp.is_linear()); - distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance); + auto insert_result = ctx.next_use_distances_end[pred_idx].insert( + std::make_pair(temp, std::pair{})); + const bool inserted = insert_result.second; + std::pair& entry_distance = insert_result.first->second; + + if (!inserted) { + dom = get_dominator(dom, entry_distance.first, ctx.program, temp.is_linear()); + distance = std::min(entry_distance.second, distance); } - if (ctx.next_use_distances_end[pred_idx][temp] != - std::pair{dom, distance}) + if (entry_distance != std::pair{dom, distance}) { worklist.insert(pred_idx); - ctx.next_use_distances_end[pred_idx][temp] = {dom, distance}; + entry_distance = {dom, distance}; + } } } } @@ -531,8 +536,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) if (pair.first.type() == type && (pair.second.first >= loop_end || (ctx.remat.count(pair.first) && type == RegType::sgpr)) && - pair.second.second > distance && - ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + pair.second.second > distance && !ctx.spills_entry[block_idx].count(pair.first)) { to_spill = pair.first; distance = pair.second.second; } @@ -547,8 +551,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) } uint32_t spill_id; - if (ctx.spills_exit[block_idx - 1].find(to_spill) == - ctx.spills_exit[block_idx - 1].end()) { + if (!ctx.spills_exit[block_idx - 1].count(to_spill)) { spill_id = ctx.allocate_spill_id(to_spill.regClass()); } else { spill_id = ctx.spills_exit[block_idx - 1][to_spill]; @@ -573,7 +576,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) for (std::pair>& pair : next_use_distances) { if (pair.first.type() == type && pair.second.second > distance && - ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + !ctx.spills_entry[block_idx].count(pair.first)) { to_spill = pair.first; distance = pair.second.second; } @@ -592,9 +595,12 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) /* keep variables spilled if they are alive and not used in the current block */ unsigned pred_idx = block->linear_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::sgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && - next_use_distances[pair.first].first != block_idx) { + if (pair.first.type() != RegType::sgpr) { + continue; + } + auto next_use_distance_it = next_use_distances.find(pair.first); + if (next_use_distance_it != next_use_distances.end() && + next_use_distance_it->second.first != block_idx) { ctx.spills_entry[block_idx].insert(pair); spilled_registers.sgpr += pair.first.size(); } @@ -602,9 +608,12 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) if (block->logical_preds.size() == 1) { pred_idx = block->logical_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::vgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && - next_use_distances[pair.first].first != block_idx) { + if (pair.first.type() != RegType::vgpr) { + continue; + } + auto next_use_distance_it = next_use_distances.find(pair.first); + if (next_use_distance_it != next_use_distances.end() && + next_use_distance_it->second.first != block_idx) { ctx.spills_entry[block_idx].insert(pair); spilled_registers.vgpr += pair.first.size(); } @@ -616,8 +625,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { pred_idx = block->linear_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::sgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && + if (pair.first.type() == RegType::sgpr && next_use_distances.count(pair.first) && ctx.spills_entry[block_idx].insert(pair).second) { spilled_registers.sgpr += pair.first.size(); } @@ -627,8 +635,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) block->logical_preds.size() == 1) { pred_idx = block->logical_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::vgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && + if (pair.first.type() == RegType::vgpr && next_use_distances.count(pair.first) && ctx.spills_entry[block_idx].insert(pair).second) { spilled_registers.vgpr += pair.first.size(); } @@ -656,12 +663,11 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) uint32_t spill_id = 0; for (unsigned pred_idx : preds) { /* variable is not even live at the predecessor: probably from a phi */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == - ctx.next_use_distances_end[pred_idx].end()) { + if (!ctx.next_use_distances_end[pred_idx].count(pair.first)) { spill = false; break; } - if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) { + if (!ctx.spills_exit[pred_idx].count(pair.first)) { if (!remat) spill = false; } else { @@ -698,8 +704,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) continue; } - if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == - ctx.spills_exit[preds[i]].end()) + if (!ctx.spills_exit[preds[i]].count(phi->operands[i].getTemp())) spill = false; else partial_spills.insert(phi->definitions[0].getTemp()); @@ -724,7 +729,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr; while (it != partial_spills.end()) { - assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end()); + assert(!ctx.spills_entry[block_idx].count(*it)); if (it->type() == type && next_use_distances[*it].second > distance) { distance = next_use_distances[*it].second; @@ -767,11 +772,12 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) if (!live.first.is_linear()) continue; /* still spilled */ - if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + if (ctx.spills_entry[block_idx].count(live.first)) continue; /* in register at end of predecessor */ - if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + auto spills_exit_it = ctx.spills_exit[pred_idx].find(live.first); + if (spills_exit_it == ctx.spills_exit[pred_idx].end()) { std::map::iterator it = ctx.renames[pred_idx].find(live.first); if (it != ctx.renames[pred_idx].end()) ctx.renames[block_idx].insert(*it); @@ -780,8 +786,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* variable is spilled at predecessor and live at current block: create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); - aco_ptr reload = - do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + aco_ptr reload = do_reload(ctx, live.first, new_name, spills_exit_it->second); instructions.emplace_back(std::move(reload)); reg_demand.push_back(demand_before); ctx.renames[block_idx][live.first] = new_name; @@ -801,11 +806,12 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) if (live.first.is_linear()) continue; /* still spilled */ - if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + if (ctx.spills_entry[block_idx].count(live.first)) continue; /* in register at end of predecessor */ - if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + auto spills_exit_it = ctx.spills_exit[pred_idx].find(live.first); + if (spills_exit_it == ctx.spills_exit[pred_idx].end()) { std::map::iterator it = ctx.renames[pred_idx].find(live.first); if (it != ctx.renames[pred_idx].end()) ctx.renames[block_idx].insert(*it); @@ -816,7 +822,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) * create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); aco_ptr reload = - do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + do_reload(ctx, live.first, new_name, spills_exit_it->second); instructions.emplace_back(std::move(reload)); reg_demand.emplace_back(reg_demand.back()); ctx.renames[block_idx][live.first] = new_name; @@ -850,8 +856,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* if the phi is not spilled, add to instructions */ if (!phi->definitions[0].isTemp() || - ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == - ctx.spills_entry[block_idx].end()) { + !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp())) { instructions.emplace_back(std::move(phi)); continue; } @@ -935,8 +940,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == - ctx.next_use_distances_end[pred_idx].end()) + if (!ctx.next_use_distances_end[pred_idx].count(pair.first)) continue; /* add interferences between spilled variable and predecessors exit spills */ @@ -976,8 +980,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) for (aco_ptr& phi : instructions) { assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi); assert(!phi->definitions[0].isTemp() || - ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) == - ctx.spills_entry[block_idx].end()); + !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp())); std::vector& preds = phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; @@ -987,15 +990,18 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) unsigned pred_idx = preds[i]; /* if the operand was reloaded, rename */ - if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) == - ctx.spills_exit[pred_idx].end()) { + if (!ctx.spills_exit[pred_idx].count(phi->operands[i].getTemp())) { std::map::iterator it = ctx.renames[pred_idx].find(phi->operands[i].getTemp()); - if (it != ctx.renames[pred_idx].end()) + if (it != ctx.renames[pred_idx].end()) { phi->operands[i].setTemp(it->second); /* prevent the definining instruction from being DCE'd if it could be rematerialized */ - else if (ctx.remat.count(phi->operands[i].getTemp())) - ctx.remat_used[ctx.remat[phi->operands[i].getTemp()].instr] = true; + } else { + auto remat_it = ctx.remat.find(phi->operands[i].getTemp()); + if (remat_it != ctx.remat.end()) { + ctx.remat_used[remat_it->second.instr] = true; + } + } continue; } @@ -1034,7 +1040,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) for (std::pair>& pair : ctx.next_use_distances_start[block_idx]) { /* skip spilled variables */ - if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end()) + if (ctx.spills_entry[block_idx].count(pair.first)) continue; std::vector preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; @@ -1042,15 +1048,14 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* variable is dead at predecessor, it must be from a phi */ bool is_dead = false; for (unsigned pred_idx : preds) { - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == - ctx.next_use_distances_end[pred_idx].end()) + if (!ctx.next_use_distances_end[pred_idx].count(pair.first)) is_dead = true; } if (is_dead) continue; for (unsigned pred_idx : preds) { /* the variable is not spilled at the predecessor */ - if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) + if (!ctx.spills_exit[pred_idx].count(pair.first)) continue; /* variable is spilled at predecessor and has to be reloaded */ @@ -1076,7 +1081,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) Temp rename = Temp(); bool is_same = true; for (unsigned pred_idx : preds) { - if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) { + if (!ctx.renames[pred_idx].count(pair.first)) { if (rename == Temp()) rename = pair.first; else @@ -1100,7 +1105,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) rename = ctx.program->allocateTmp(pair.first.regClass()); for (unsigned i = 0; i < phi->operands.size(); i++) { Temp tmp; - if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) { + if (ctx.renames[preds[i]].count(pair.first)) { tmp = ctx.renames[preds[i]][pair.first]; } else if (preds[i] >= block_idx) { tmp = rename; @@ -1174,13 +1179,18 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s for (Operand& op : instr->operands) { if (!op.isTemp()) continue; - if (current_spills.find(op.getTemp()) == current_spills.end()) { + if (!current_spills.count(op.getTemp())) { /* the Operand is in register: check if it was renamed */ - if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end()) - op.setTemp(ctx.renames[block_idx][op.getTemp()]); - /* prevent it's definining instruction from being DCE'd if it could be rematerialized */ - else if (ctx.remat.count(op.getTemp())) - ctx.remat_used[ctx.remat[op.getTemp()].instr] = true; + auto rename_it = ctx.renames[block_idx].find(op.getTemp()); + if (rename_it != ctx.renames[block_idx].end()) { + op.setTemp(rename_it->second); + } else { + /* prevent its definining instruction from being DCE'd if it could be rematerialized */ + auto remat_it = ctx.remat.find(op.getTemp()); + if (remat_it != ctx.remat.end()) { + ctx.remat_used[remat_it->second.instr] = true; + } + } continue; } /* the Operand is spilled: add it to reloads */ @@ -1215,7 +1225,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s bool can_rematerialize = ctx.remat.count(pair.first); if (((pair.second > distance && can_rematerialize == do_rematerialize) || (can_rematerialize && !do_rematerialize && pair.second > idx)) && - current_spills.find(pair.first) == current_spills.end()) { + !current_spills.count(pair.first)) { to_spill = pair.first; distance = pair.second; do_rematerialize = can_rematerialize; @@ -1235,7 +1245,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s spilled_registers += to_spill; /* rename if necessary */ - if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) { + if (ctx.renames[block_idx].count(to_spill)) { to_spill = ctx.renames[block_idx][to_spill]; } @@ -1289,7 +1299,7 @@ spill_block(spill_ctx& ctx, unsigned block_idx) !ctx.renames[block_idx].empty() || ctx.remat_used.size(); for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) { - if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx) + if (ctx.next_use_distances_start[block_idx].at(it->first).first == block_idx) process = true; } -- GitLab From 4a78a05247c3e8c27a284194a0e7ba4cf4685377 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Thu, 15 Jul 2021 15:11:44 +0200 Subject: [PATCH 12/18] aco/spill: Replace an std::map to booleans with std::set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 5514d0e29500..3522a287a750 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -79,7 +79,7 @@ struct spill_ctx { std::vector> affinities; std::vector is_reloaded; std::map remat; - std::map remat_used; + std::set unused_remats; unsigned wave_size; spill_ctx(const RegisterDemand target_pressure_, Program* program_, @@ -339,7 +339,7 @@ do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) if (instr->operands[i].isTemp()) { assert(false && "unsupported"); if (ctx.remat.count(instr->operands[i].getTemp())) - ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true; + ctx.unused_remats.erase(ctx.remat[instr->operands[i].getTemp()].instr); } } res->definitions[0] = Definition(new_name); @@ -368,7 +368,7 @@ get_rematerialize_info(spill_ctx& ctx) for (const Definition& def : instr->definitions) { if (def.isTemp()) { ctx.remat[def.getTemp()] = remat_info{instr.get()}; - ctx.remat_used[instr.get()] = false; + ctx.unused_remats.insert(instr.get()); } } } @@ -879,7 +879,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) std::map::iterator rename_it = ctx.renames[pred_idx].find(var); /* prevent the definining instruction from being DCE'd if it could be rematerialized */ if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var)) - ctx.remat_used[ctx.remat[var].instr] = true; + ctx.unused_remats.erase(ctx.remat[var].instr); /* check if variable is already spilled at predecessor */ std::map::iterator spilled = ctx.spills_exit[pred_idx].find(var); @@ -999,7 +999,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } else { auto remat_it = ctx.remat.find(phi->operands[i].getTemp()); if (remat_it != ctx.remat.end()) { - ctx.remat_used[remat_it->second.instr] = true; + ctx.unused_remats.erase(remat_it->second.instr); } } continue; @@ -1113,7 +1113,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) tmp = pair.first; /* prevent the definining instruction from being DCE'd if it could be rematerialized */ if (ctx.remat.count(tmp)) - ctx.remat_used[ctx.remat[tmp].instr] = true; + ctx.unused_remats.erase(ctx.remat[tmp].instr); } phi->operands[i] = Operand(tmp); } @@ -1188,7 +1188,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s /* prevent its definining instruction from being DCE'd if it could be rematerialized */ auto remat_it = ctx.remat.find(op.getTemp()); if (remat_it != ctx.remat.end()) { - ctx.remat_used[remat_it->second.instr] = true; + ctx.unused_remats.erase(remat_it->second.instr); } } continue; @@ -1296,7 +1296,7 @@ spill_block(spill_ctx& ctx, unsigned block_idx) /* check conditions to process this block */ bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || - !ctx.renames[block_idx].empty() || ctx.remat_used.size(); + !ctx.renames[block_idx].empty() || ctx.unused_remats.size(); for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) { if (ctx.next_use_distances_start[block_idx].at(it->first).first == block_idx) @@ -1803,7 +1803,7 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) reload->definitions[0] = (*it)->definitions[0]; instructions.emplace_back(aco_ptr(reload)); } - } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) { + } else if (!ctx.unused_remats.count(it->get())) { instructions.emplace_back(std::move(*it)); } } -- GitLab From e60a70dde02f69a57946282feafd50a2efbe41e9 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Thu, 15 Jul 2021 15:14:41 +0200 Subject: [PATCH 13/18] aco/spill: Store remat list in an std::unordered_map instead of std::map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 3522a287a750..fbd396f3579b 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -78,7 +78,7 @@ struct spill_ctx { std::vector>> interferences; std::vector> affinities; std::vector is_reloaded; - std::map remat; + std::unordered_map remat; std::set unused_remats; unsigned wave_size; @@ -309,7 +309,7 @@ should_rematerialize(aco_ptr& instr) aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) { - std::map::iterator remat = ctx.remat.find(tmp); + std::unordered_map::iterator remat = ctx.remat.find(tmp); if (remat != ctx.remat.end()) { Instruction* instr = remat->second.instr; assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) && -- GitLab From a6bcda6a01ffefe8e23357b2da742b8fe89bc63b Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Thu, 15 Jul 2021 16:36:09 +0200 Subject: [PATCH 14/18] aco/spill: Change worklist to a single integer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index fbd396f3579b..bf4c8c9da0d5 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -168,7 +168,7 @@ get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) } void -next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) +next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) { Block* block = &ctx.program->blocks[block_idx]; std::unordered_map> next_uses = @@ -228,7 +228,7 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work const bool inserted = insert_result.second; std::pair& entry_distance = insert_result.first->second; if (inserted || entry_distance != distance) - worklist.insert(pred_idx); + worklist = std::max(worklist, pred_idx + 1); entry_distance = distance; } } @@ -254,7 +254,7 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work distance = std::min(entry_distance.second, distance); } if (entry_distance != std::pair{dom, distance}) { - worklist.insert(pred_idx); + worklist = std::max(worklist, pred_idx + 1); entry_distance = {dom, distance}; } } @@ -266,14 +266,10 @@ compute_global_next_uses(spill_ctx& ctx) { ctx.next_use_distances_start.resize(ctx.program->blocks.size()); ctx.next_use_distances_end.resize(ctx.program->blocks.size()); - std::set worklist; - for (Block& block : ctx.program->blocks) - worklist.insert(block.index); - - while (!worklist.empty()) { - std::set::reverse_iterator b_it = worklist.rbegin(); - unsigned block_idx = *b_it; - worklist.erase(block_idx); + + uint32_t worklist = ctx.program->blocks.size(); + while (worklist) { + unsigned block_idx = --worklist; next_uses_per_block(ctx, block_idx, worklist); } } -- GitLab From a3ac3b231fdab9f2c0c47914719612f3345b42c5 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Thu, 15 Jul 2021 17:11:24 +0200 Subject: [PATCH 15/18] aco/spill: Reduce allocations in next_uses_per_block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index bf4c8c9da0d5..c52004c4a868 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -171,13 +171,14 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) { Block* block = &ctx.program->blocks[block_idx]; - std::unordered_map> next_uses = - ctx.next_use_distances_end[block_idx]; + ctx.next_use_distances_start[block_idx] = ctx.next_use_distances_end[block_idx]; + auto& next_use_distances_start = ctx.next_use_distances_start[block_idx]; /* to compute the next use distance at the beginning of the block, we have to add the block's * size */ - for (std::unordered_map>::iterator it = next_uses.begin(); - it != next_uses.end(); ++it) + for (std::unordered_map>::iterator it = + next_use_distances_start.begin(); + it != next_use_distances_start.end(); ++it) it->second.second = it->second.second + block->instructions.size(); int idx = block->instructions.size() - 1; @@ -189,7 +190,7 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) for (const Definition& def : instr->definitions) { if (def.isTemp()) - next_uses.erase(def.getTemp()); + next_use_distances_start.erase(def.getTemp()); } for (const Operand& op : instr->operands) { @@ -199,24 +200,24 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) continue; if (op.isTemp()) - next_uses[op.getTemp()] = {block_idx, idx}; + next_use_distances_start[op.getTemp()] = {block_idx, idx}; } idx--; } - assert(block_idx != 0 || next_uses.empty()); - ctx.next_use_distances_start[block_idx] = next_uses; + assert(block_idx != 0 || next_use_distances_start.empty()); + std::unordered_set phi_defs; while (idx >= 0) { aco_ptr& instr = block->instructions[idx]; assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi); std::pair distance{block_idx, 0}; - auto it = instr->definitions[0].isTemp() ? next_uses.find(instr->definitions[0].getTemp()) - : next_uses.end(); - if (it != next_uses.end()) { + auto it = instr->definitions[0].isTemp() ? next_use_distances_start.find(instr->definitions[0].getTemp()) + : next_use_distances_start.end(); + if (it != next_use_distances_start.end() && + phi_defs.insert(instr->definitions[0].getTemp()).second) { distance = it->second; - next_uses.erase(it); } for (unsigned i = 0; i < instr->operands.size(); i++) { @@ -236,8 +237,11 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) } /* all remaining live vars must be live-out at the predecessors */ - for (std::pair>& pair : next_uses) { + for (std::pair>& pair : next_use_distances_start) { Temp temp = pair.first; + if (phi_defs.count(temp)) { + continue; + } uint32_t distance = pair.second.second; uint32_t dom = pair.second.first; std::vector& preds = temp.is_linear() ? block->linear_preds : block->logical_preds; -- GitLab From 7368f16fe81311277b916a85c1d79977501a5d24 Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Fri, 16 Jul 2021 11:33:32 +0200 Subject: [PATCH 16/18] aco/spill: Clarify use of long-lived references by adding const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index c52004c4a868..8e81eefab916 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -485,7 +485,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) return {0, 0}; /* next use distances at the beginning of the current block */ - auto& next_use_distances = ctx.next_use_distances_start[block_idx]; + const auto& next_use_distances = ctx.next_use_distances_start[block_idx]; /* loop header block */ if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) { @@ -532,7 +532,8 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) unsigned distance = 0; Temp to_spill; - for (std::pair>& pair : next_use_distances) { + for (const std::pair>& pair : + next_use_distances) { if (pair.first.type() == type && (pair.second.first >= loop_end || (ctx.remat.count(pair.first) && type == RegType::sgpr)) && @@ -574,7 +575,8 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) Temp to_spill; type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr; - for (std::pair>& pair : next_use_distances) { + for (const std::pair>& pair : + next_use_distances) { if (pair.first.type() == type && pair.second.second > distance && !ctx.spills_entry[block_idx].count(pair.first)) { to_spill = pair.first; @@ -649,7 +651,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) std::set partial_spills; /* keep variables spilled on all incoming paths */ - for (std::pair>& pair : next_use_distances) { + for (const std::pair>& pair : next_use_distances) { std::vector& preds = pair.first.is_linear() ? block->linear_preds : block->logical_preds; /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload @@ -731,8 +733,8 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) while (it != partial_spills.end()) { assert(!ctx.spills_entry[block_idx].count(*it)); - if (it->type() == type && next_use_distances[*it].second > distance) { - distance = next_use_distances[*it].second; + if (it->type() == type && next_use_distances.at(*it).second > distance) { + distance = next_use_distances.at(*it).second; to_spill = *it; } ++it; -- GitLab From 3fad5efd1527f284f1e6cf307e260aebd187811c Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Fri, 16 Jul 2021 12:17:29 +0200 Subject: [PATCH 17/18] aco/spill: Use unordered_map for spills_exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 8e81eefab916..3ad8c0e85537 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -68,7 +68,7 @@ struct spill_ctx { std::vector> register_demand; std::vector> renames; std::vector> spills_entry; - std::vector> spills_exit; + std::vector> spills_exit; std::vector processed; std::stack> loop_header; @@ -884,7 +884,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) ctx.unused_remats.erase(ctx.remat[var].instr); /* check if variable is already spilled at predecessor */ - std::map::iterator spilled = ctx.spills_exit[pred_idx].find(var); + auto spilled = ctx.spills_exit[pred_idx].find(var); if (spilled != ctx.spills_exit[pred_idx].end()) { if (spilled->second != def_spill_id) ctx.add_affinity(def_spill_id, spilled->second); @@ -934,7 +934,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) for (unsigned pred_idx : preds) { /* variable is already spilled at predecessor */ - std::map::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first); + auto spilled = ctx.spills_exit[pred_idx].find(pair.first); if (spilled != ctx.spills_exit[pred_idx].end()) { if (spilled->second != pair.second) ctx.add_affinity(pair.second, spilled->second); @@ -1306,7 +1306,7 @@ spill_block(spill_ctx& ctx, unsigned block_idx) } assert(ctx.spills_exit[block_idx].empty()); - ctx.spills_exit[block_idx] = current_spills; + ctx.spills_exit[block_idx] = std::unordered_map(current_spills.begin(), current_spills.end()); if (process) { process_block(ctx, block_idx, block, spilled_registers); } -- GitLab From fa655b6f703d21d8244041d1f4916d64d52985cf Mon Sep 17 00:00:00 2001 From: Tony Wasserka Date: Fri, 16 Jul 2021 12:19:28 +0200 Subject: [PATCH 18/18] aco/spill: Use std::unordered_map for spills_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db on Navi14: Totals from 305 (0.20% of 150305) affected shaders: CodeSize: 5498340 -> 5498328 (-0.00%) Instrs: 1009992 -> 1009989 (-0.00%) Latency: 33922644 -> 33923018 (+0.00%) InvThroughput: 9302963 -> 9303151 (+0.00%) VClause: 19004 -> 19001 (-0.02%) Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_spill.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 3ad8c0e85537..96f3bb85061e 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -67,7 +67,7 @@ struct spill_ctx { Program* program; std::vector> register_demand; std::vector> renames; - std::vector> spills_entry; + std::vector> spills_entry; std::vector> spills_exit; std::vector processed; @@ -1294,7 +1294,7 @@ spill_block(spill_ctx& ctx, unsigned block_idx) add_coupling_code(ctx, block, block_idx); } - const std::map& current_spills = ctx.spills_entry[block_idx]; + const auto& current_spills = ctx.spills_entry[block_idx]; /* check conditions to process this block */ bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) || @@ -1306,7 +1306,7 @@ spill_block(spill_ctx& ctx, unsigned block_idx) } assert(ctx.spills_exit[block_idx].empty()); - ctx.spills_exit[block_idx] = std::unordered_map(current_spills.begin(), current_spills.end()); + ctx.spills_exit[block_idx] = current_spills; if (process) { process_block(ctx, block_idx, block, spilled_registers); } -- GitLab