/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <unordered_map>
#include <vector>

/*
 * Implements an algorithm to lower to Conventional SSA Form (CSSA),
 * after "Revisiting Out-of-SSA Translation for Correctness, Code Quality,
 * and Efficiency" by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin
 * and C. Guillon.
 *
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
 * The algorithm coalesces non-interfering phi-resources while taking value-equality
 * into account. It also re-indexes the SSA defs.
 */

namespace aco {
namespace {

typedef std::vector<Temp> merge_set;

struct copy {
   Definition def;
   Operand op;
};

struct merge_node {
   Operand value = Operand(); /* original value: can be an SSA-def or constant value */
   uint32_t index = -1u;      /* index into the vector of merge sets */
   uint32_t defined_at = -1u; /* defining block */

   /* we also remember two dominating defs with the same value: */
   Temp equal_anc_in = Temp();  /* within the same merge set */
   Temp equal_anc_out = Temp(); /* from a different set */
};

struct cssa_ctx {
   Program* program;
   std::vector<IDSet>& live_out;                  /* live-out sets per block */
   std::vector<std::vector<copy>> parallelcopies; /* copies per block */
   std::vector<merge_set> merge_sets;             /* each vector is one (ordered) merge set */
   std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
};

/* create (virtual) parallelcopies for each phi instruction and
 * already merge copy-definitions with phi-defs into merge sets */
void collect_parallelcopies(cssa_ctx& ctx)
{
   ctx.parallelcopies.resize(ctx.program->blocks.size());
   Builder bld(ctx.program);
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         const Definition& def = phi->definitions[0];

         /* if the definition is not a temp, it is the exec mask.
          * We can reload the exec mask directly from the spill slot.
          */
         if (!def.isTemp())
            continue;

         std::vector<unsigned>& preds =
            phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
         uint32_t index = ctx.merge_sets.size();
         merge_set set;

         bool has_preheader_copy = false;
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand op = phi->operands[i];
            if (op.isUndefined())
               continue;

            if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {
               /* SGPR inline constants and literals on GFX10+ can be spilled
                * and reloaded directly (without intermediate register) */
               if (op.isConstant()) {
                  if (ctx.program->chip_class >= GFX10)
                     continue;
                  if (op.size() == 1 && !op.isLiteral())
                     continue;
               } else {
                  assert(op.isFixed() && op.physReg() == exec);
                  continue;
               }
            }

            /* create new temporary and rename operands */
            Temp tmp = bld.tmp(def.regClass());
            ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});
            phi->operands[i] = Operand(tmp);

            /* place the new operands in the same merge set */
            set.emplace_back(tmp);
            ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};

            /* update the liveness information */
            if (op.isKill())
               ctx.live_out[preds[i]].erase(op.tempId());
            ctx.live_out[preds[i]].insert(tmp.id());

            has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;
         }

         if (set.empty())
            continue;

         /* place the definition in dominance-order */
         if (def.isTemp()) {
            if (has_preheader_copy)
               set.emplace(std::next(set.begin()), def.getTemp());
            else if (block.kind & block_kind_loop_header)
               set.emplace(set.begin(), def.getTemp());
            else
               set.emplace_back(def.getTemp());
            ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};
         }
         ctx.merge_sets.emplace_back(set);
      }
   }
}
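
/* Schematic example of the transformation above (made-up temp ids, not exact
 * ACO IR syntax): given a phi
 *
 *    BB3: %10 = p_phi %7 (from BB1), %8 (from BB2)
 *
 * the pass renames each operand to a fresh temporary and records a virtual
 * copy on the corresponding incoming edge:
 *
 *    BB1: %11 = %7   (pending parallelcopy, emitted later)
 *    BB2: %12 = %8   (pending parallelcopy, emitted later)
 *    BB3: %10 = p_phi %11, %12
 *
 * {%11, %12, %10} then form one merge set, kept in definition (dominance) order. */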

/* check whether the definition of a comes after b. */
inline bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
{
   merge_node& node_a = ctx.merge_node_table[a.id()];
   merge_node& node_b = ctx.merge_node_table[b.id()];
   if (node_a.defined_at == node_b.defined_at)
      return a.id() > b.id();

   return node_a.defined_at > node_b.defined_at;
}

/* check whether a dominates b where b is defined after a */
inline bool dominates(cssa_ctx& ctx, Temp a, Temp b)
{
   assert(defined_after(ctx, b, a));
   merge_node& node_a = ctx.merge_node_table[a.id()];
   merge_node& node_b = ctx.merge_node_table[b.id()];
   unsigned idom = node_b.defined_at;
   while (idom > node_a.defined_at)
      idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
                                                  : ctx.program->blocks[idom].linear_idom;

   return idom == node_a.defined_at;
}

/* check intersection between var and parent:
 * We already know that parent dominates var. */
inline bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
{
   merge_node& node_var = ctx.merge_node_table[var.id()];
   merge_node& node_parent = ctx.merge_node_table[parent.id()];
   assert(node_var.index != node_parent.index);
   uint32_t block_idx = node_var.defined_at;

   /* if the parent is live-out at the definition block of var, they intersect */
   bool parent_live = ctx.live_out[block_idx].count(parent.id());
   if (parent_live)
      return true;

   /* parent is defined in a different block than var */
   if (node_parent.defined_at < node_var.defined_at) {
      /* if the parent is not live-in, they don't interfere */
      std::vector<unsigned>& preds = var.type() == RegType::vgpr
                                        ? ctx.program->blocks[block_idx].logical_preds
                                        : ctx.program->blocks[block_idx].linear_preds;
      for (uint32_t pred : preds) {
         if (!ctx.live_out[pred].count(parent.id()))
            return false;
      }
   }

   for (const copy& cp : ctx.parallelcopies[block_idx]) {
      /* if var is defined at the edge, they don't intersect */
      if (cp.def.getTemp() == var)
         return false;
      if (cp.op.isTemp() && cp.op.getTemp() == parent)
         parent_live = true;
   }
   /* if the parent is live at the edge, they intersect */
   if (parent_live)
      return true;

   /* both parent and var are present in the same block */
   const Block& block = ctx.program->blocks[block_idx];
   for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {
      /* if the parent was not encountered yet, it can only be used by a phi */
      if (is_phi(it->get()))
         break;

      for (const Definition& def : (*it)->definitions) {
         if (!def.isTemp())
            continue;
         /* if parent was not found yet, they don't intersect */
         if (def.getTemp() == var)
            return false;
      }

      for (const Operand& op : (*it)->operands) {
         if (!op.isTemp())
            continue;
         /* if the var was defined before this point, they intersect */
         if (op.getTemp() == parent)
            return true;
      }
   }

   return false;
}
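
/* Note on intersects(): since parent dominates var, their live ranges can only
 * overlap if parent is still live at var's definition point. The test checks the
 * live-out set of var's defining block, the live-in information when parent comes
 * from another block, the parallelcopies pending at the end of the block, and
 * finally walks the block backwards until it either reaches var's definition
 * (no intersection) or a use of parent (intersection). This is the dominance-based
 * interference test from the paper referenced at the top of this file. */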

/* check interference between var and parent:
 * i.e. they have different values and intersect.
 * If parent and var share the same value, also updates the equal ancestor. */
inline bool interference(cssa_ctx& ctx, Temp var, Temp parent)
{
   assert(var != parent);
   merge_node& node_var = ctx.merge_node_table[var.id()];
   node_var.equal_anc_out = Temp();

   if (node_var.index == ctx.merge_node_table[parent.id()].index) {
      /* check/update in other set */
      parent = ctx.merge_node_table[parent.id()].equal_anc_out;
   }

   Temp tmp = parent;
   /* check if var intersects with parent or any equal-valued ancestor */
   while (tmp != Temp() && !intersects(ctx, var, tmp)) {
      merge_node& node_tmp = ctx.merge_node_table[tmp.id()];
      tmp = node_tmp.equal_anc_in;
   }

   /* no intersection found */
   if (tmp == Temp())
      return false;

   /* var and parent share the same value, but are in different sets */
   if (node_var.value == ctx.merge_node_table[parent.id()].value) {
      node_var.equal_anc_out = tmp;
      return false;
   }

   /* var and parent have different values and intersect */
   return true;
}

/* tries to merge set_b into the set of the given temporary and
 * drops that temporary as it is being coalesced */
bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
{
   auto def_node_it = ctx.merge_node_table.find(dst.id());
   uint32_t index = def_node_it->second.index;
   merge_set& set_a = ctx.merge_sets[index];
   std::vector<Temp> dom; /* stack of the traversal */
   merge_set union_set;   /* the new merged merge-set */
   uint32_t i_a = 0;
   uint32_t i_b = 0;

   while (i_a < set_a.size() || i_b < set_b.size()) {
      Temp current;
      if (i_a == set_a.size())
         current = set_b[i_b++];
      else if (i_b == set_b.size())
         current = set_a[i_a++];
      /* else pick the one defined first */
      else if (defined_after(ctx, set_a[i_a], set_b[i_b]))
         current = set_b[i_b++];
      else
         current = set_a[i_a++];

      while (!dom.empty() && !dominates(ctx, dom.back(), current))
         dom.pop_back(); /* not the desired parent, remove */

      if (!dom.empty() && interference(ctx, current, dom.back()))
         return false; /* intersection detected */

      dom.emplace_back(current); /* otherwise, keep checking */
      if (current != dst)
         union_set.emplace_back(current); /* maintain the new merge-set sorted */
   }

   /* update hashmap */
   for (Temp t : union_set) {
      merge_node& node = ctx.merge_node_table[t.id()];
      /* update the equal ancestors:
       * i.e. the 'closest' dominating def with the same value */
      Temp in = node.equal_anc_in;
      Temp out = node.equal_anc_out;
      if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))
         node.equal_anc_in = out;
      node.equal_anc_out = Temp();

      /* update merge-set index */
      node.index = index;
   }
   set_b = merge_set();                  /* free the old set_b */
   ctx.merge_sets[index] = union_set;
   ctx.merge_node_table.erase(dst.id()); /* remove the temporary */

   return true;
}
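
/* The merge above follows the linear-time scheme from the paper: both sets are
 * already ordered by definition (dominance) order, so a single pass with a stack
 * of dominating ancestors ('dom') suffices to test all pairs that could interfere.
 * Schematically:
 *
 *    for each value in dominance order:
 *       pop stack entries that do not dominate the value
 *       if the top of the stack interferes with the value -> abort the merge
 *       push the value
 *
 * Only if no interference is found does the union become the new merge set. */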

/* returns true if the copy can safely be omitted */
bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
{
   /* we can only coalesce temporaries */
   if (!copy.op.isTemp())
      return false;

   /* try to emplace a merge_node for the copy operand */
   merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];
   if (op_node.defined_at == -1u) {
      /* find the defining block of the operand */
      uint32_t pred = block_idx;
      do {
         block_idx = pred;
         pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
                                                           : ctx.program->blocks[pred].linear_idom;
      } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
      op_node.defined_at = block_idx;
      op_node.value = copy.op;
   }

   /* we can only coalesce copies of the same register class */
   if (copy.op.regClass() != copy.def.regClass())
      return false;

   /* check if this operand has not yet been coalesced */
   if (op_node.index == -1u) {
      merge_set op_set = merge_set{copy.op.getTemp()};
      return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);
   }

   /* check if this operand has been coalesced into the same set */
   assert(ctx.merge_node_table.count(copy.def.tempId()));
   if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)
      return true;

   /* otherwise, try to coalesce both merge sets */
   return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);
}

/* node in the location-transfer-graph */
struct ltg_node {
   copy cp;
   uint32_t read_idx;
   uint32_t num_uses = 0;
};

/* emit the copies in an order that does not
 * create interferences within a merge-set */
void emit_copies_block(Builder& bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
{
   auto&& it = ltg.begin();
   while (it != ltg.end()) {
      const copy& cp = it->second.cp;
      /* wrong regclass or still needed as operand */
      if (cp.def.regClass().type() != type || it->second.num_uses > 0) {
         ++it;
         continue;
      }

      /* emit the copy */
      bld.copy(cp.def, it->second.cp.op);

      /* update the location transfer graph */
      if (it->second.read_idx != -1u) {
         auto&& other = ltg.find(it->second.read_idx);
         if (other != ltg.end())
            other->second.num_uses--;
      }
      ltg.erase(it);
      it = ltg.begin();
   }

   /* count the number of remaining circular dependencies */
   unsigned num = std::count_if(ltg.begin(), ltg.end(),
                                [&](auto& n) { return n.second.cp.def.regClass().type() == type; });

   /* if there are circular dependencies, we just emit them as a single parallelcopy */
   if (num) {
      // TODO: this should be restricted to a feasible number of registers
      // and otherwise use a temporary to avoid having to reload more (spilled)
      // variables than we have registers.
      aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
         aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
      it = ltg.begin();
      for (unsigned i = 0; i < num; i++) {
         while (it->second.cp.def.regClass().type() != type)
            ++it;
         copy->definitions[i] = it->second.cp.def;
         copy->operands[i] = it->second.cp.op;
         it = ltg.erase(it);
      }
      bld.insert(std::move(copy));
   }
}
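
/* Example of why the location-transfer-graph is needed (schematic, made-up temps):
 * the pending copies
 *
 *    %a = %b
 *    %b = %a
 *
 * form a cycle, so neither copy can be emitted first without clobbering the other's
 * source. emit_copies_block() therefore first emits every copy whose definition is
 * not read by another pending copy; whatever remains is cyclic and is emitted as a
 * single p_parallelcopy to be resolved later. */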

/* either emits or coalesces all parallelcopies and
 * renames the phi-operands accordingly */
void emit_parallelcopies(cssa_ctx& ctx)
{
   std::unordered_map<uint32_t, Operand> renames;

   /* we iterate backwards to prioritize coalescing in else-blocks */
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
      if (ctx.parallelcopies[i].empty())
         continue;

      std::map<uint32_t, ltg_node> ltg;
      bool has_vgpr_copy = false;
      bool has_sgpr_copy = false;

      /* first, try to coalesce all parallelcopies */
      for (const copy& cp : ctx.parallelcopies[i]) {
         if (try_coalesce_copy(ctx, cp, i)) {
            renames.emplace(cp.def.tempId(), cp.op);
            /* update liveness info */
            ctx.live_out[i].erase(cp.def.tempId());
            ctx.live_out[i].insert(cp.op.tempId());
         } else {
            uint32_t read_idx = -1u;
            if (cp.op.isTemp())
               read_idx = ctx.merge_node_table[cp.op.tempId()].index;
            uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;
            assert(write_idx != -1u);
            ltg[write_idx] = {cp, read_idx};

            bool is_vgpr = cp.def.regClass().type() == RegType::vgpr;
            has_vgpr_copy |= is_vgpr;
            has_sgpr_copy |= !is_vgpr;
         }
      }

      /* build location-transfer-graph */
      for (auto& pair : ltg) {
         if (pair.second.read_idx == -1u)
            continue;
         auto&& it = ltg.find(pair.second.read_idx);
         if (it != ltg.end())
            it->second.num_uses++;
      }

      /* emit parallelcopies ordered */
      Builder bld(ctx.program);
      Block& block = ctx.program->blocks[i];

      if (has_vgpr_copy) {
         /* emit VGPR copies */
         auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
         { return inst->opcode == aco_opcode::p_logical_end; };
         auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
         bld.reset(&block.instructions, std::prev(it.base()));
         emit_copies_block(bld, ltg, RegType::vgpr);
      }

      if (has_sgpr_copy) {
         /* emit SGPR copies */
         aco_ptr<Instruction> branch = std::move(block.instructions.back());
         block.instructions.pop_back();
         bld.reset(&block.instructions);
         emit_copies_block(bld, ltg, RegType::sgpr);
         bld.insert(std::move(branch));
      }
   }

   /* finally, rename coalesced phi operands */
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         for (Operand& op : phi->operands) {
            if (!op.isTemp())
               continue;
            auto&& it = renames.find(op.tempId());
            if (it != renames.end()) {
               op = it->second;
               renames.erase(it);
            }
         }
      }
   }
   assert(renames.empty());
}

} /* end namespace */

void lower_to_cssa(Program* program, live& live_vars)
{
   reindex_ssa(program, live_vars.live_out);
   cssa_ctx ctx = {program, live_vars.live_out};
   collect_parallelcopies(ctx);
   emit_parallelcopies(ctx);

   /* update live variable information */
   live_vars = live_var_analysis(program);
}
} // namespace aco