From 92d2671af6145cd55cf132c1fbd075064fbd30bc Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 11 Mar 2026 11:00:34 -0700 Subject: [PATCH] ir3: Late lowering of fmul+fadd to ffma Since we know our mad.f16/mad.f32 is unfused, we can also apply this opt in the exact case. Signed-off-by: Rob Clark Part-of: --- src/freedreno/ir3/ir3_compiler.c | 4 +- src/freedreno/ir3/ir3_context.c | 1 + src/freedreno/ir3/ir3_nir.c | 3 +- src/freedreno/ir3/ir3_nir.h | 1 + .../ir3/ir3_nir_opt_algebraic_late.py | 57 +++++++++++++++++++ src/freedreno/ir3/meson.build | 21 ++++++- 6 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 src/freedreno/ir3/ir3_nir_opt_algebraic_late.py diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index dac61355395..ee987cd8bf6 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -135,7 +135,9 @@ static const nir_shader_compiler_options ir3_base_options = { * SPIRV, and NIR don't require either fused or unfused behavior from * fma, and we'll turn mul+adds back into nir_op_ffma (again, implemented * as unfused) during nir_opt_algebraic_late() (assuming it's not - * decorated with GLSL's precise, or SPIRV's NoContraction). + * decorated with GLSL's precise, or SPIRV's NoContraction), or + * ir3_nir_opt_algebraic_late (if it is, since ir3's unfused mul-add is + * precise). */ .lower_ffma16 = true, .lower_ffma32 = true, diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index 3c05db88300..08a82537940 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -90,6 +90,7 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader, /* nir_opt_algebraic() above would have unfused our ffmas, re-fuse them. */ if (needs_late_alg) { NIR_PASS(progress, ctx->s, nir_opt_algebraic_late); + NIR_PASS(progress, ctx->s, ir3_nir_opt_algebraic_late); NIR_PASS(progress, ctx->s, nir_opt_dce); } diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 295578c2dda..333b1e6bb9c 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -1639,7 +1639,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, */ bool more_late_algebraic = true; while (more_late_algebraic) { - more_late_algebraic = OPT(s, nir_opt_algebraic_late); + more_late_algebraic = OPT(s, nir_opt_algebraic_late) || + OPT(s, ir3_nir_opt_algebraic_late); if (!more_late_algebraic && so->compiler->gen >= 5) { /* Lowers texture operations that have only f2f16 or u2u16 called on * them to have a 16-bit destination. Also, lower 16-bit texture diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index fb29eb5fe17..46006e9848b 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -62,6 +62,7 @@ nir_mem_access_size_align ir3_mem_access_size_align( bool ir3_nir_opt_branch_and_or_not(nir_shader *nir); bool ir3_nir_opt_triops_bitwise(nir_shader *nir); +bool ir3_nir_opt_algebraic_late(nir_shader *nir); struct ir3_optimize_options { nir_opt_uub_options opt_uub_options; diff --git a/src/freedreno/ir3/ir3_nir_opt_algebraic_late.py b/src/freedreno/ir3/ir3_nir_opt_algebraic_late.py new file mode 100644 index 00000000000..c35af819224 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_opt_algebraic_late.py @@ -0,0 +1,57 @@ +# +# Copyright © 2016 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import argparse +import sys + +# fuse fadd+fmul late to get something we can turn into mad.f32/f16. The +# common nir_opt_algebraic_late pass only does this for non-exact patterns. +# Since for us, mad is not fused, we don't have this restriction. +late_optimizations = [] + +a = 'a' +b = 'b' +c = 'c' + +for sz in [16, 32]: + # Fuse the correct fmul. Only consider fmuls where the only users are fadd + # (or fneg/fabs which are assumed to be propagated away), as a heuristic to + # avoid fusing in cases where it's harmful. + fmul = 'fmul(is_only_used_by_fadd)' + ffma = 'ffma' + + fadd = 'fadd@{}'.format(sz) + + late_optimizations.extend([ + ((fadd, (fmul, a, b), c), (ffma, a, b, c)), + + ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c), + (ffma, ('fneg', a), b, c)), + + ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c), + (ffma, ('fabs', a), ('fabs', b), c)), + + ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c), + (ffma, ('fneg', ('fabs', a)), ('fabs', b), c)), + ]) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "ir3_nir.h"') + print(nir_algebraic.AlgebraicPass("ir3_nir_opt_algebraic_late", + late_optimizations).render()) + + +if __name__ == '__main__': + main() diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 8668f7c5d5f..6ff1f43d9c6 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -45,6 +45,17 @@ ir3_nir_triop_bitwise_c = custom_target( depend_files : nir_algebraic_depends, ) +ir3_nir_opt_algebraic_late_c = custom_target( + 'ir3_nir_opt_algebraic_late.c', + input : 'ir3_nir_opt_algebraic_late.py', + output : 'ir3_nir_opt_algebraic_late.c', + command : [ + prog_python, '@INPUT@', '-p', dir_compiler_nir, + ], + capture : true, + depend_files : nir_algebraic_depends, +) + ir3_parser = custom_target( 'ir3_parser.[ch]', input: 'ir3_parser.y', @@ -134,7 +145,15 @@ libfreedreno_ir3_files = files( libfreedreno_ir3 = static_library( 'freedreno_ir3', - [libfreedreno_ir3_files, ir3_nir_trig_c, ir3_nir_imul_c, ir3_nir_branch_and_or_not_c, ir3_nir_triop_bitwise_c, ir3_parser[0], ir3_parser[1], ir3_lexer], + [libfreedreno_ir3_files, + ir3_nir_trig_c, + ir3_nir_imul_c, + ir3_nir_branch_and_or_not_c, + ir3_nir_triop_bitwise_c, + ir3_nir_opt_algebraic_late_c, + ir3_parser[0], ir3_parser[1], + ir3_lexer, + ], include_directories : [inc_freedreno, inc_include, inc_src], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden',