nir/opt_algebraic: optimize more fmulz(1.0, a) remains

If dxvk's open-coded fmulz gets partially constant-folded, it leaves this mess behind.

It's important to do this before the more general fmul+b2f patterns added in the next commit, because they change the signed zero behavior in a way that can't be optimized back.

Foz-DB Navi48:
Totals from 36 (0.03% of 114655) affected shaders:
Instrs: 16513 -> 15706 (-4.89%)
CodeSize: 99756 -> 95760 (-4.01%)
Latency: 45165 -> 44151 (-2.25%)
InvThroughput: 8344 -> 7886 (-5.49%)
VClause: 395 -> 401 (+1.52%)
Copies: 639 -> 634 (-0.78%)
PreSGPRs: 1158 -> 1154 (-0.35%)
PreVGPRs: 1227 -> 1225 (-0.16%)
VALU: 11310 -> 10769 (-4.78%)
SALU: 813 -> 809 (-0.49%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40399>
2026-03-14 07:11:50 +01:00 · 2026-03-14 07:11:50 +01:00 · d2b37b667e
commit d2b37b667e
parent 3ad142d4d7
1 changed files with 19 additions and 2 deletions
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@ -1571,6 +1571,24 @@ for op in ('ior', 'iand', 'ixor'):
        (('iand', (op, ('b2i', 'a@1'), ('ineg', ('b2i', 'b@1'))), 1),       ('b2i', (op, a, b)) ),
    ])

+
+# Clean up what is left of an open-coded fmulz(1.0, a) after partial constant folding.
+for compare in [('fneu', a, 0.0), ('inot', ('feq', a, 0.0))]:
+    # Entries 2k and 2k+1 differ only by an outer fneg, so "index ^ 1" flips the sign.
+    mod = [a, ('fneg', a), ('fabs', a), ('fneg', ('fabs', a))]
+    for i, search_mod in enumerate(mod):
+        for neg_b2f in [False, True]:
+            # A negated b2f in the search moves its fneg onto the replacement.
+            replace_mod = mod[i ^ int(neg_b2f)]
+            search_b2f = ('fneg', ('b2f', compare)) if neg_b2f else ('b2f', compare)
+            # NOTE(review): presumably fcanonicalize stands in for the removed multiply - confirm.
+            replace_mod_mul = ('fcanonicalize', a) if replace_mod == a else replace_mod
+
+            optimizations.extend([
+                (('fmul', search_b2f, search_mod), replace_mod_mul),
+                (('ffma', search_b2f, search_mod, b), ('fadd', replace_mod, b)),
+            ])
+
 optimizations.extend([
   (('feq', ('seq', a, b), 1.0), ('feq', a, b)),
   (('feq', ('sne', a, b), 1.0), ('fneu', a, b)),
@ -1599,13 +1617,12 @@ optimizations.extend([
   (('ffma', ('b2f', 'a@1'), ('b2f', 'b@1'), c), ('fadd', ('b2f', ('iand', a, b)), c)),
   (('fadd', 1.0, ('fneg', ('b2f', a))), ('b2f', ('inot', a))),
   (('fadd(nsz)', -1.0, ('b2f', a)), ('fneg', ('b2f', ('inot', a)))),
-   (('fmul', ('b2f', ('fneu', a, 0)), a), ('fmul', 1.0, a)),
-   (('ffma', ('b2f', ('fneu', a, 0)), a, b), ('fadd', a, b)),
   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
   (('fsat', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1')))), ('b2f', ('iand', a, ('inot', b)))),
   (('fmax', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('b2f', ('iand', a, ('inot', b)))),
   (('iand', 'a@bool16', 1.0), ('b2f', a)),
   (('iand', 'a@bool32', 1.0), ('b2f', a)),
+
   # Comparison with the same args.  Note that these are only done for the
   # float versions when the source must be a number.  Generally, NaN cmp NaN
   # produces the opposite result of X cmp X.  flt is the outlier.  NaN < NaN