From f9cd399eb0fb174e56a4e8e1e9b7e441e3b5a287 Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Date: Sun, 22 Mar 2026 16:12:21 +0100
Subject: [PATCH] ethosu: Fix ublock selection for 8-bit depthwise/pooling on
 U85-256

For U85-256 with 8-bit IFM, Vela's _uBlockToOpTable restricts which
microblocks are valid per operation type:

  {2,2,8}  and {4,1,8}:  conv, matmul, vectorprod, reducesum, eltwise, resize
  {2,1,16}:              depthwise, pool, eltwise, reduceminmax, argmax, resize

Mesa's find_ublock() did not enforce these constraints, so {4,1,8} or
{2,2,8} could be selected for depthwise/pooling operations purely on
the basis of minimum waste. For depthwise ops whose OFM shape aligned
better to {4,1,8}, the wrong ublock was chosen, causing incorrect
weight encoding and NPU hangs.

Fix by skipping {4,1,8} and {2,2,8} for depthwise/pooling operations,
matching Vela's operation-validity table.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39611>
---
 src/gallium/drivers/ethosu/ethosu_sched_u85.c | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/ethosu/ethosu_sched_u85.c b/src/gallium/drivers/ethosu/ethosu_sched_u85.c
index a5ad39ef2ec..70bd7b1d1e7 100644
--- a/src/gallium/drivers/ethosu/ethosu_sched_u85.c
+++ b/src/gallium/drivers/ethosu/ethosu_sched_u85.c
@@ -656,23 +656,31 @@ find_ublock(struct ethosu_operation *operation, bool is_part_kernel)
          continue; /* Skip 2x1x16 for part-kernel 1x1 */
       }
 
-      /* The 2x1x16 microblock is only valid for 16-bit IFM convolutions.
-       * For 8-bit IFM, it can only be used for depthwise/pooling operations.
-       * This matches the bitsToOperations table in Vela's ethos_u85.cpp.
+      /* U85-256 ublock-to-operation validity for 8-bit IFM
+       * (from Vela's _uBlockToOpTable in ethos_u85.cpp):
+       *
+       *   {2,2,8}  / Shape(2,2,8):  conv, matmul, vectorprod, reducesum, eltwise, resize
+       *   {4,1,8}  / Shape(1,4,8):  conv, matmul, vectorprod, reducesum, eltwise, resize
+       *   {2,1,16} / Shape(1,2,16): depthwise, pool, eltwise, reduceminmax, argmax, resize
+       *
+       * So for 8-bit IFM:
+       *  - depthwise/pooling can ONLY use {2,1,16}
+       *  - convolution (non-depthwise) CANNOT use {2,1,16}
        */
+
+      /* Skip {2,1,16} for 8-bit non-depthwise convolutions */
       if (ublk.width == 2 && ublk.height == 1 && ublk.depth == 16 &&
           operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION &&
           !is_depthwise) {
-         continue; /* Skip 2x1x16 for 8-bit IFM non-depthwise convolutions */
+         continue;
       }
 
-      /* For non-1x1 regular convolutions, skip 2x1x16 unless it's depthwise */
-      if (!is_pointwise && !is_depthwise &&
-          operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION &&
-          ublk.width == 2 &&
-          ublk.height == 1 &&
-          ublk.depth == 16)
+      /* Skip {4,1,8} and {2,2,8} for 8-bit depthwise/pooling —
+       * only {2,1,16} is valid for these operations at 8-bit */
+      if (!(ublk.width == 2 && ublk.height == 1 && ublk.depth == 16) &&
+          (is_depthwise || is_pooling)) {
          continue;
+      }
 
       /* Minimum waste is better than aspect correct */
       struct ethosu_block tmp = block_round_away(operation->ofm.shape, ublk);