ethosu: Add a separate scheduler for the U85
As the performance details have changed quite a bit. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39611>
This commit is contained in:
parent
82d4f21106
commit
2cf3d0b273
4 changed files with 977 additions and 1 deletions
|
|
@ -4,6 +4,7 @@
|
|||
*/
|
||||
|
||||
#include "ethosu_sched.h"
|
||||
#include "ethosu_sched_u85.h"
|
||||
|
||||
static int
|
||||
required_input_size(int value, int stride, int border)
|
||||
|
|
@ -204,5 +205,10 @@ find_block_config(struct ethosu_subgraph *subgraph, struct ethosu_operation *ope
|
|||
void
|
||||
ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
|
||||
{
|
||||
operation->block_config = find_block_config(subgraph, operation);
|
||||
struct ethosu_screen *screen = ethosu_screen(subgraph->base.context->screen);
|
||||
|
||||
if (ethosu_is_u65(screen))
|
||||
operation->block_config = find_block_config(subgraph, operation);
|
||||
else
|
||||
operation->block_config = find_block_config_u85(subgraph, operation);
|
||||
}
|
||||
|
|
|
|||
955
src/gallium/drivers/ethosu/ethosu_sched_u85.c
Normal file
955
src/gallium/drivers/ethosu/ethosu_sched_u85.c
Normal file
|
|
@ -0,0 +1,955 @@
|
|||
/*
|
||||
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Ethos-U85 Scheduler - Converted from Vela's ethos_u85.cpp
|
||||
*/
|
||||
|
||||
#include "ethosu_sched_u85.h"
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include "util/macros.h"
|
||||
#include "ethosu_ml.h"
|
||||
|
||||
#define ACC_DEPTH_GRANULE_U85 16
|
||||
#define OFM_SPLIT_DEPTH 16
|
||||
#define CB_SLOTS 6
|
||||
#define BRICK_ELEMENTS 16
|
||||
|
||||
/* Helper structure for block configuration search */
|
||||
struct find_config_common {
|
||||
struct ethosu_block ofm_block_max;
|
||||
struct ethosu_block ublock;
|
||||
struct ethosu_block granule;
|
||||
int acc_bits;
|
||||
int ifm_block_depth;
|
||||
bool is_pooling;
|
||||
};
|
||||
|
||||
/* Forward declarations */
|
||||
static struct ethosu_block calc_ifm_au_size_u85(int ifm_block_depth,
|
||||
struct ethosu_block ofm_ublock,
|
||||
int macs);
|
||||
static bool try_block_config_u85(enum ethosu_operation_type op_type,
|
||||
struct ethosu_block ofm_block,
|
||||
struct ethosu_block ifm_block,
|
||||
struct ethosu_block ifm_shape,
|
||||
int acc_bits, int ifm_space, int acc_space,
|
||||
int ifm_au_depth, int num_blocks_in_ram,
|
||||
bool is_equal_depth_op);
|
||||
|
||||
/* Round away from zero */
|
||||
static int
|
||||
round_away(int value, int align)
|
||||
{
|
||||
int rem = value % align;
|
||||
if (rem == 0) {
|
||||
return value;
|
||||
} else if (rem < 0) {
|
||||
return value - (align + rem);
|
||||
}
|
||||
return value + (align - rem);
|
||||
}
|
||||
|
||||
/* Round toward zero */
|
||||
static int
|
||||
round_zero(int value, int align)
|
||||
{
|
||||
assert(align > 0);
|
||||
return value - (value % align);
|
||||
}
|
||||
|
||||
/* Calculate required input size for given output, stride, and kernel */
|
||||
static int
|
||||
required_input_size_u85(int value, int stride, int border, int upscale, int rounding)
|
||||
{
|
||||
return (int)ceil((float)((value - 1) * stride + border + rounding) / (float)upscale);
|
||||
}
|
||||
|
||||
/* Get architecture-specific IFM block size */
|
||||
static struct ethosu_block
|
||||
get_arch_ifm_block_size_u85(struct ethosu_block ofm_block,
|
||||
struct ethosu_operation *operation,
|
||||
struct ethosu_block au_block,
|
||||
struct ethosu_block subkernel_limit,
|
||||
int upscale, int rounding,
|
||||
int ifm_block_depth)
|
||||
{
|
||||
struct ethosu_block result;
|
||||
int dilated_h = operation->kernel.height * operation->kernel.dilation_y -
|
||||
(operation->kernel.dilation_y - 1);
|
||||
int dilated_w = operation->kernel.width * operation->kernel.dilation_x -
|
||||
(operation->kernel.dilation_x - 1);
|
||||
|
||||
/* IFM block height */
|
||||
int h = required_input_size_u85(ofm_block.height, operation->kernel.stride_y,
|
||||
MIN2(dilated_h, subkernel_limit.height),
|
||||
upscale, rounding);
|
||||
h = round_away(h, au_block.height);
|
||||
|
||||
/* IFM block width */
|
||||
int w = required_input_size_u85(ofm_block.width, operation->kernel.stride_x,
|
||||
MIN2(dilated_w, subkernel_limit.width),
|
||||
upscale, rounding);
|
||||
w = round_away(w, au_block.width);
|
||||
|
||||
result.height = h;
|
||||
result.width = w;
|
||||
result.depth = round_away(ifm_block_depth ? ifm_block_depth : ofm_block.depth,
|
||||
au_block.depth);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Fit area by aspect ratio */
|
||||
static void
|
||||
fit_area_by_aspect(double aspect, int *width, int *height, int area,
|
||||
struct ethosu_block granule)
|
||||
{
|
||||
double w = sqrt(area / aspect);
|
||||
double h = w * aspect;
|
||||
|
||||
*width = round_zero(MAX2((int)w, granule.width), granule.width);
|
||||
*height = round_zero(MAX2((int)h, granule.height), granule.height);
|
||||
}
|
||||
|
||||
/* Find best tile size */
|
||||
static int
|
||||
best_tile(int range, int granule, int tile)
|
||||
{
|
||||
assert(range >= 0 && granule > 0 && tile > 0);
|
||||
assert((tile % granule) == 0);
|
||||
|
||||
int best = tile;
|
||||
|
||||
if ((range % tile) == 0)
|
||||
return best;
|
||||
|
||||
int n_tiles = (range + tile - 1) / tile;
|
||||
best = (range + n_tiles - 1) / n_tiles;
|
||||
best = round_away(best, granule);
|
||||
|
||||
return best;
|
||||
}
|
||||
|
||||
/* Granular tile calculation */
|
||||
static int
|
||||
granular_tile(int range, int granule, int tile)
|
||||
{
|
||||
assert(range >= 0 && granule > 0 && tile > 0);
|
||||
assert((tile % granule) == 0 && "tile must be multiple of granule");
|
||||
|
||||
if (range % tile == 0)
|
||||
return tile;
|
||||
|
||||
int tiles = range / tile; /* Integer division */
|
||||
|
||||
return MAX2(round_away(range / (tiles + 1), granule), granule);
|
||||
}
|
||||
|
||||
static int
|
||||
reapportion(int value, int max_limit, int *avail, int required)
|
||||
{
|
||||
assert(required > 0 && value > 0);
|
||||
int excess = MIN2(*avail / required, max_limit / value);
|
||||
if (excess >= 2) {
|
||||
*avail /= excess;
|
||||
value *= excess;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/* Area fitting algorithm for U85 */
|
||||
static struct ethosu_block
|
||||
area_fit_u85(struct find_config_common common,
|
||||
struct ethosu_block ofm_shape,
|
||||
struct ethosu_block ofm_block_limit,
|
||||
struct ethosu_block ifm_shape,
|
||||
struct ethosu_operation *operation,
|
||||
int acc_ram_size, int ifm_ram_size, int macs)
|
||||
{
|
||||
const int acc_elements = (acc_ram_size * 8) / common.acc_bits;
|
||||
const int ib_elements = ifm_ram_size;
|
||||
const int ub_acc_elements = common.ublock.width * common.ublock.height * ACC_DEPTH_GRANULE_U85;
|
||||
|
||||
assert(ub_acc_elements > 0);
|
||||
|
||||
double aspect = (double)ofm_shape.height / ofm_shape.width;
|
||||
/* Prioritise depth only for 1x1 kernels (after dilation) */
|
||||
int dilated_w = operation->kernel.dilation_x * operation->kernel.width;
|
||||
int dilated_h = operation->kernel.dilation_y * operation->kernel.height;
|
||||
bool prioritise_depth = (dilated_w == 1 && dilated_h == 1);
|
||||
|
||||
struct ethosu_block fit_shape = {0};
|
||||
double best_metric = DBL_MAX;
|
||||
int max_depth = MIN2(MAX2(macs, acc_elements / ub_acc_elements),
|
||||
ofm_block_limit.depth);
|
||||
|
||||
double ofm_area = prioritise_depth ? (ofm_shape.width * ofm_shape.depth) : (ofm_shape.width * ofm_shape.height);
|
||||
|
||||
struct ethosu_block subkernel_max = SUB_KERNEL_MAX;
|
||||
|
||||
for (int depth = ACC_DEPTH_GRANULE_U85; depth <= max_depth;
|
||||
depth += ACC_DEPTH_GRANULE_U85) {
|
||||
|
||||
int width = 0, height = 0;
|
||||
int fit_acc = acc_elements;
|
||||
struct ethosu_block ifm_alloc_unit = calc_ifm_au_size_u85(depth, common.ublock, macs);
|
||||
int ifm_depth_granule = ifm_alloc_unit.depth;
|
||||
|
||||
/* Round IFM shape to allocation unit */
|
||||
struct ethosu_block ifm_rounded;
|
||||
ifm_rounded.width = round_away(ifm_shape.width, ifm_alloc_unit.width);
|
||||
ifm_rounded.height = round_away(ifm_shape.height, ifm_alloc_unit.height);
|
||||
ifm_rounded.depth = round_away(ifm_shape.depth, ifm_alloc_unit.depth);
|
||||
|
||||
int ifm_volume = ifm_rounded.width * ifm_rounded.height * ifm_rounded.depth;
|
||||
bool fitted = false;
|
||||
int prev_acc_req = -1;
|
||||
int retry = 25;
|
||||
|
||||
while (true) {
|
||||
fit_area_by_aspect(aspect, &width, &height, fit_acc / depth,
|
||||
common.granule);
|
||||
|
||||
width = MIN2(width, ofm_block_limit.width);
|
||||
height = MIN2(height, ofm_block_limit.height);
|
||||
|
||||
/* Regular width tiling is preferred */
|
||||
int tmp = width * height;
|
||||
if (width < ofm_shape.width) {
|
||||
width = best_tile(ofm_shape.width, common.granule.width, width);
|
||||
}
|
||||
height = MAX2(round_zero(tmp / width, common.granule.height),
|
||||
common.granule.height);
|
||||
|
||||
/* Accumulator fit */
|
||||
int acc_required = width * height * depth;
|
||||
double ab_ratio = (double)acc_elements / acc_required;
|
||||
|
||||
/* IFM fit */
|
||||
struct ethosu_block ofm_block = {width, height, depth};
|
||||
struct ethosu_block ifm_req = get_arch_ifm_block_size_u85(
|
||||
ofm_block, operation, ifm_alloc_unit, subkernel_max, 1, 0, 0);
|
||||
int ib_required = ifm_req.width * ifm_req.height * ifm_req.depth;
|
||||
assert(ib_required > 0);
|
||||
ib_required = MIN2(ib_required, ifm_volume);
|
||||
double ib_ratio = (double)ib_elements / ib_required;
|
||||
|
||||
if (ab_ratio >= 1.0 && ib_ratio >= 1.0) {
|
||||
int ifm_used = (depth % ifm_depth_granule) ? (depth % ifm_depth_granule) : ifm_depth_granule;
|
||||
double waste = 1.0 + ((double)(ifm_depth_granule - ifm_used) /
|
||||
ifm_depth_granule) /
|
||||
10.0;
|
||||
double fit = 1.0 + fabs(1.0 - ((double)ofm_block_limit.depth / depth)) / 10.0;
|
||||
int other_axis = prioritise_depth ? depth : height;
|
||||
double coverage = 1.0 + fabs(1.0 - (ofm_area / (width * other_axis)));
|
||||
double metric = coverage * fit * waste;
|
||||
|
||||
if (metric < best_metric) {
|
||||
fit_shape.width = width;
|
||||
fit_shape.height = height;
|
||||
fit_shape.depth = depth;
|
||||
best_metric = metric;
|
||||
|
||||
/* Early exit if it covers the entire OFM */
|
||||
if (depth >= ofm_shape.depth &&
|
||||
width >= ofm_shape.width && height >= ofm_shape.height) {
|
||||
fit_shape.width = round_away(ofm_shape.width, common.granule.width);
|
||||
fit_shape.height = round_away(ofm_shape.height, common.granule.height);
|
||||
fit_shape.depth = round_away(ofm_shape.depth, common.granule.depth);
|
||||
return fit_shape;
|
||||
}
|
||||
}
|
||||
fitted = true;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Force scaling if no progress */
|
||||
if (acc_required == prev_acc_req) {
|
||||
if (--retry <= 0) {
|
||||
ib_ratio = 0.9;
|
||||
}
|
||||
}
|
||||
prev_acc_req = acc_required;
|
||||
|
||||
/* Fast reduce if IB requirement not met */
|
||||
if ((ib_ratio < 1.0) && (ab_ratio > 1.0)) {
|
||||
fit_acc = MIN2(fit_acc, acc_required);
|
||||
}
|
||||
|
||||
/* Scale down and retry */
|
||||
double ratio = MIN2(ib_ratio, ab_ratio);
|
||||
int new_acc = (int)(fit_acc * ratio);
|
||||
new_acc = round_away(new_acc, ub_acc_elements);
|
||||
|
||||
/* No scaling progress */
|
||||
if (new_acc == fit_acc) {
|
||||
new_acc = fit_acc - ub_acc_elements;
|
||||
}
|
||||
|
||||
/* No fit possible */
|
||||
if (new_acc < ub_acc_elements) {
|
||||
break;
|
||||
}
|
||||
fit_acc = new_acc;
|
||||
}
|
||||
|
||||
/* Nothing fitted at this depth */
|
||||
if (!fitted) {
|
||||
mesa_loge("area_fit_u85: no solution found");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return fit_shape;
|
||||
}
|
||||
|
||||
/* Find elementwise block configuration */
|
||||
static struct ethosu_block
|
||||
find_elementwise_config_u85(struct ethosu_operation *operation,
|
||||
struct find_config_common common,
|
||||
int cb_ram_size, int ob_ram_size)
|
||||
{
|
||||
struct ethosu_block ofm_shape = operation->ofm.shape;
|
||||
|
||||
/* Pad and round OFM shape */
|
||||
ofm_shape.width = round_away(ofm_shape.width, common.granule.width);
|
||||
ofm_shape.height = round_away(ofm_shape.height, common.granule.height);
|
||||
ofm_shape.depth = round_away(ofm_shape.depth, common.granule.depth);
|
||||
|
||||
struct ethosu_block ofm_block_limit;
|
||||
ofm_block_limit.width = MIN2(ofm_shape.width, common.ofm_block_max.width);
|
||||
ofm_block_limit.height = MIN2(ofm_shape.height, common.ofm_block_max.height);
|
||||
ofm_block_limit.depth = MIN2(ofm_shape.depth, common.ofm_block_max.depth);
|
||||
|
||||
const bool is_scalar = false; /* TODO: Check if IFM2 is scalar */
|
||||
const int cb_bricks = (cb_ram_size / CB_SLOTS) / (BRICK_ELEMENTS * (8 / 8));
|
||||
const int ob_elements = (ob_ram_size * 8) / 8;
|
||||
const int ob_width_units = ob_elements / common.granule.depth;
|
||||
const int cb_width_units = cb_bricks / common.ublock.height;
|
||||
|
||||
int h_limit = granular_tile(ofm_shape.height, common.granule.height,
|
||||
ofm_block_limit.height);
|
||||
int w_required = granular_tile(ofm_shape.width, common.granule.width,
|
||||
MIN2(ob_width_units, ofm_block_limit.width));
|
||||
/* Start with full OB capacity; reapportion() may reduce this */
|
||||
int w_limit = ob_width_units;
|
||||
int c_limit = reapportion(common.granule.depth, ofm_block_limit.depth,
|
||||
&w_limit, MAX2(w_required, common.granule.width));
|
||||
|
||||
/* Binary elementwise, potentially broadcast */
|
||||
if (!is_scalar) {
|
||||
/* For now, assume no broadcasting (broadcastMask = 0) */
|
||||
unsigned broadcast_mask = 0; /* TODO: Calculate based on IFM shapes */
|
||||
|
||||
/* Broadcast in depth first */
|
||||
if (broadcast_mask & 1) {
|
||||
c_limit = granular_tile(ofm_shape.depth, common.granule.depth, ofm_block_limit.depth);
|
||||
w_limit = cb_width_units;
|
||||
|
||||
/* This is only the depth axis */
|
||||
if ((broadcast_mask & 1) == broadcast_mask)
|
||||
h_limit = reapportion(common.ublock.height, ofm_block_limit.height, &w_limit,
|
||||
MAX2(w_required, common.granule.width));
|
||||
}
|
||||
/* Broadcast in height */
|
||||
else if (broadcast_mask & 4) {
|
||||
w_limit = cb_width_units;
|
||||
c_limit = reapportion(BRICK_ELEMENTS, ofm_block_limit.depth, &w_limit,
|
||||
MAX2(w_required, common.granule.width));
|
||||
}
|
||||
/* Broadcast in width */
|
||||
else if (broadcast_mask & 2) {
|
||||
/* No change */
|
||||
}
|
||||
}
|
||||
|
||||
struct ethosu_block ofm_block;
|
||||
ofm_block.height = h_limit;
|
||||
ofm_block.width = w_limit;
|
||||
ofm_block.depth = c_limit;
|
||||
|
||||
/* Round to granule */
|
||||
ofm_block.width = round_away(ofm_block.width, common.granule.width);
|
||||
ofm_block.height = round_away(ofm_block.height, common.granule.height);
|
||||
ofm_block.depth = round_away(ofm_block.depth, common.granule.depth);
|
||||
|
||||
return ofm_block;
|
||||
}
|
||||
|
||||
/* Find depthwise block configuration */
|
||||
static struct ethosu_block
|
||||
find_depthwise_config_u85(struct ethosu_operation *operation,
|
||||
struct find_config_common common,
|
||||
struct ethosu_block *ifm_block_out,
|
||||
int acc_ram_size, int ifm_ram_size, int macs)
|
||||
{
|
||||
struct ethosu_block ofm_shape = operation->ofm.shape;
|
||||
struct ethosu_block ifm_shape = operation->ifm.shape;
|
||||
|
||||
/* Pad and round OFM shape */
|
||||
ofm_shape.width = round_away(ofm_shape.width, common.granule.width);
|
||||
ofm_shape.height = round_away(ofm_shape.height, common.granule.height);
|
||||
ofm_shape.depth = round_away(ofm_shape.depth, common.granule.depth);
|
||||
|
||||
struct ethosu_block ofm_block_limit;
|
||||
ofm_block_limit.width = MIN2(ofm_shape.width, common.ofm_block_max.width);
|
||||
ofm_block_limit.height = MIN2(ofm_shape.height, common.ofm_block_max.height);
|
||||
ofm_block_limit.depth = MIN2(ofm_shape.depth, common.ofm_block_max.depth);
|
||||
|
||||
const int acc_elements = (acc_ram_size * 8) / common.acc_bits;
|
||||
const int ib_elements = ifm_ram_size;
|
||||
|
||||
struct ethosu_block fit_shape = area_fit_u85(common, ofm_shape, ofm_block_limit,
|
||||
ifm_shape, operation,
|
||||
acc_ram_size, ifm_ram_size, macs);
|
||||
|
||||
int depth = fit_shape.depth;
|
||||
int width = fit_shape.width;
|
||||
int height = fit_shape.height;
|
||||
|
||||
struct ethosu_block ifm_alloc_unit;
|
||||
struct ethosu_block ifm_req;
|
||||
struct ethosu_block subkernel_max = SUB_KERNEL_MAX;
|
||||
unsigned force_reduce = 0;
|
||||
|
||||
/* Read-buffering optimization */
|
||||
while (true) {
|
||||
int ifm_depth_for_au = common.ifm_block_depth ? common.ifm_block_depth : depth;
|
||||
ifm_alloc_unit = calc_ifm_au_size_u85(ifm_depth_for_au, common.ublock, macs);
|
||||
int depth_granule = ifm_alloc_unit.depth;
|
||||
|
||||
struct ethosu_block ofm_block = {width, height, depth};
|
||||
ifm_req = get_arch_ifm_block_size_u85(ofm_block, operation, ifm_alloc_unit,
|
||||
subkernel_max, 1, 0, common.ifm_block_depth);
|
||||
|
||||
int ib_required = ifm_req.width * ifm_req.height * ifm_req.depth;
|
||||
assert(ib_required > 0);
|
||||
|
||||
/* Check if we can fit read-buffering for one extra row */
|
||||
if (width < ofm_shape.width || height < ofm_shape.height ||
|
||||
depth < ofm_shape.depth) {
|
||||
struct ethosu_block row_block = {width, common.granule.height, depth};
|
||||
struct ethosu_block ifm_row = get_arch_ifm_block_size_u85(
|
||||
row_block, operation, ifm_alloc_unit, subkernel_max, 1, 0,
|
||||
common.ifm_block_depth);
|
||||
|
||||
int ifm_row_elements = ifm_row.width * ifm_row.height * ifm_row.depth;
|
||||
|
||||
if ((ib_elements - ib_required) < ifm_row_elements) {
|
||||
/* Need to reduce block size */
|
||||
|
||||
if ((height != ofm_shape.height || (force_reduce & 2)) &&
|
||||
height > common.granule.height) {
|
||||
height -= common.granule.height;
|
||||
} else if ((depth > width || (force_reduce & 1)) &&
|
||||
depth > depth_granule) {
|
||||
depth -= common.granule.depth;
|
||||
} else if ((width != ofm_shape.width || (force_reduce & 4)) &&
|
||||
width > common.granule.width) {
|
||||
width -= common.granule.width;
|
||||
} else if (force_reduce != 7) {
|
||||
force_reduce = (force_reduce << 1) | 1;
|
||||
} else if (depth > common.granule.depth) {
|
||||
depth = MAX2(depth / 2, common.granule.depth);
|
||||
} else {
|
||||
mesa_loge("Cannot reduce block size further");
|
||||
UNREACHABLE("Cannot reduce block size further");
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
struct ethosu_block ofm_block = {width, height, depth};
|
||||
*ifm_block_out = ifm_req;
|
||||
|
||||
assert(MIN2(ifm_req.width * ifm_req.height * ifm_req.depth,
|
||||
ifm_shape.width * ifm_shape.height * ifm_shape.depth) <= ib_elements);
|
||||
assert(ofm_block.width * ofm_block.height * ofm_block.depth <= acc_elements);
|
||||
|
||||
return ofm_block;
|
||||
}
|
||||
|
||||
/* Calculate IFM allocation unit size for U85 */
|
||||
static struct ethosu_block
|
||||
calc_ifm_au_size_u85(int ifm_block_depth,
|
||||
struct ethosu_block ofm_ublock, int macs)
|
||||
{
|
||||
struct ethosu_block result;
|
||||
int ifm_depth_bits = ifm_block_depth * 8;
|
||||
|
||||
/* Determine IFMU index based on depth*bits */
|
||||
int ifmu = 0;
|
||||
if (ifm_depth_bits > 256) {
|
||||
ifmu = 2; /* ifmu3 */
|
||||
} else if (ifm_depth_bits > 128) {
|
||||
ifmu = 1; /* ifmu2 */
|
||||
}
|
||||
|
||||
/* For U85-256, the _uBlockToIfmAuTable is indexed by:
|
||||
* blockIdx (0 for 2x1x16, 1 for 4x1x8, 2 for 2x2x8)
|
||||
* ifmu (0, 1, or 2 based on ifm_depth_bits)
|
||||
*
|
||||
* The table values are:
|
||||
* [0][0] = 2x2x1 (ublock 2x1x16, ifmu=0)
|
||||
* [0][1] = 2x1x2 (ublock 2x1x16, ifmu=1)
|
||||
* [0][2] = 2x1x2 (ublock 2x1x16, ifmu=2)
|
||||
* [1][0] = 4x1x1 (ublock 4x1x8, ifmu=0)
|
||||
* [1][1] = 4x1x2 (ublock 4x1x8, ifmu=1)
|
||||
* [1][2] = 4x1x2 (ublock 4x1x8, ifmu=2)
|
||||
* [2][0] = 2x2x1 (ublock 2x2x8, ifmu=0)
|
||||
* [2][1] = 2x2x2 (ublock 2x2x8, ifmu=1)
|
||||
* [2][2] = 2x2x2 (ublock 2x2x8, ifmu=2)
|
||||
*/
|
||||
|
||||
/* Determine blockIdx from ofm_ublock */
|
||||
int block_idx = 0;
|
||||
if (ofm_ublock.width == 4 && ofm_ublock.height == 1 && ofm_ublock.depth == 8) {
|
||||
block_idx = 1;
|
||||
} else if (ofm_ublock.width == 2 && ofm_ublock.height == 2 && ofm_ublock.depth == 8) {
|
||||
block_idx = 2;
|
||||
}
|
||||
/* else block_idx = 0 for 2x1x16 */
|
||||
|
||||
/* Look up base AU shape from table */
|
||||
if (block_idx == 0) { /* 2x1x16 */
|
||||
if (ifmu == 0) {
|
||||
result.width = 2;
|
||||
result.height = 2;
|
||||
result.depth = 1;
|
||||
} else { /* ifmu == 1 or 2 */
|
||||
result.width = 2;
|
||||
result.height = 1;
|
||||
result.depth = 2;
|
||||
}
|
||||
} else if (block_idx == 1) { /* 4x1x8 */
|
||||
if (ifmu == 0) {
|
||||
result.width = 4;
|
||||
result.height = 1;
|
||||
result.depth = 1;
|
||||
} else { /* ifmu == 1 or 2 */
|
||||
result.width = 4;
|
||||
result.height = 1;
|
||||
result.depth = 2;
|
||||
}
|
||||
} else { /* block_idx == 2, 2x2x8 */
|
||||
if (ifmu == 0) {
|
||||
result.width = 2;
|
||||
result.height = 2;
|
||||
result.depth = 1;
|
||||
} else { /* ifmu == 1 or 2 */
|
||||
result.width = 2;
|
||||
result.height = 2;
|
||||
result.depth = 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Scale depth by 128/ifm_bits */
|
||||
result.depth = result.depth * 128 / 8;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Try block configuration to see if it fits in RAM */
|
||||
static bool
|
||||
try_block_config_u85(enum ethosu_operation_type op_type,
|
||||
struct ethosu_block ofm_block,
|
||||
struct ethosu_block ifm_block,
|
||||
struct ethosu_block ifm_shape,
|
||||
int acc_bits,
|
||||
int ifm_space, int acc_space,
|
||||
int ifm_au_depth, int num_blocks_in_ram,
|
||||
bool is_equal_depth_op)
|
||||
{
|
||||
assert(acc_bits > 0);
|
||||
|
||||
/* Elementwise and Resize don't use IB/AB */
|
||||
if (op_type == ETHOSU_OPERATION_TYPE_ELTWISE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* IFM space calculation */
|
||||
int ifm_align_depth = ifm_au_depth;
|
||||
int ifm_block_depth = is_equal_depth_op ? ofm_block.depth : MIN2(ifm_block.depth, ifm_shape.depth);
|
||||
ifm_block_depth = round_away(ifm_block_depth, ifm_align_depth);
|
||||
int ifm_bytes = ifm_block.width * ifm_block.height * ifm_block_depth * num_blocks_in_ram;
|
||||
|
||||
/* Accumulator space calculation */
|
||||
int ofm_block_depth = round_away(ofm_block.depth, ACC_DEPTH_GRANULE_U85);
|
||||
int acc_bytes = ((ofm_block.width * ofm_block.height * ofm_block_depth *
|
||||
acc_bits) /
|
||||
8) *
|
||||
num_blocks_in_ram;
|
||||
|
||||
if (ifm_bytes > ifm_space || acc_bytes > acc_space) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Regor's Shape(h,w,d) stores as [d,w,h], so Shape(1,2,16) = h=1,w=2,d=16
|
||||
* Mesa's {w,h,d} format, so we need to swap h and w from Regor's parameters
|
||||
* Regor Shape(1,2,16) → Mesa {2,1,16}
|
||||
* Regor Shape(1,4,8) → Mesa {4,1,8}
|
||||
* Regor Shape(2,2,8) → Mesa {2,2,8}
|
||||
*/
|
||||
static struct ethosu_block arch_ublocks[] = {
|
||||
{2, 1, 16}, /* Shape(1,2,16) */
|
||||
{4, 1, 8}, /* Shape(1,4,8) */
|
||||
{2, 2, 8}}; /* Shape(2,2,8) */
|
||||
|
||||
static struct ethosu_block
|
||||
block_round_away(struct ethosu_block value, struct ethosu_block granule)
|
||||
{
|
||||
struct ethosu_block result;
|
||||
|
||||
result.width = round_away(value.width, granule.width);
|
||||
result.height = round_away(value.height, granule.height);
|
||||
result.depth = round_away(value.depth, granule.depth);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static int
|
||||
block_elements(struct ethosu_block value)
|
||||
{
|
||||
return value.width * value.height * value.depth;
|
||||
}
|
||||
|
||||
static struct ethosu_block
|
||||
find_ublock(struct ethosu_operation *operation, bool is_part_kernel)
|
||||
{
|
||||
int best_waste = INT_MAX;
|
||||
struct ethosu_block best_ublock = {0};
|
||||
bool is_pointwise = (operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION &&
|
||||
operation->kernel.width == 1 && operation->kernel.height == 1);
|
||||
bool is_depthwise = operation->conv.depthwise;
|
||||
bool is_pooling = (operation->type == ETHOSU_OPERATION_TYPE_POOLING);
|
||||
bool is_memory_copy_pooling = is_pooling &&
|
||||
operation->kernel.width == 1 &&
|
||||
operation->kernel.height == 1;
|
||||
|
||||
/* For memory-copy pooling (1x1 pooling), always use 2x1x16 ublock
|
||||
* to match Vela's behavior and avoid waste-based selection issues */
|
||||
if (is_memory_copy_pooling) {
|
||||
struct ethosu_block result = {2, 1, 16};
|
||||
return result;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ARRAY_SIZE(arch_ublocks); i++) {
|
||||
const struct ethosu_block ublk = arch_ublocks[i];
|
||||
|
||||
/* Special case for 1x1 convolutions on U85-256:
|
||||
* The 2x1x16 microblock is ONLY valid for depth-first mode (not part-kernel).
|
||||
* For part-kernel mode, 2x1x16 must be rejected.
|
||||
* This matches Vela's logic in ethos_u85.cpp ValidateOfmUBlock().
|
||||
*/
|
||||
if (is_pointwise && !is_part_kernel && ublk.width == 2 && ublk.height == 1 && ublk.depth == 16) {
|
||||
/* 2x1x16 is allowed for depth-first 1x1 - don't skip */
|
||||
} else if (is_pointwise && is_part_kernel && ublk.width == 2 && ublk.height == 1 && ublk.depth == 16) {
|
||||
continue; /* Skip 2x1x16 for part-kernel 1x1 */
|
||||
}
|
||||
|
||||
/* The 2x1x16 microblock is only valid for 16-bit IFM convolutions.
|
||||
* For 8-bit IFM, it can only be used for depthwise/pooling operations.
|
||||
* This matches the bitsToOperations table in Vela's ethos_u85.cpp.
|
||||
*/
|
||||
if (ublk.width == 2 && ublk.height == 1 && ublk.depth == 16 &&
|
||||
operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION &&
|
||||
!is_depthwise) {
|
||||
continue; /* Skip 2x1x16 for 8-bit IFM non-depthwise convolutions */
|
||||
}
|
||||
|
||||
/* For non-1x1 regular convolutions, skip 2x1x16 unless it's depthwise */
|
||||
if (!is_pointwise && !is_depthwise &&
|
||||
operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION &&
|
||||
ublk.width == 2 &&
|
||||
ublk.height == 1 &&
|
||||
ublk.depth == 16)
|
||||
continue;
|
||||
|
||||
/* Minimum waste is better than aspect correct */
|
||||
struct ethosu_block tmp = block_round_away(operation->ofm.shape, ublk);
|
||||
int waste = block_elements(tmp) - block_elements(operation->ofm.shape);
|
||||
DBG("UBLOCK: ublock %ux%ux%u: tmp=%ux%ux%u waste=%d\n",
|
||||
ublk.width, ublk.height, ublk.depth,
|
||||
tmp.width, tmp.height, tmp.depth, waste);
|
||||
bool is_better = (waste < best_waste);
|
||||
/* For 1x1 convolutions with equal waste, prefer smaller depth */
|
||||
if (!is_better && waste == best_waste && is_pointwise && ublk.depth < best_ublock.depth)
|
||||
is_better = true;
|
||||
if (is_better) {
|
||||
best_ublock = ublk;
|
||||
best_waste = waste;
|
||||
}
|
||||
}
|
||||
|
||||
DBG("UBLOCK: Chosen ublock: width=%u height=%u depth=%u\n", best_ublock.width, best_ublock.height, best_ublock.depth);
|
||||
return best_ublock;
|
||||
}
|
||||
|
||||
/* Main entry point for U85 scheduling */
|
||||
struct ethosu_block_config
|
||||
find_block_config_u85(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
|
||||
{
|
||||
struct ethosu_block_config config = {0};
|
||||
|
||||
/* Architecture constants for U85-256.
|
||||
* Note: on U85, the coefficient buffer (CB) is smaller than the output
|
||||
* buffer (OB), which is the opposite of U55/U65. */
|
||||
const int macs = 256; /* TODO: Get from screen */
|
||||
const int ifm_ram_size_bytes = 16384; /* TODO: Get from screen */
|
||||
const int acc_ram_size_bytes = 16384; /* TODO: Get from screen */
|
||||
const int cb_ram_size_bytes = 1536; /* TODO: Get from screen */
|
||||
const int ob_ram_size_bytes = 2048; /* TODO: Get from screen */
|
||||
|
||||
/* Operation typing */
|
||||
bool is_pooling = (operation->type == ETHOSU_OPERATION_TYPE_POOLING);
|
||||
bool is_depthwise = operation->conv.depthwise;
|
||||
bool is_elementwise = (operation->type == ETHOSU_OPERATION_TYPE_ELTWISE);
|
||||
bool is_convolution = (operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION);
|
||||
bool is_equal_depth_op = is_elementwise || is_pooling || is_depthwise;
|
||||
|
||||
/* Determine if we should use kernel-first (part-kernel) scheduling for convolutions */
|
||||
bool is_part_kernel = false;
|
||||
if (is_convolution) {
|
||||
struct ethosu_block ifm_shape = operation->ifm.shape;
|
||||
int k = operation->kernel.width * operation->kernel.height;
|
||||
int s = 5; /* sparse would be 10 */
|
||||
int r = 2; /* sparse would be 4 */
|
||||
int k_rnd = (k / s) * s + MAX2(k % s, r);
|
||||
double kernel_first_util = ((double)ifm_shape.depth / round_away(ifm_shape.depth, 16)) *
|
||||
((double)k / k_rnd);
|
||||
double depth_first_util = (double)ifm_shape.depth / round_away(ifm_shape.depth, 64);
|
||||
is_part_kernel = (kernel_first_util >= depth_first_util);
|
||||
|
||||
DBG("UBLOCK: kernel-first heuristic: k=%d k_rnd=%d ifm_depth=%d\n", k, k_rnd, ifm_shape.depth);
|
||||
DBG("UBLOCK: kernel_first_util=%.3f depth_first_util=%.3f is_part_kernel=%d\n",
|
||||
kernel_first_util, depth_first_util, is_part_kernel);
|
||||
}
|
||||
|
||||
/* Setup common search parameters */
|
||||
struct find_config_common common;
|
||||
common.acc_bits = 32; /* TODO: Determine from operation */
|
||||
|
||||
/* Find microblock */
|
||||
struct ethosu_block ofm_ublock = find_ublock(operation, is_part_kernel);
|
||||
|
||||
common.ofm_block_max.width = 128; /* TODO: Get from screen */
|
||||
common.ofm_block_max.height = 128; /* TODO: Get from screen */
|
||||
common.ofm_block_max.depth = 1024; /* TODO: Get from screen */
|
||||
common.ublock = ofm_ublock;
|
||||
common.granule.width = ofm_ublock.width;
|
||||
common.granule.height = ofm_ublock.height;
|
||||
common.granule.depth = ACC_DEPTH_GRANULE_U85;
|
||||
common.ifm_block_depth = 0;
|
||||
common.is_pooling = is_pooling;
|
||||
|
||||
/* Handle elementwise operations */
|
||||
if (is_elementwise) {
|
||||
config.ofm_block = find_elementwise_config_u85(operation, common,
|
||||
cb_ram_size_bytes,
|
||||
ob_ram_size_bytes);
|
||||
config.ifm_block = config.ofm_block;
|
||||
config.ofm_ublock = ofm_ublock;
|
||||
return config;
|
||||
}
|
||||
|
||||
/* Handle depthwise/pooling operations
|
||||
* Exception: 1x1 pooling (memory copy) uses full block search instead */
|
||||
bool is_memory_copy_pooling = is_pooling &&
|
||||
operation->kernel.width == 1 &&
|
||||
operation->kernel.height == 1;
|
||||
|
||||
if ((is_depthwise || is_pooling) && !is_memory_copy_pooling) {
|
||||
config.ofm_block = find_depthwise_config_u85(operation, common,
|
||||
&config.ifm_block,
|
||||
acc_ram_size_bytes,
|
||||
ifm_ram_size_bytes, macs);
|
||||
config.ofm_ublock = ofm_ublock;
|
||||
config.is_partkernel = false;
|
||||
return config;
|
||||
}
|
||||
|
||||
/* Handle regular convolutions and memory-copy pooling - Full block search */
|
||||
|
||||
struct ethosu_block ofm_shape = operation->ofm.shape;
|
||||
struct ethosu_block ifm_shape = operation->ifm.shape;
|
||||
|
||||
int ifm_block_depth = 64;
|
||||
if (is_part_kernel) {
|
||||
ifm_block_depth = 16;
|
||||
} else if (macs == 128 || macs == 256) {
|
||||
if (ofm_ublock.depth == 16) {
|
||||
ifm_block_depth = 32;
|
||||
}
|
||||
}
|
||||
|
||||
common.ifm_block_depth = ifm_block_depth;
|
||||
|
||||
/* Search space setup */
|
||||
struct ethosu_block search_space_step;
|
||||
search_space_step.width = MAX2(ofm_ublock.width, common.granule.width);
|
||||
search_space_step.height = MAX2(ofm_ublock.height, common.granule.height);
|
||||
search_space_step.depth = MAX2(ofm_ublock.depth, common.granule.depth);
|
||||
|
||||
struct ethosu_block search_space_end;
|
||||
search_space_end.width = MIN2(ofm_shape.width, common.ofm_block_max.width);
|
||||
search_space_end.height = MIN2(ofm_shape.height, common.ofm_block_max.height);
|
||||
search_space_end.depth = MIN2(ofm_shape.depth, common.ofm_block_max.depth);
|
||||
|
||||
search_space_end.width = round_away(MAX2(search_space_end.width,
|
||||
search_space_step.width),
|
||||
ofm_ublock.width);
|
||||
search_space_end.height = round_away(MAX2(search_space_end.height,
|
||||
search_space_step.height),
|
||||
ofm_ublock.height);
|
||||
search_space_end.depth = round_away(MAX2(search_space_end.depth,
|
||||
search_space_step.depth),
|
||||
ofm_ublock.depth);
|
||||
|
||||
float best_cost = FLT_MAX;
|
||||
float best_coverage = FLT_MAX;
|
||||
int ofm_elements = ofm_shape.width * ofm_shape.height * ofm_shape.depth;
|
||||
|
||||
int depth = MAX2(ofm_ublock.depth, MIN2(search_space_end.depth, OFM_SPLIT_DEPTH));
|
||||
int restart_depth = depth;
|
||||
if (depth < ofm_shape.depth) {
|
||||
depth = round_away(depth, OFM_SPLIT_DEPTH);
|
||||
}
|
||||
|
||||
struct ethosu_block ifm_alloc_unit = calc_ifm_au_size_u85(ifm_block_depth,
|
||||
ofm_ublock, macs);
|
||||
int num_blocks_in_ram = 2;
|
||||
|
||||
/* Block search loop */
|
||||
while (depth <= search_space_end.depth) {
|
||||
if (is_equal_depth_op) {
|
||||
ifm_block_depth = depth;
|
||||
struct ethosu_block new_au = calc_ifm_au_size_u85(depth, ofm_ublock, macs);
|
||||
if (new_au.depth != ifm_alloc_unit.depth ||
|
||||
new_au.width != ifm_alloc_unit.width ||
|
||||
new_au.height != ifm_alloc_unit.height) {
|
||||
ifm_alloc_unit = new_au;
|
||||
}
|
||||
}
|
||||
|
||||
for (int height = search_space_step.height;
|
||||
height <= search_space_end.height;
|
||||
height += search_space_step.height) {
|
||||
for (int width = search_space_step.width;
|
||||
width <= search_space_end.width;
|
||||
width += search_space_step.width) {
|
||||
|
||||
struct ethosu_block ofm_block = {width, height, depth};
|
||||
struct ethosu_block ifm_block = get_arch_ifm_block_size_u85(
|
||||
ofm_block, operation, ifm_alloc_unit, SUB_KERNEL_MAX,
|
||||
1, 0, ifm_block_depth);
|
||||
|
||||
/* Test if blocks fit in RAM */
|
||||
if (try_block_config_u85(operation->type, ofm_block, ifm_block,
|
||||
ifm_shape, common.acc_bits,
|
||||
ifm_ram_size_bytes, acc_ram_size_bytes,
|
||||
ifm_alloc_unit.depth, num_blocks_in_ram,
|
||||
is_equal_depth_op)) {
|
||||
|
||||
/* Calculate cost metric */
|
||||
int dilated_h = operation->kernel.height * operation->kernel.dilation_y -
|
||||
(operation->kernel.dilation_y - 1);
|
||||
int dilated_w = operation->kernel.width * operation->kernel.dilation_x -
|
||||
(operation->kernel.dilation_x - 1);
|
||||
|
||||
float blocks_w = (float)ofm_shape.width / width;
|
||||
float blocks_h = (float)ofm_shape.height / height;
|
||||
float blocks_d = (float)ofm_shape.depth / depth;
|
||||
|
||||
int full_blocks_w = (ofm_shape.width + width - 1) / width;
|
||||
int full_blocks_h = (ofm_shape.height + height - 1) / height;
|
||||
int full_blocks_d = (ofm_shape.depth + depth - 1) / depth;
|
||||
|
||||
float weight_area = is_convolution ? operation->kernel.width * operation->kernel.height : 0;
|
||||
|
||||
int ifm_repeats = ((dilated_w + SUB_KERNEL_MAX.width - 1) /
|
||||
SUB_KERNEL_MAX.width) *
|
||||
((dilated_h + SUB_KERNEL_MAX.height - 1) /
|
||||
SUB_KERNEL_MAX.height);
|
||||
|
||||
/* Weight fetch */
|
||||
float weight_fetch = weight_area * ifm_shape.depth *
|
||||
full_blocks_w * full_blocks_h;
|
||||
if (!is_depthwise) {
|
||||
weight_fetch *= blocks_d * depth;
|
||||
}
|
||||
|
||||
/* IFM fetch */
|
||||
/* Note: blocks_h and blocks_w operands appear in this specific order to match
|
||||
* Vela's calculation to match its rounding behavior */
|
||||
float ifm_fetch = (float)(ifm_block.width * ifm_block.height) *
|
||||
ifm_shape.depth * ifm_repeats * blocks_h * blocks_w;
|
||||
if (!is_equal_depth_op) {
|
||||
ifm_fetch *= full_blocks_d;
|
||||
}
|
||||
|
||||
float relative_cost = (ifm_fetch + weight_fetch) / ofm_elements;
|
||||
|
||||
/* Bias for encompassing entire IFM */
|
||||
if (ifm_shape.width * ifm_shape.height * ifm_shape.depth <
|
||||
ifm_block.width * ifm_block.height * ifm_block.depth * 2) {
|
||||
relative_cost = relative_cost / 2.0f;
|
||||
}
|
||||
|
||||
/* Choose based on minimum cost */
|
||||
if (relative_cost <= best_cost) {
|
||||
bool choose_this = false;
|
||||
|
||||
if (relative_cost == best_cost) {
|
||||
int coverage_w = MIN2(ifm_shape.width, ifm_block.width);
|
||||
int coverage_h = MIN2(ifm_shape.height, ifm_block.height);
|
||||
float coverage = (float)(ifm_shape.width * ifm_shape.height) /
|
||||
MAX2(coverage_w * coverage_h, 1);
|
||||
|
||||
if (coverage <= best_coverage && (height <= 4 && width <= 4)) {
|
||||
best_coverage = coverage;
|
||||
choose_this = true;
|
||||
}
|
||||
} else {
|
||||
best_coverage = FLT_MAX;
|
||||
choose_this = true;
|
||||
}
|
||||
|
||||
if (choose_this) {
|
||||
best_cost = relative_cost;
|
||||
config.ifm_block = ifm_block;
|
||||
config.ifm_block.depth = ifm_block_depth;
|
||||
config.ofm_block = ofm_block;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Next depth level */
|
||||
depth = depth + ofm_ublock.depth;
|
||||
if (depth < ofm_shape.depth) {
|
||||
depth = round_away(depth, OFM_SPLIT_DEPTH);
|
||||
}
|
||||
|
||||
/* Retry with single buffer if nothing found */
|
||||
if (depth > search_space_end.depth && best_cost == FLT_MAX &&
|
||||
num_blocks_in_ram == 2) {
|
||||
num_blocks_in_ram = 1;
|
||||
depth = restart_depth;
|
||||
}
|
||||
}
|
||||
|
||||
config.ofm_ublock = ofm_ublock;
|
||||
config.is_partkernel = is_part_kernel;
|
||||
|
||||
return config;
|
||||
}
|
||||
14
src/gallium/drivers/ethosu/ethosu_sched_u85.h
Normal file
14
src/gallium/drivers/ethosu/ethosu_sched_u85.h
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
/*
|
||||
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#ifndef ETHOSU_SCHED_U85_H
|
||||
#define ETHOSU_SCHED_U85_H
|
||||
|
||||
#include "ethosu_ml.h"
|
||||
|
||||
struct ethosu_block_config find_block_config_u85(struct ethosu_subgraph *subgraph,
|
||||
struct ethosu_operation *operation);
|
||||
|
||||
#endif /* ETHOSU_SCHED_U85_H */
|
||||
|
|
@ -19,6 +19,7 @@ files_ethosu = files(
|
|||
'ethosu_lower.c',
|
||||
'ethosu_ml.c',
|
||||
'ethosu_sched.c',
|
||||
'ethosu_sched_u85.c',
|
||||
)
|
||||
|
||||
libethosu = static_library(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue