ethosu: Improve parallelism by detecting overlaps for BLOCKDEP
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39611>
This commit is contained in:
parent
2cf3d0b273
commit
a735fe040b
1 changed files with 154 additions and 25 deletions
|
|
@ -693,49 +693,179 @@ fill_memory_accesses(struct ethosu_subgraph *subgraph)
|
|||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
fm_ranges_overlap(struct ethosu_subgraph *subgraph,
|
||||
struct ethosu_feature_map *a, struct ethosu_feature_map *b)
|
||||
/* Box structure for spatial overlap detection */
|
||||
struct box {
|
||||
int start_h, start_w, start_d;
|
||||
int end_h, end_w, end_d;
|
||||
};
|
||||
|
||||
/* Check if two 1D ranges overlap */
|
||||
static inline bool
|
||||
range_overlaps(int start1, int end1, int start2, int end2)
|
||||
{
|
||||
struct ethosu_tensor *ta = ethosu_find_tensor(subgraph, a->tensor_idx);
|
||||
struct ethosu_tensor *tb = ethosu_find_tensor(subgraph, b->tensor_idx);
|
||||
return start1 < end2 && start2 < end1;
|
||||
}
|
||||
|
||||
if (!ta || !tb || ta->size == 0 || tb->size == 0)
|
||||
return false;
|
||||
/* Check if two boxes overlap in 3D space */
|
||||
static bool
|
||||
box_overlaps(const struct box *a, const struct box *b)
|
||||
{
|
||||
return range_overlaps(a->start_h, a->end_h, b->start_h, b->end_h) &&
|
||||
range_overlaps(a->start_w, a->end_w, b->start_w, b->end_w) &&
|
||||
range_overlaps(a->start_d, a->end_d, b->start_d, b->end_d);
|
||||
}
|
||||
|
||||
return ta->offset < tb->offset + tb->size &&
|
||||
tb->offset < ta->offset + ta->size;
|
||||
/* Calculate IFM job shape from OFM block considering kernel properties */
|
||||
static void
|
||||
calc_ifm_job_shape(const struct ethosu_block *ofm_block,
|
||||
const struct ethosu_kernel *kernel,
|
||||
int ifm_block_depth,
|
||||
struct ethosu_block *ifm_job)
|
||||
{
|
||||
/* Calculate dilated kernel size */
|
||||
int dilated_h = (kernel->height - 1) * kernel->dilation_y + 1;
|
||||
int dilated_w = (kernel->width - 1) * kernel->dilation_x + 1;
|
||||
|
||||
/* Calculate required input size */
|
||||
int h = (ofm_block->height - 1) * kernel->stride_y + dilated_h;
|
||||
int w = (ofm_block->width - 1) * kernel->stride_x + dilated_w;
|
||||
|
||||
ifm_job->height = h;
|
||||
ifm_job->width = w;
|
||||
ifm_job->depth = ifm_block_depth;
|
||||
}
|
||||
|
||||
/* Get jobs (blocks) from a feature map area
|
||||
* area: total work area (feature map dimensions)
|
||||
* job_shape: size of each job/block
|
||||
* max_jobs: maximum number of jobs to retrieve
|
||||
* from_start: if true get first jobs, if false get last jobs
|
||||
* jobs: output array to fill
|
||||
* Returns: number of jobs added
|
||||
*/
|
||||
static int
|
||||
get_jobs(const struct ethosu_block *area,
|
||||
const struct ethosu_block *job_shape,
|
||||
int max_jobs,
|
||||
bool from_start,
|
||||
struct box *jobs)
|
||||
{
|
||||
/* Calculate how many jobs needed in each dimension */
|
||||
int job_split_h = (area->height + job_shape->height - 1) / job_shape->height;
|
||||
int job_split_w = (area->width + job_shape->width - 1) / job_shape->width;
|
||||
int job_split_d = (area->depth + job_shape->depth - 1) / job_shape->depth;
|
||||
|
||||
int total_jobs = job_split_h * job_split_w * job_split_d;
|
||||
|
||||
int first_job = from_start ? 0 : MAX2(0, total_jobs - max_jobs);
|
||||
int last_job = from_start ? MIN2(total_jobs, max_jobs) : total_jobs;
|
||||
int count = 0;
|
||||
|
||||
/* Iterate jobs in z, x, y order (depth, width, height) */
|
||||
for (int i = first_job; i < last_job; i++) {
|
||||
int h_idx = i / (job_split_d * job_split_w);
|
||||
int w_idx = (i / job_split_d) % job_split_w;
|
||||
int d_idx = i % job_split_d;
|
||||
|
||||
jobs[count].start_h = h_idx * job_shape->height;
|
||||
jobs[count].start_w = w_idx * job_shape->width;
|
||||
jobs[count].start_d = d_idx * job_shape->depth;
|
||||
|
||||
jobs[count].end_h = MIN2(jobs[count].start_h + job_shape->height, area->height);
|
||||
jobs[count].end_w = MIN2(jobs[count].start_w + job_shape->width, area->width);
|
||||
jobs[count].end_d = MIN2(jobs[count].start_d + job_shape->depth, area->depth);
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
calc_blockdep(struct ethosu_subgraph *subgraph, struct ethosu_operation *prev_op, struct ethosu_operation *operation)
|
||||
{
|
||||
struct ethosu_screen *screen = ethosu_screen(subgraph->base.context->screen);
|
||||
|
||||
if (!prev_op)
|
||||
return 0;
|
||||
|
||||
// Check if the reserved shram will be used in current/prev op
|
||||
bool prev_uses_lut = false; // prev_op->activation && prev_op->activation->op_type == NpuActivationOp.TABLE_LOOKUP;
|
||||
bool curr_uses_lut = false; // operation->activation && operation->activation->op_type == NpuActivationOp.TABLE_LOOKUP;
|
||||
if (prev_uses_lut && SHRAM_RESERVED_UNUSED_BANKS == 0 && !curr_uses_lut)
|
||||
/* Check if previous OFM matches current IFM (same tensor) */
|
||||
int ifm_index = 0;
|
||||
if (operation->ifm2.tensor_idx != 0 &&
|
||||
operation->ifm2.tensor_idx == prev_op->ofm.tensor_idx) {
|
||||
ifm_index = 1;
|
||||
} else if (operation->ifm.tensor_idx != prev_op->ofm.tensor_idx) {
|
||||
/* Previous operation doesn't produce current operation's IFM */
|
||||
return screen->max_concurrent_blocks;
|
||||
}
|
||||
|
||||
const struct ethosu_feature_map *ifm = (ifm_index == 0) ? &operation->ifm : &operation->ifm2;
|
||||
const struct ethosu_feature_map *prev_ofm = &prev_op->ofm;
|
||||
|
||||
/* Check if shapes match (no reshape between operations) */
|
||||
if (ifm->shape.height != prev_ofm->shape.height ||
|
||||
ifm->shape.width != prev_ofm->shape.width ||
|
||||
ifm->shape.depth != prev_ofm->shape.depth) {
|
||||
/* OFM has been reshaped; overlap calculations don't work */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* For operations with 1:1 IFM/OFM block mapping (elementwise, pooling, depthwise),
|
||||
* always use BLOCKDEP=0 for safety */
|
||||
if (operation->type == ETHOSU_OPERATION_TYPE_ELTWISE ||
|
||||
operation->type == ETHOSU_OPERATION_TYPE_POOLING ||
|
||||
(operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION && operation->conv.depthwise) ||
|
||||
prev_op->type == ETHOSU_OPERATION_TYPE_ELTWISE ||
|
||||
prev_op->type == ETHOSU_OPERATION_TYPE_POOLING ||
|
||||
(prev_op->type == ETHOSU_OPERATION_TYPE_CONVOLUTION && prev_op->conv.depthwise))
|
||||
return 0;
|
||||
|
||||
/* If the previous op writes to the same buffer that the current op
|
||||
* reads from, we need to wait for it to finish first.
|
||||
*/
|
||||
bool ifm_overlaps = fm_ranges_overlap(subgraph, &prev_op->ofm, &operation->ifm);
|
||||
bool ifm2_overlaps = operation->type == ETHOSU_OPERATION_TYPE_ELTWISE &&
|
||||
fm_ranges_overlap(subgraph, &prev_op->ofm, &operation->ifm2);
|
||||
/* Calculate block shapes */
|
||||
struct ethosu_block prev_block = prev_op->block_config.ofm_block;
|
||||
struct ethosu_block curr_ifm_job;
|
||||
calc_ifm_job_shape(&operation->block_config.ofm_block,
|
||||
&operation->kernel,
|
||||
operation->block_config.ifm_block.depth,
|
||||
&curr_ifm_job);
|
||||
|
||||
if (ifm_overlaps || ifm2_overlaps)
|
||||
return 0;
|
||||
/* Get last jobs from previous operation */
|
||||
int max_jobs = screen->max_concurrent_blocks;
|
||||
assert(max_jobs <= 8);
|
||||
struct box last_prev_jobs[8];
|
||||
int prev_count = get_jobs(&prev_ofm->shape, &prev_block, max_jobs, false, last_prev_jobs);
|
||||
|
||||
return ethosu_screen(subgraph->base.context->screen)->max_concurrent_blocks; /* TODO: Check if there is actually overlap between the FMs */
|
||||
/* Get first jobs from current operation */
|
||||
struct box first_curr_jobs[8];
|
||||
int curr_count = get_jobs(&ifm->shape, &curr_ifm_job, max_jobs, true, first_curr_jobs);
|
||||
|
||||
/* Find highest blockdep with no overlap between jobs */
|
||||
int min_count = MIN2(prev_count, curr_count);
|
||||
int prev_last_idx = prev_count - 1;
|
||||
|
||||
for (int blockdep = 0; blockdep < min_count; blockdep++) {
|
||||
bool overlaps = false;
|
||||
|
||||
/* Check if any combination of jobs within blockdep range overlaps */
|
||||
for (int i = 0; !overlaps && i <= blockdep; i++) {
|
||||
for (int j = blockdep - i; !overlaps && i + j <= blockdep; j++) {
|
||||
if (box_overlaps(&first_curr_jobs[i], &last_prev_jobs[prev_last_idx - j])) {
|
||||
overlaps = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (overlaps) {
|
||||
return blockdep;
|
||||
}
|
||||
}
|
||||
|
||||
/* No overlap found */
|
||||
return min_count;
|
||||
}
|
||||
|
||||
void
|
||||
ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph)
|
||||
{
|
||||
struct ethosu_screen *screen = ethosu_screen(subgraph->base.context->screen);
|
||||
struct ethosu_operation *prev_op = NULL;
|
||||
struct util_dynarray outstanding_dma_ops;
|
||||
struct util_dynarray outstanding_npu_ops;
|
||||
|
|
@ -751,7 +881,7 @@ ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph)
|
|||
|
||||
/* Compile */
|
||||
|
||||
if (ethosu_is_u65(screen))
|
||||
if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen)))
|
||||
EMIT0(NPU_SET_PARALLEL_MODE, 0x0);
|
||||
|
||||
util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) {
|
||||
|
|
@ -778,7 +908,6 @@ ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph)
|
|||
|
||||
if (operation->type != ETHOSU_OPERATION_TYPE_DMA) {
|
||||
unsigned blockdep = calc_blockdep(subgraph, prev_op, operation);
|
||||
blockdep = MIN2(blockdep, screen->max_concurrent_blocks);
|
||||
EMIT0(NPU_SET_BLOCKDEP, blockdep);
|
||||
|
||||
prev_op = operation;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue