vc4: Add support for register allocation for threaded shaders.

We have two major requirements: make sure that only the bottom half of the
physical register space is used, and make sure that none of our values are
live in an accumulator across a thread switch.
commit 755037173d (parent fdad4d2402)
Author: Eric Anholt
Date:   2016-11-10 15:10:15 -08:00

3 changed files with 85 additions and 20 deletions
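
The two requirements map onto two predicates in the hunks below: a
bottom-half test on the physical register address, and a test for whether a
temp's live range straddles the thread switch. A standalone restatement in
plain C, as a sketch only — the 16-register cutoff comes from the
addr < 16 checks in the diff; the function names and the example values in
main() are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Requirement 1: a threaded shader may only use the bottom half of each
 * 32-entry physical register file. */
static bool usable_when_threaded(unsigned addr)
{
        return addr < 16;
}

/* Requirement 2: a value that is live across a thread switch at
 * instruction thrsw_ip must not be placed in an accumulator. */
static bool live_across_switch(int start, int end, int thrsw_ip)
{
        return start < thrsw_ip && end > thrsw_ip;
}

int main(void)
{
        printf("%d %d\n", usable_when_threaded(17),
               live_across_switch(5, 14, 10));  /* prints "0 1" */
        return 0;
}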

src/gallium/drivers/vc4/vc4_context.h

@@ -318,11 +318,12 @@ struct vc4_context {
         uint64_t next_compiled_program_id;
         struct ra_regs *regs;
-        unsigned int reg_class_any;
-        unsigned int reg_class_a_or_b_or_acc;
+        unsigned int reg_class_any[2];
+        unsigned int reg_class_a_or_b[2];
+        unsigned int reg_class_a_or_b_or_acc[2];
         unsigned int reg_class_r0_r3;
-        unsigned int reg_class_r4_or_a;
-        unsigned int reg_class_a;
+        unsigned int reg_class_r4_or_a[2];
+        unsigned int reg_class_a[2];
         uint8_t prim_mode;

src/gallium/drivers/vc4/vc4_qir.h

@@ -533,6 +533,13 @@ struct vc4_compile {
         uint32_t program_id;
         uint32_t variant_id;
+        /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
+         * is used to hide texturing latency at the cost of limiting ourselves
+         * to the bottom half of physical reg space.
+         */
+        bool fs_threaded;
+
+        bool failed;
 };

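A minimal mock, not the driver's code, of the convention these two fields
enable: every register class that can touch the physical files is allocated
twice (see the vc4_context.h hunk above), and the bool indexes the pair. The
mock_* types and the class values here are stand-ins for the sketch:

#include <stdbool.h>
#include <stdio.h>

struct mock_context {
        unsigned reg_class_any[2];   /* [0] whole file, [1] bottom half */
};

struct mock_compile {
        bool fs_threaded;
};

int main(void)
{
        struct mock_context vc4 = { .reg_class_any = { 7, 8 } };
        struct mock_compile c = { .fs_threaded = true };

        /* Class selection is a single branch-free array index, the same
         * shape as the ra_set_node_class() calls later in this commit: */
        printf("selected class: %u\n", vc4.reg_class_any[c.fs_threaded]);
        return 0;
}
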
src/gallium/drivers/vc4/vc4_register_allocate.c

@@ -115,22 +115,33 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true);

-        vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
-        vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs);
-        vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
-        vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
+        /* The physical regfiles split us into two classes, with [0] being the
+         * whole space and [1] being the bottom half (for threaded fragment
+         * shaders).
+         */
+        for (int i = 0; i < 2; i++) {
+                vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs);
+                vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs);
+                vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs);
+                vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs);
+                vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs);
+        }
         vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs);

         /* r0-r3 */
         for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) {
                 ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i);
-                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i);
         }

         /* R4 gets a special class because it can't be written as a general
          * purpose register. (it's TMU_NOSWAP as a write address).
          */
-        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, ACC_INDEX + 4);
+        for (int i = 0; i < 2; i++) {
+                ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i],
+                                 ACC_INDEX + 4);
+        }

         /* A/B */
         for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i++) {
@@ -140,13 +151,28 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
                 if (vc4_regs[i].addr == 31)
                         continue;

-                ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
-                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i);
+                if (vc4_regs[i].addr < 16) {
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i);
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i);
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i);
+                }

                 /* A only */
                 if (((i - AB_INDEX) & 1) == 0) {
-                        ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
-                        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i);
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i);
+                        if (vc4_regs[i].addr < 16) {
+                                ra_class_add_reg(vc4->regs,
+                                                 vc4->reg_class_a[1], i);
+                                ra_class_add_reg(vc4->regs,
+                                                 vc4->reg_class_r4_or_a[1], i);
+                        }
                 }
         }
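
As a sanity check on the membership rules above, a standalone sketch that
counts what each variant of a class receives. The index constants and the
addr == (i - AB_INDEX) / 2 layout are assumptions made for the sketch, not
taken from the driver's vc4_regs[] table:

#include <stdio.h>

#define ACC_INDEX 0
#define AB_INDEX  5   /* r0-r3 and r4 come first in this sketch */

int main(void)
{
        int any[2] = {0, 0}, a_or_b_or_acc[2] = {0, 0};

        /* r0-r3 join both variants of a_or_b_or_acc, as in the hunk above. */
        for (int i = ACC_INDEX; i < ACC_INDEX + 4; i++) {
                a_or_b_or_acc[0]++;
                a_or_b_or_acc[1]++;
        }

        /* A/B: 64 entries, addr 0..31 per file, bottom half is addr < 16. */
        for (int i = AB_INDEX; i < AB_INDEX + 64; i++) {
                int addr = (i - AB_INDEX) / 2;
                if (addr == 31)
                        continue;   /* skipped, as in the hunk above */
                any[0]++; a_or_b_or_acc[0]++;
                if (addr < 16) {
                        any[1]++; a_or_b_or_acc[1]++;
                }
        }

        /* Expect: any = 62 vs 32, a_or_b_or_acc = 66 vs 36. */
        printf("any: %d vs %d\n", any[0], any[1]);
        printf("a_or_b_or_acc: %d vs %d\n", a_or_b_or_acc[0], a_or_b_or_acc[1]);
        return 0;
}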
@@ -252,6 +278,17 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                         class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3;
                         break;
+
+                case QOP_THRSW:
+                        /* All accumulators are invalidated across a thread
+                         * switch.
+                         */
+                        for (int i = 0; i < c->num_temps; i++) {
+                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
+                                        class_bits[i] &= ~(CLASS_BIT_R0_R3 |
+                                                           CLASS_BIT_R4);
+                        }
+                        break;
                 default:
                         break;
                 }
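
The straddle test above is the heart of the second requirement. Here is a
standalone illustration of the mask math, with CLASS_BIT_* values assumed
for the sketch (the driver's actual values may differ): a temp live across
the switch is left with only the A and B file bits, which is why the next
hunk has to add a CLASS_BIT_A | CLASS_BIT_B case.

#include <stdint.h>
#include <stdio.h>

#define CLASS_BIT_A     (1 << 0)
#define CLASS_BIT_B     (1 << 1)
#define CLASS_BIT_R4    (1 << 2)
#define CLASS_BIT_R0_R3 (1 << 3)

int main(void)
{
        /* Three temps: one dies before the thread switch, one is live
         * across it, one is born after it. */
        int thrsw_ip = 10;
        int temp_start[] = { 2, 5, 12 };
        int temp_end[]   = { 8, 14, 20 };
        uint8_t class_bits[3];

        for (int i = 0; i < 3; i++)
                class_bits[i] = CLASS_BIT_A | CLASS_BIT_B |
                                CLASS_BIT_R4 | CLASS_BIT_R0_R3;

        /* Only the temp whose live range straddles the switch loses the
         * accumulator bits; it must land in the A or B file. */
        for (int i = 0; i < 3; i++) {
                if (temp_start[i] < thrsw_ip && temp_end[i] > thrsw_ip)
                        class_bits[i] &= ~(CLASS_BIT_R0_R3 | CLASS_BIT_R4);
        }

        for (int i = 0; i < 3; i++)
                printf("temp %d: class_bits 0x%x\n", i, class_bits[i]);
        /* prints 0xf, 0x3 (A|B only), 0xf */
        return 0;
}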
@@ -288,21 +325,38 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 switch (class_bits[i]) {
                 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3:
-                        ra_set_node_class(g, node, vc4->reg_class_any);
+                        ra_set_node_class(g, node,
+                                          vc4->reg_class_any[c->fs_threaded]);
                         break;
+                case CLASS_BIT_A | CLASS_BIT_B:
+                        ra_set_node_class(g, node,
+                                          vc4->reg_class_a_or_b[c->fs_threaded]);
+                        break;
                 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3:
-                        ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc);
+                        ra_set_node_class(g, node,
+                                          vc4->reg_class_a_or_b_or_acc[c->fs_threaded]);
                         break;
                 case CLASS_BIT_A | CLASS_BIT_R4:
-                        ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+                        ra_set_node_class(g, node,
+                                          vc4->reg_class_r4_or_a[c->fs_threaded]);
                         break;
                 case CLASS_BIT_A:
-                        ra_set_node_class(g, node, vc4->reg_class_a);
+                        ra_set_node_class(g, node,
+                                          vc4->reg_class_a[c->fs_threaded]);
                         break;
                 case CLASS_BIT_R0_R3:
                         ra_set_node_class(g, node, vc4->reg_class_r0_r3);
                         break;
                 default:
+                        /* DDX/DDY used across a thread switch might get us
+                         * here.
+                         */
+                        if (c->fs_threaded) {
+                                c->failed = true;
+                                return NULL;
+                        }
+
                         fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
                                 i, class_bits[i]);
                         abort();
@@ -323,8 +377,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         bool ok = ra_allocate(g);
         if (!ok) {
-                fprintf(stderr, "Failed to register allocate:\n");
-                qir_dump(c);
+                if (!c->fs_threaded) {
+                        fprintf(stderr, "Failed to register allocate:\n");
+                        qir_dump(c);
+                }
+                c->failed = true;
                 return NULL;
         }
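
Both failure paths now set c->failed and return NULL instead of aborting,
which implies a caller that retries the whole compile without threading
(registers are scarcer in threaded mode, so allocation can legitimately
fail there). A hypothetical caller-side sketch, not part of this diff —
every name except fs_threaded and failed is a stand-in:

#include <stdbool.h>

struct vc4_compile { bool fs_threaded; bool failed; };

/* Stand-in for the QIR->QPU pipeline that ends in register allocation. */
static void compile_shader(struct vc4_compile *c)
{
        /* ... qir generation, vc4_register_allocate(), etc. ... */
}

static bool try_compile(struct vc4_compile *c, bool threaded)
{
        c->fs_threaded = threaded;
        c->failed = false;
        compile_shader(c);
        return !c->failed;
}

bool compile_with_fallback(struct vc4_compile *c)
{
        /* Prefer the threaded variant; on failure, drop back to the full
         * register file and accumulators. */
        return try_compile(c, true) || try_compile(c, false);
}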