diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index 3eceeb8f7bd..e40298edb3d 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -1,5 +1,6 @@ @@ -790,7 +791,8 @@ instruction isn't generated yet, but it may also be because there is a duplicate instruction in the Bifrost or pseudo XML files --> - + + Do nothing. Useful at the start of a block for waiting on slots required by the first actual instruction of the block, to reconcile dependencies @@ -798,7 +800,8 @@ - + + Branches to a specified relative offset if its source is nonzero (default) or if its source is zero (if `.eq` is set). The offset is 27-bits and @@ -820,7 +823,8 @@ - + + Evaluates the given condition, and if it passes, discards the current fragment and terminates the thread. Only valid in a **fragment** shader. @@ -830,7 +834,8 @@ Right value to compare - + + Jump to an indirectly specified (absolute or relative) address. Used to jump to blend shaders at the end of a fragment shader. @@ -842,7 +847,8 @@ - + + General-purpose barrier. Must use slot #7. Must be paired with a `.wait` flow on the instruction. @@ -851,8 +857,12 @@ - - + + + + + + Evaluates the given condition and outputs either the true source or the false source. @@ -865,10 +875,18 @@ - - - - + + + + + + + + + + + + Evaluates the given condition and outputs either the true source or the false source. @@ -885,7 +903,8 @@ Return value if false - + + @@ -899,8 +918,12 @@ Interpolates a given varying from hardware buffer - - + + + + + + @@ -915,8 +938,12 @@ Interpolates a given varying from hardware buffer - - + + + + + + @@ -929,7 +956,8 @@ - + + Interpolates a given varying from a software buffer @@ -942,7 +970,8 @@ Varying index and table - + + Interpolates a given varying from a software buffer @@ -956,7 +985,8 @@ - + + Fetches a given varying from a software buffer @@ -966,7 +996,8 @@ Varying index and table - + + Fetches a given varying from a software buffer @@ -977,7 +1008,8 @@ - + + Load `vecsize` components from the attribute descriptor at entry `index` of resource table `table` at index (vertex ID, instance ID), converting @@ -995,7 +1027,8 @@ - + + Load `vecsize` components from the attribute descriptor at the specified location at index (vertex ID, instance ID), converting @@ -1014,14 +1047,16 @@ Index and table - + + Load the 64-bit global clock, either a cycle counter or the system clock. - + + Load `vecsize` components from the texture descriptor at entry `index` of resource table `table`, converting @@ -1039,7 +1074,8 @@ - + + Load `vecsize` components from the texture descriptor at the specified location at index, converting @@ -1056,7 +1092,8 @@ Index and table - + + Load the effective address of an attribute specified with the given immediate index. Returns three staging register: the low/high @@ -1072,7 +1109,8 @@ - + + Load the effective address of an attribute specified with the given index. Returns three staging register: the low/high @@ -1088,7 +1126,8 @@ Attribute index and table - + + Load the effective address of a texel from the image specified with the given immediate index. Returns three staging registers: the low/high @@ -1109,7 +1148,8 @@ - + + Load the effective address of a texel from the image specified with the given index. Returns three staging register: the low/high @@ -1130,7 +1170,8 @@ Index and table - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1147,7 +1188,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1164,7 +1206,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1181,7 +1224,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1198,7 +1242,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1215,7 +1260,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1232,7 +1278,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1249,7 +1296,8 @@ Mode descriptor - + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1267,7 +1315,8 @@ - + + Load effective address of a buffer with an offset added. @@ -1278,7 +1327,8 @@ Mode descriptor - + + Load effective address of a buffer with an immediate offset added. @@ -1290,7 +1340,8 @@ Structure index - + + Loads from main memory @@ -1302,7 +1353,8 @@ - + + Loads from main memory @@ -1314,7 +1366,8 @@ - + + Loads from main memory @@ -1326,7 +1379,8 @@ - + + Loads from main memory @@ -1338,7 +1392,8 @@ - + + Loads from main memory @@ -1350,7 +1405,8 @@ - + + Loads from main memory @@ -1362,7 +1418,8 @@ - + + Loads from main memory @@ -1374,7 +1431,8 @@ - + + Loads from main memory @@ -1386,7 +1444,8 @@ - + + Stores to main memory @@ -1404,7 +1463,8 @@ - + + Load effective address of a simple buffer with an offset added. @@ -1415,7 +1475,8 @@ Index - + + Load from memory with data conversion. The address to load from is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1432,7 +1493,8 @@ Internal conversion descriptor - + + Store to memory with data conversion. The address to store to is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1451,7 +1513,8 @@ Internal conversion descriptor - + + Loads a given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1468,7 +1531,8 @@ Conversion descriptor - + + Store to given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1484,7 +1548,8 @@ Conversion descriptor - + + Blends a given render target. This loads the API-specified blend state for the render target from the first source. Blend descriptors are available @@ -1520,7 +1585,8 @@ - + + Does alpha-to-coverage testing, updating the sample coverage mask. ATEST does not do an implicit discard. It should be executed before the first @@ -1533,7 +1599,8 @@ - + + Programatically writes out depth, stencil, or both, depending on which modifiers are set. Used to implement gl_FragDepth and gl_FragStencil. @@ -1548,7 +1615,8 @@ - + + Performs the given data conversion. Note that floating-point rounding is handled via the same hardware and therefore shares an encoding. Round mode @@ -1569,7 +1637,8 @@ Value to convert - + + Performs the given data conversion. @@ -1586,7 +1655,8 @@ Value to convert - + + Performs the given data conversion. @@ -1594,7 +1664,8 @@ Value to convert - + + Performs the given data conversion. @@ -1608,13 +1679,15 @@ Value to convert - + + Converts up with the specified round mode. Value to convert - + + Performs the given data conversion. @@ -1632,7 +1705,8 @@ Value to convert - + + Performs the given data conversion. @@ -1649,7 +1723,8 @@ Value to convert - + + Performs the given rounding, using the convert unit. @@ -1663,33 +1738,38 @@ Value to convert - + + Canonical register-to-register move. - + + Used as a primitive for various bitwise operations. - + + Used as a primitive for various bitwise operations. - + + Used as a primitive for various bitwise operations. - + + 64-bit abs may be constructed in 4 instructions (5 clocks) by checking the sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with @@ -1698,16 +1778,19 @@ - + + - + + - + + Only available as 32-bit. Smaller bitsizes require explicit conversions. 64-bit popcount may be constructed in 3 clocks by separate 32-bit @@ -1717,28 +1800,32 @@ - + + Only available as 32-bit. Other bitsizes may be derived with swizzles. - + + For fully featured bitwise operation, see the shift opcodes. - + + For fully featured bitwise operation, see the shift opcodes. - + + Returns the mask of lanes ever active within the warp (subgroup), such that the source is nonzero. The number of work-items in a subgroup is @@ -1754,7 +1841,8 @@ - + + @@ -1769,7 +1857,8 @@ - + + @@ -1788,7 +1877,8 @@ - + + @@ -1810,7 +1900,8 @@ - + + @@ -1824,8 +1915,12 @@ - - + + + + + + $A + B$ @@ -1835,8 +1930,12 @@ - - + + + + + + $\min \{ A, B \}$ A @@ -1844,8 +1943,12 @@ - - + + + + + + $\max \{ A, B \}$ A @@ -1854,7 +1957,9 @@ - + + + Given a pair of 32-bit floats, output a pair of 16-bit floats packed into a 32-bit destination. @@ -1866,8 +1971,12 @@ - - + + + + + + Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate various special functions, particularly base-2 exponents. Special case @@ -1880,7 +1989,8 @@ - + + Calculates the base-2 exponent of an argument specified as a 8:24 fixed-point. The original argument is passed as well for correct handling @@ -1891,7 +2001,8 @@ Input as 32-bit float - + + Performs a floating-point addition specialized for logarithm computation. @@ -1900,7 +2011,8 @@ B - + + Used for `atan2()` implementation. Destination is two 16-bit values (int and float) for the first form, and a single 32-bit float when @@ -1918,38 +2030,71 @@ As Valhall lacks swizzle instructions, `IADD.v2i16` with zero is the canonical lowering for swizzles. - - + + + + + + - - - + + + + + + + + + - - - + + + + + + + + + A B - + + Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)` A B - - + + + + + + - - - + + + + + + + + + - - - + + + + + + + + + $A - B$ with optional saturation A B @@ -1964,7 +2109,9 @@ .neg indicates SEG_SUB instead. - + + + A @@ -1977,21 +2124,39 @@ 64-bit value A. These instructions accelerate address arithmetic, but may be used in full generality for 64-bit integer arithmetic. - - + + + + + + A B - - - - - - - + + + + + + + + + + + + + + + + + + + + + $A \cdot B$ with optional saturation. Note the multipliers can only handle up to 32-bit by 32-bit multiplies. The 64-bit "multiply" acts like IMUL.u32 but @@ -2006,12 +2171,24 @@ - - - - - - + + + + + + + + + + + + + + + + + + A B @@ -2022,7 +2199,8 @@ - + + Selects the value of A in the subgroup lane given by B. This implements subgroup broadcasts. It may be used as a primitive for screen space @@ -2036,8 +2214,12 @@ - - + + + + + + $A \cdot B + C$ @@ -2047,10 +2229,18 @@ - - - - + + + + + + + + + + + + Left shifts its first source by a specified amount and bitwise ANDs it with the @@ -2063,10 +2253,18 @@ - - - - + + + + + + + + + + + + Right shifts its first source by a specified amount and bitwise ANDs it with the @@ -2082,10 +2280,18 @@ - - - - + + + + + + + + + + + + Left shifts its first source by a specified amount and bitwise ORs it with the @@ -2098,10 +2304,18 @@ - - - - + + + + + + + + + + + + Right shifts its first source by a specified amount and bitwise ORs it with the @@ -2117,10 +2331,18 @@ - - - - + + + + + + + + + + + + Left shifts its first source by a specified amount and bitwise XORs it with the @@ -2133,10 +2355,18 @@ - - - - + + + + + + + + + + + + Right shifts its first source by a specified amount and bitwise XORs it with the @@ -2151,7 +2381,8 @@ B - + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2165,7 +2396,8 @@ Mask - + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2179,7 +2411,8 @@ Mask - + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2193,21 +2426,24 @@ Mask - + + During a cube map transform, select the S coordinate given a selected face. Z coordinate as 32-bit floating point X coordinate as 32-bit floating point Cube face index - + + During a cube map transform, select the T coordinate given a selected face. Y coordinate as 32-bit floating point Z coordinate as 32-bit floating point Cube face index - + + Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD. @@ -2222,21 +2458,24 @@ CD - + + Select the maximum absolute value of its arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point Z coordinate as 32-bit floating point - + + Select the cube face index corresponding to the arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point Z coordinate as 32-bit floating point - + + 8-bit integer dot product between 4 channel vectors, intended for machine learning. Available in both unsigned and signed variants, controlling @@ -2264,10 +2503,16 @@ for chaining together conditions without intermediate bitwise arithmetic; when this is not desired, tie it to zero. - - + + + + + + - + + + A @@ -2282,10 +2527,16 @@ one, integer minus one, or floating-point one). The third source is useful for chaining together conditions without intermediate bitwise arithmetic. - - + + + + + + - + + + A @@ -2301,8 +2552,12 @@ for chaining together conditions without intermediate bitwise arithmetic; when this is not desired, tie it to zero. - - + + + + + + A @@ -2317,8 +2572,12 @@ one, integer minus one, or floating-point one). The third source is useful for chaining together conditions without intermediate bitwise arithmetic. - - + + + + + + A @@ -2333,10 +2592,16 @@ one, integer minus one, or floating-point one). The third source is useful for chaining together conditions without intermediate bitwise arithmetic. - - + + + + + + - + + + A @@ -2351,10 +2616,16 @@ one, integer minus one, or floating-point one). The third source is useful for chaining together conditions without intermediate bitwise arithmetic. - - + + + + + + - + + + A @@ -2376,8 +2647,12 @@ result type on the low half, the `m1` result type on the high half, and the result of the low half comparison passed as the third source. - - + + + + + + A @@ -2385,7 +2660,8 @@ C - + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `IADD.i32` with a @@ -2398,7 +2674,8 @@ - + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2411,7 +2688,8 @@ - + + Adds an arbitrary quad of 8-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2423,7 +2701,8 @@ - + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `FADD.f32` with a @@ -2434,7 +2713,8 @@ - + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2446,7 +2726,8 @@ - + + @@ -2458,7 +2739,8 @@ - + + @@ -2470,7 +2752,8 @@ - + + @@ -2481,7 +2764,8 @@ - + + @@ -2492,7 +2776,8 @@ - + + @@ -2509,7 +2794,8 @@ - + + @@ -2526,7 +2812,8 @@ - + + Unfiltered textured instruction. @@ -2550,7 +2837,8 @@ - + + Ordinary texturing instruction using a sampler. @@ -2576,7 +2864,8 @@ - + + Texture gather instruction. @@ -2603,7 +2892,8 @@ - + + Texture sample with explicit gradient. @@ -2627,7 +2917,8 @@ - + + Pair of texture instructions. @@ -2650,7 +2941,8 @@ Image to read from - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -2672,7 +2964,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -2695,7 +2988,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -2718,7 +3012,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -2740,7 +3035,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -2762,7 +3058,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -2785,7 +3082,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -2808,7 +3106,8 @@ Varying offset - + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -2830,7 +3129,8 @@ Varying offset - + + First calculates $A \cdot B + C$ and then biases the exponent by D. Used in special transcendental function sequences. It should not be used for @@ -2845,7 +3145,8 @@ D - + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an @@ -2861,7 +3162,8 @@ D - + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply is treated as $A$ even if an @@ -2877,7 +3179,8 @@ D - + + First calculates $A \cdot B + C$ and then biases the exponent by D, interpreted as a 16-bit value. Used in special transcendental function diff --git a/src/panfrost/compiler/bifrost/valhall/asm.py b/src/panfrost/compiler/bifrost/valhall/asm.py index 8301789bb06..560a2b1834d 100644 --- a/src/panfrost/compiler/bifrost/valhall/asm.py +++ b/src/panfrost/compiler/bifrost/valhall/asm.py @@ -315,7 +315,7 @@ def parse_asm(line): operands = operands[len(ins.immediates):] # Encode the operation itself - encoded |= (ins.opcode << 48) + encoded |= (ins.opcode.value << ins.opcode.start) encoded |= (ins.opcode2 << ins.secondary_shift) # Encode FAU page diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py index 58ac0de8c4e..fca5f87f877 100644 --- a/src/panfrost/compiler/bifrost/valhall/disasm.py +++ b/src/panfrost/compiler/bifrost/valhall/disasm.py @@ -271,7 +271,7 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) # Bucket by opcode for hierarchical disassembly OPCODE_BUCKETS = {} for ins in instructions: - opc = ins.opcode + opc = ins.opcode.value OPCODE_BUCKETS[opc] = OPCODE_BUCKETS.get(opc, []) + [ins] # Check that each bucket may be disambiguated diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index ea1a559b072..47f7d4a328a 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -147,7 +147,7 @@ valhall_opcodes[BI_NUM_OPCODES] = { # Exact value to be ORed in to every opcode def exact_op(op): - return (op.opcode << 48) | (op.opcode2 << op.secondary_shift) + return (op.opcode.value << op.opcode.start) | (op.opcode2 << op.secondary_shift) try: print(Template(template).render(immediates = immediates, instructions = instructions, skip = SKIP, exact = exact_op, typesize = typesize)) diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py index a3bd52e5191..0ac517a47e4 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.py @@ -157,6 +157,12 @@ class Immediate: self.size = size self.signed = signed +class Opcode: + def __init__(self, value, start, mask): + self.value = value + self.start = start + self.mask = mask + class Instruction: def __init__(self, name, opcode, opcode2, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): self.name = name @@ -179,7 +185,7 @@ class Instruction: self.secondary_mask |= 0x100 if len(srcs) == 3 and (srcs[1].widen or srcs[1].lanes or srcs[1].swizzle): self.secondary_mask &= ~0xC # conflicts - if opcode == 0x90: + if opcode.value == 0x90: # XXX: XMLify this, but disambiguates sign of conversions self.secondary_mask |= 0x10 if name.startswith("LOAD.i") or name.startswith("STORE.i") or name.startswith("LD_PKA.i"): @@ -238,14 +244,22 @@ def build_modifier(el): return Modifier(name, start, size, implied) +def build_opcode(el, name): + opcode = el.find(name) + if opcode is None: + return None + value = int(opcode.get('val'), base=0) + start = int(opcode.get('start')) + mask = int(opcode.get('mask'), base=0) + return Opcode(value, start, mask) + # Build a single instruction from XML and group based overrides def build_instr(el, overrides = {}): # Get overridables name = overrides.get('name') or el.attrib.get('name') - opcode = overrides.get('opcode') or el.attrib.get('opcode') + opcode = overrides.get('opcode') or build_opcode(el, 'opcode') opcode2 = overrides.get('opcode2') or el.attrib.get('opcode2') unit = overrides.get('unit') or el.attrib.get('unit') - opcode = int(opcode, base=0) opcode2 = int(opcode2, base=0) if opcode2 else None # Get explicit sources/dests @@ -295,7 +309,7 @@ def build_group(el): for ins in el.findall('ins'): build_instr(el, overrides = { 'name': ins.attrib['name'], - 'opcode': ins.attrib.get('opcode'), + 'opcode': build_opcode(ins, 'opcode'), 'opcode2': ins.attrib.get('opcode2'), 'unit': ins.attrib.get('unit'), }) @@ -335,16 +349,16 @@ def safe_name(name): return name.lower() # Parses out the size part of an opcode name -def typesize(opcode): - if opcode[-3:] == '128': +def typesize(name): + if name[-3:] == '128': return 128 - if opcode[-2:] == '48': + if name[-2:] == '48': return 48 - elif opcode[-1] == '8': + elif name[-1] == '8': return 8 else: try: - return int(opcode[-2:]) + return int(name[-2:]) except: return 32