#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)

26
import re
27

28
29
30
31
32
33
34
35
# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size is an integer in the range 0..4; when it is non-zero,
        every entry of input_sizes must be non-zero as well
      - input_sizes is a list of integers in the range 0..4, one per input
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types, the same length as input_sizes
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      Every argument is validated with assert, so a malformed opcode
      definition raises AssertionError.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      # Check every entry rather than only the first one.
      assert all(isinstance(size, int) for size in input_sizes)
      assert isinstance(input_types, list)
      assert all(isinstance(type_, str) for type_ in input_types)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
92
93
94
95

# helper variables for strings
#
# Names without a numeric suffix are unsized base types; names with a suffix
# carry an explicit bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
107

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Splits a NIR type string into its base type and optional bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def _split_type(type_):
    """Return (base_type, bits) for a NIR type string.

    bits is the digit string following the base type, or None when the
    type string carries no bit size.  Asserts if type_ does not start with
    a known base type.
    """
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('type'), m.group('bits')

def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size."""
    return _split_type(type_)[1] is not None

def type_size(type_):
    """Return the bit size of a sized NIR type string as an int.

    Asserts if the type string has no bit size.
    """
    bits = _split_type(type_)[1]
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)

def type_sizes(type_):
    """Return the list of bit sizes the NIR type string can take.

    A sized type yields a single-element list; unsized types yield the
    per-base-type lists hard-coded below.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    elif type_ == 'bool':
        return [1, 32]
    elif type_ == 'float':
        return [16, 32, 64]
    else:
        return [1, 8, 16, 32, 64]

def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool')."""
    return _split_type(type_)[0]

137
138
139
140
141
142
143
# Algebraic property names.  Each carries a trailing space so that several
# can be concatenated with "+" into one space-separated string.
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
   """Build an Opcode from the arguments and register it in opcodes.

   The arguments are forwarded unchanged to the Opcode constructor.
   Asserts if an opcode called name has already been registered.
   """
   assert name not in opcodes
   opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                          input_types, is_conversion, algebraic_properties,
                          const_expr)
149

150
def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-input opcode whose output type may differ from its
   input type.  Output and input sizes are both 0 and the opcode is not
   flagged as a conversion.
   """
   opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
152
153

def unop(name, ty, const_expr):
   """Register a one-input opcode whose input and output share type ty.
   Output and input sizes are both 0.
   """
   opcode(name, 0, ty, [0], [ty], False, "", const_expr)
155
156
157

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-input opcode with explicit output and input sizes."""
   opcode(name, output_size, output_type, [input_size], [input_type],
          False, "", const_expr)
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a reduction opcode.

   - prereduce_expr is a format string applied to each component; it is
     given the component as {src} and its result is parenthesized
   - reduce_expr is a format string that combines two partial results,
     given as {src0} and {src1}
   - final_expr is a format string applied to the parenthesized reduction,
     given as {src}

   The variants are registered through unop_horiz as name + "2",
   name + "3" and name + "4".
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def emit(num_components, reduced):
      unop_horiz(name + str(num_components), output_size, output_type,
                 num_components, input_type,
                 final_expr.format(src="(" + reduced + ")"))

   x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                 for comp in "xyzw"]
   xy = combine(x, y)
   emit(2, xy)
   emit(3, combine(xy, z))
   # The four-component form pairs (x, y) with (z, w).
   emit(4, combine(xy, combine(z, w)))
179

180
181
def unop_numeric_convert(name, out_type, in_type, const_expr):
   """Register a one-input opcode flagged as a type conversion.

   Same as unop_convert except that is_conversion is True.
   """
   size = 0
   is_conversion = True
   opcode(name, size, out_type, [size], [in_type], is_conversion, "",
          const_expr)
182
183
184

# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# The float expressions below pick double or single precision literals and
# libm functions based on bit_size.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): these two use the single-precision libm functions for every
# bit size, unlike fsqrt/frsq above -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
207
208

# Generate all of the numeric conversion opcodes
#
# Opcode names are "<src>2<dst><bits>", built from the first letter of the
# source and destination base types, e.g. "i2f32".  The float -> 16-bit
# float conversions additionally get "_rtne" and "_rtz" suffixed variants.
for src_t, dst_types in [(tint, [tfloat, tint, tbool]),
                         (tuint, [tfloat, tuint]),
                         (tfloat, [tint, tuint, tfloat, tbool]),
                         (tbool, [tfloat, tint])]:
   for dst_t in dst_types:
      for bit_size in type_sizes(dst_t):
         if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
               unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                           bit_size, rnd_mode),
                                    dst_t + str(bit_size), src_t, "src0")
         else:
            # Conversions to bool test against zero instead of truncating.
            conv_expr = "src0 != 0" if dst_t == tbool else "src0"
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                                 dst_t + str(bit_size), src_t, conv_expr)
231

232

233
234
235
# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a signed zero; everything else is
# round-tripped through a half float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
263
264
265
266


# Floating point pack and unpack operations.

267
def pack_2x16(fmt):
   """Register pack_<fmt>_2x16: two 32-bit floats packed into one uint32.

   The x component goes in the low 16 bits and y in the high 16 bits.
   Every "fmt" in the C template is replaced by the format name, so the
   expression calls pack_<fmt>_1x16().
   """
   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
   """Register pack_<fmt>_4x8: four 32-bit floats packed into one uint32.

   Components x, y, z, w land in bits 0-7, 8-15, 16-23 and 24-31.  Every
   "fmt" in the C template is replaced by the format name, so the
   expression calls pack_<fmt>_1x8().
   """
   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
   """Register unpack_<fmt>_2x16: one uint32 unpacked into two 32-bit floats.

   dst.x comes from the low 16 bits and dst.y from the high 16 bits.  Every
   "fmt" in the C template is replaced by the format name, so the
   expression calls unpack_<fmt>_1x16().
   """
   # The high half must be shifted *down*; shifting left by 16 and casting
   # to uint16_t would always yield 0.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
   """Register unpack_<fmt>_4x8: one uint32 unpacked into four 32-bit floats.

   dst.x, dst.y, dst.z, dst.w come from bits 0-7, 8-15, 16-23 and 24-31.
   Every "fmt" in the C template is replaced by the format name, so the
   expression calls unpack_<fmt>_1x8().
   """
   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


# Instantiate the float pack/unpack opcodes for each supported format.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Integer packing: component x is the least significant part of the result.
unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Integer unpacking: the source has a single component, so every output
# component is extracted from src0.x.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
349
350
351
352

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# The find_* opcodes return -1 when no matching bit exists.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
400
401


402
403
# fnoise<i>_<j>: i output components from j input components.  The constant
# expression is simply 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

# NOTE(review): unlike cube_face_coord, dst.x is not initialized before the
# if chain, so it is left unset when no condition holds (e.g. NaN inputs).
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")


434
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-input opcode whose inputs share in_type and whose
   output is out_type.  Output and input sizes are all 0.
   """
   opcode(name, 0, out_type, [0, 0], [in_type, in_type],
          False, alg_props, const_expr)
437

438
439
def binop(name, ty, alg_props, const_expr):
   """Register a two-input opcode whose inputs and output all have type ty."""
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
440

441
442
443
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-input comparison: inputs of type ty, tbool1 result."""
   binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)

444
def binop_compare32(name, ty, alg_props, const_expr):
   """Register a two-input comparison: inputs of type ty, tbool32 result."""
   binop_convert(name, tbool32, ty, alg_props, const_expr)
446
447

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-input opcode with explicit sizes and types for the
   output and for each input.  No algebraic properties are set.
   """
   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
          False, "", const_expr)
451
452
453
454
455
456
457
458
459
460
461
462
463
464

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a two-input reduction.

   - prereduce_expr is a format string applied to each pair of matching
     components, given as {src0} and {src1}; its result is parenthesized
   - reduce_expr is a format string that combines two partial results,
     given as {src0} and {src1}
   - final_expr is a format string applied to the parenthesized reduction,
     given as {src}

   The variants are registered as name + "2", name + "3" and name + "4",
   all flagged commutative.
   """
   def final(src):
      return final_expr.format(src="(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   def prereduce(src0, src1):
      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
   src0 = prereduce("src0.x", "src1.x")
   src1 = prereduce("src0.y", "src1.y")
   src2 = prereduce("src0.z", "src1.z")
   src3 = prereduce("src0.w", "src1.w")
   opcode(name + "2", output_size, output_type,
          [2, 2], [src_type, src_type], False, commutative,
          final(reduce_(src0, src1)))
   opcode(name + "3", output_size, output_type,
          [3, 3], [src_type, src_type], False, commutative,
          final(reduce_(reduce_(src0, src1), src2)))
   # The four-component form pairs (x, y) with (z, w).
   opcode(name + "4", output_size, output_type,
          [4, 4], [src_type, src_type], False, commutative,
          final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
473

474
475
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
# Signed saturating add is commutative but NOT associative: with MAX the
# largest value, (MAX + 1) + -1 saturates to MAX - 1 while MAX + (1 + -1)
# is MAX.  It must therefore not be flagged associative.
binop("iadd_sat", tint, commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# NOTE(review): UINT_MAX is the maximum of C "unsigned int"; confirm this
# saturates correctly for bit sizes other than that type's width.
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? UINT_MAX : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
              "(uint64_t)src0 * (uint64_t)src1")

503
# high 32-bits of signed integer multiply
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
543

544
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero yields 0.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
557

558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)      >> 1)
#
# The xor term is subtracted, as derived above.  Example: x = 1, y = 2
# gives (1 + 2 + 1) >> 1 = 2 and (1 | 2) - ((1 ^ 2) >> 1) = 3 - 1 = 2.
binop("irhadd", tint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")

584
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto the C remainder when the remainder is non-zero
# and the operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

601
602
603
604
605
606
607
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
# The same comparisons again, producing a bool32 result.
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, commutative, "src0 == src1")
binop_compare32("fne32", tfloat, commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, commutative, "src0 == src1")
binop_compare32("ine32", tint, commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")
628
629
630

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# NOTE(review): sge is declared with the unsized tfloat while its three
# siblings use tfloat32 -- confirm whether that difference is intended.
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
663

664
665
666
# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
#
# The shift count is always a 32-bit unsigned value and is masked to the
# width of src0 before shifting.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
673
674
675
676
677
678
679

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products.  The "_replicated" variants have an output size of 4
# instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: three-component dot product plus the w component of src1.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

708
709
# Two-operand minimum and maximum.  Only the integer variants are flagged
# commutative and associative.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
714

715
# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

760
# Use the double-precision pow() for 64-bit operands and the
# single-precision powf() otherwise, matching the other float opcodes.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

# The *_split packing opcodes take the low and high halves as two separate
# sources; src0 is the low half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# src0 is the field width in bits and src1 the field offset; the result is a
# mask of `bits` ones starting at bit `offset`. Out-of-range arguments fold
# to 0 instead of performing an out-of-range C shift.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp with a float first source and a 32-bit integer second source; the
# 64-bit case folds with ldexp() and every other bit size with ldexpf().
# NOTE(review): isnormal() is false for zero, infinity and NaN as well as for
# denormals, so all of those results are replaced by a zero carrying src0's
# sign -- confirm that is what is wanted for overflowing exponents.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0 is returned (0 is the least significant);
# the C cast keeps only that byte, zero-extending it for the unsigned form
# and sign-extending it for the signed form.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same as above with 16-bit words: src1 selects the word of src0.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, const_expr):
   """Declare a three-source opcode whose sources and result share one type.

   - name is the opcode name handed to opcode()
   - ty is used for the output and for each of the three inputs
   - const_expr is the C expression used for constant folding

   The output size and all three input sizes are passed as 0, and the two
   arguments before const_expr are passed as False and "" (presumably
   is_conversion and algebraic_properties, assuming opcode() takes its
   arguments in the same order as Opcode.__init__).
   """
   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Declare a three-source opcode with explicit per-operand sizes.

   - name is the opcode name handed to opcode()
   - output_size is the size passed for the result
   - src1_size, src2_size and src3_size are the sizes passed for the three
     inputs, in order
   - const_expr is the C expression used for constant folding

   The output and all three inputs are typed tuint, and the two arguments
   before const_expr are passed as False and "" (presumably is_conversion
   and algebraic_properties, assuming opcode() takes its arguments in the
   same order as Opcode.__init__).
   """
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size],
          [tuint, tuint, tuint], False, "", const_expr)

# Multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 weighted by src2: a weight of 0
# yields src0 and a weight of 1 yields src1.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating point version: any src0 other than 0.0 selects src1,
# otherwise src2 is returned.


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
#
# Each family comes in a float (f), signed integer (i) and unsigned integer
# (u) flavour; the integer flavours fold with the MIN2/MAX2 macros and the
# float flavours with fminf()/fmaxf().

# Smallest of the three sources.
triop("fmin3", tfloat,
      const_expr="fminf(src0, fminf(src1, src2))")
triop("imin3", tint,
      const_expr="MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint,
      const_expr="MIN2(src0, MIN2(src1, src2))")

# Largest of the three sources.
triop("fmax3", tfloat,
      const_expr="fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint,
      const_expr="MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint,
      const_expr="MAX2(src0, MAX2(src1, src2))")

# Median: max(src0, src1) is capped from above by src2 and then from below by
# min(src0, src1).
triop("fmed3", tfloat,
      const_expr="fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint,
      const_expr="MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint,
      const_expr="MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

# Boolean-condition selects: src0 is the condition (typed tbool1 for bcsel,
# tbool32 for b32csel); a true condition picks src1, otherwise src2.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")

# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base. The insert
# value is shifted left once for every trailing zero bit of the mask, which
# lines its low bits up with the lowest set bit of the mask; the masked
# result then replaces those bits of base. An all-zero mask returns base
# unchanged, which also keeps the alignment loop from running forever.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# ubfe: unsigned bit-field extract of `bits` bits of src0 starting at bit
# `offset`. When the field ends below bit 32 it is shifted up to the top of
# the word and back down, so the logical right shift zero-fills; otherwise
# everything from `offset` upwards is returned.
# NOTE(review): an offset of 32 or more reaches `base >> offset`, which C
# leaves undefined -- confirm callers never constant-fold such an offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# ibfe: signed counterpart of ubfe. base is a signed int, so the right
# shifts are expected to be arithmetic and sign-extend the extracted field
# (this relies on the C compiler's behaviour for signed shifts).
# NOTE(review): an offset of 32 or more reaches `base >> offset`, which C
# leaves undefined -- confirm callers never constant-fold such an offset.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
#
# Unsigned form: returns `bits` bits of src0 starting at bit `offset`,
# zero-extended. The mask is built from 1ull so that bits == 32 does not
# shift a 32-bit one out of range.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed form of GLSL bitfieldExtract(): returns `bits` bits of src0 starting
# at bit `offset`, sign-extended from the top bit of the field.
#
# The field is first shifted left so that its top bit lands in bit 31, then
# shifted right by (32 - bits) so that its lowest bit lands in bit 0 while
# the arithmetic shift replicates the sign. Shifting right by `offset`
# instead would leave the field in the wrong position (e.g. extracting 4 bits
# at offset 4 from 0xf0 must give -1, not 0xff000000).
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source opcode with explicit per-operand sizes.

   - name is the opcode name handed to opcode()
   - output_size is the size passed for the result
   - src1_size .. src4_size are the sizes passed for the four inputs, in
     order
   - const_expr is the C expression used for constant folding

   The output and all four inputs are typed tuint, and the two arguments
   before const_expr are passed as False and "" (presumably is_conversion
   and algebraic_properties, assuming opcode() takes its arguments in the
   same order as Opcode.__init__).
   """
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size, src4_size],
          [tuint, tuint, tuint, tuint],
          False, "", const_expr)

# Replaces `bits` bits of src0 (base), starting at bit `offset`, with the low
# bits of src1 (insert). A zero-width field returns base unchanged and
# out-of-range arguments fold to 0. The mask is built from 1ull so that
# bits == 32 does not shift a 32-bit one out of range.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")