/*
 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/format/u_format.h"
#include "util/bitset.h"

#include "freedreno_program.h"

#include "fd6_program.h"
#include "fd6_const.h"
#include "fd6_emit.h"
#include "fd6_texture.h"
#include "fd6_format.h"
#include "fd6_pack.h"

void
fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring,
				const struct ir3_shader_variant *so)
{
	enum a6xx_state_block sb = fd6_stage2shadersb(so->type);

	uint32_t first_exec_offset = 0;
	uint32_t instrlen = 0;

	switch (so->type) {
	case MESA_SHADER_VERTEX:
		first_exec_offset = REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET;
		instrlen = REG_A6XX_SP_VS_INSTRLEN;
		break;
	case MESA_SHADER_TESS_CTRL:
		first_exec_offset = REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET;
		instrlen = REG_A6XX_SP_HS_INSTRLEN;
		break;
	case MESA_SHADER_TESS_EVAL:
		first_exec_offset = REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET;
		instrlen = REG_A6XX_SP_DS_INSTRLEN;
		break;
	case MESA_SHADER_GEOMETRY:
		first_exec_offset = REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET;
		instrlen = REG_A6XX_SP_GS_INSTRLEN;
		break;
	case MESA_SHADER_FRAGMENT:
		first_exec_offset = REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET;
		instrlen = REG_A6XX_SP_FS_INSTRLEN;
		break;
	case MESA_SHADER_COMPUTE:
	case MESA_SHADER_KERNEL:
		first_exec_offset = REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET;
		instrlen = REG_A6XX_SP_CS_INSTRLEN;
		break;
	case MESA_SHADER_TASK:
	case MESA_SHADER_MESH:
	case MESA_SHADER_RAYGEN:
	case MESA_SHADER_ANY_HIT:
	case MESA_SHADER_CLOSEST_HIT:
	case MESA_SHADER_MISS:
	case MESA_SHADER_INTERSECTION:
	case MESA_SHADER_CALLABLE:
		unreachable("Unsupported shader stage");
	case MESA_SHADER_NONE:
		unreachable("");
	}

#ifdef DEBUG
	/* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */
	const char *name = so->shader->nir->info.name;
	if (name)
		fd_emit_string5(ring, name, strlen(name));
#endif

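	/* Private memory ("pvtmem") is the scratch space used for per-fiber
	 * spills/temps.  The BO is allocated lazily and only ever grown, and is
	 * sized for all fibers on all SP cores, so later shaders with the same
	 * layout (per-wave vs per-fiber) can re-use it.
	 */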
	uint32_t fibers_per_sp = ctx->screen->info.fibers_per_sp;
	uint32_t num_sp_cores = ctx->screen->info.num_sp_cores;

	uint32_t per_fiber_size = ALIGN(so->pvtmem_size, 512);
	if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) {
		if (ctx->pvtmem[so->pvtmem_per_wave].bo)
			fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);
		ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
		uint32_t total_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12)
			* num_sp_cores;
		ctx->pvtmem[so->pvtmem_per_wave].bo =
			fd_bo_new(ctx->screen->dev, total_size,
					  DRM_FREEDRENO_GEM_TYPE_KMEM, "pvtmem_%s_%d",
					  so->pvtmem_per_wave ? "per_wave" : "per_fiber",
					  per_fiber_size);
	} else {
		per_fiber_size = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size;
	}

	uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);

	OUT_PKT4(ring, instrlen, 1);
	OUT_RING(ring, so->instrlen);

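	/* Seven dwords: SP_xS_OBJ_FIRST_EXEC_OFFSET, SP_xS_OBJ_START lo/hi,
	 * SP_xS_PVT_MEM_PARAM, SP_xS_PVT_MEM_ADDR lo/hi, SP_xS_PVT_MEM_SIZE:
	 */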
	OUT_PKT4(ring, first_exec_offset, 7);
	OUT_RING(ring, 0);	/* SP_xS_OBJ_FIRST_EXEC_OFFSET */
	OUT_RELOC(ring, so->bo, 0, 0, 0);	/* SP_xS_OBJ_START_LO */
	OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size));
	if (so->pvtmem_size > 0) {	/* SP_xS_PVT_MEM_ADDR */
		OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0);
	} else {
		OUT_RING(ring, 0);
		OUT_RING(ring, 0);
	}
	OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) |
			       COND(so->pvtmem_per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));

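	/* Emit a CP_LOAD_STATE6 pointing at the shader BO (SS6_INDIRECT), so the
	 * instructions are fetched from memory rather than embedded in the
	 * command stream:
	 */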
	OUT_PKT7(ring, fd6_stage2opcode(so->type), 3);
	OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
			CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
			CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
			CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen));
	OUT_RELOC(ring, so->bo, 0, 0, 0);
}

/* Add any missing varyings needed for stream-out.  Otherwise varyings not
 * used by fragment shader will be stripped out.
 */
static void
link_stream_out(struct ir3_shader_linkage *l, const struct ir3_shader_variant *v)
{
	const struct ir3_stream_output_info *strmout = &v->shader->stream_output;

	/*
	 * First, any stream-out varyings not already in linkage map (ie. also
	 * consumed by frag shader) need to be added:
	 */
	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		const struct ir3_stream_output *out = &strmout->output[i];
		unsigned k = out->register_index;
		unsigned compmask =
			(1 << (out->num_components + out->start_component)) - 1;
		unsigned idx, nextloc = 0;

		/* psize/pos need to be the last entries in linkage map, and will
		 * get added after link_stream_out(), so skip over them:
		 */
		if ((v->outputs[k].slot == VARYING_SLOT_PSIZ) ||
				(v->outputs[k].slot == VARYING_SLOT_POS))
			continue;

		for (idx = 0; idx < l->cnt; idx++) {
			if (l->var[idx].regid == v->outputs[k].regid)
				break;
			nextloc = MAX2(nextloc, l->var[idx].loc + 4);
		}

		/* add if not already in linkage map: */
		if (idx == l->cnt)
			ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);

		/* expand component-mask if needed, ie streaming out all components
		 * but frag shader doesn't consume all components:
		 */
		if (compmask & ~l->var[idx].compmask) {
			l->var[idx].compmask |= compmask;
			l->max_loc = MAX2(l->max_loc,
				l->var[idx].loc + util_last_bit(l->var[idx].compmask));
		}
	}
}

static void
setup_stream_out(struct fd6_program_state *state, const struct ir3_shader_variant *v,
		struct ir3_shader_linkage *l)
{
	const struct ir3_stream_output_info *strmout = &v->shader->stream_output;

	uint32_t ncomp[PIPE_MAX_SO_BUFFERS];
	uint32_t prog[256/2];
	uint32_t prog_count;

	memset(ncomp, 0, sizeof(ncomp));
	memset(prog, 0, sizeof(prog));

	prog_count = align(l->max_loc, 2) / 2;

	debug_assert(prog_count < ARRAY_SIZE(prog));

	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		const struct ir3_stream_output *out = &strmout->output[i];
		unsigned k = out->register_index;
		unsigned idx;

		ncomp[out->output_buffer] += out->num_components;

		/* linkage map sorted by order frag shader wants things, so
		 * a bit less ideal here..
		 */
		for (idx = 0; idx < l->cnt; idx++)
			if (l->var[idx].regid == v->outputs[k].regid)
				break;

		debug_assert(idx < l->cnt);

		for (unsigned j = 0; j < out->num_components; j++) {
			unsigned c   = j + out->start_component;
			unsigned loc = l->var[idx].loc + c;
			unsigned off = j + out->dst_offset;  /* in dwords */

			if (loc & 1) {
				prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN |
						A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
						A6XX_VPC_SO_PROG_B_OFF(off * 4);
			} else {
				prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN |
						A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
						A6XX_VPC_SO_PROG_A_OFF(off * 4);
			}
		}
	}

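	/* Emit the computed stream-out config into the streamout stateobj:
	 * per-stream enables, per-buffer component counts, and the SO
	 * (register -> buffer offset) program entries:
	 */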
	struct fd_ringbuffer *ring = state->streamout_stateobj;

	OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * prog_count));
	OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL);
	OUT_RING(ring, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(0x1) |
			COND(ncomp[0] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1)) |
			COND(ncomp[1] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1)) |
			COND(ncomp[2] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1)) |
			COND(ncomp[3] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1)));
	OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(0));
	OUT_RING(ring, ncomp[0]);
	OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(1));
	OUT_RING(ring, ncomp[1]);
	OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(2));
	OUT_RING(ring, ncomp[2]);
	OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(3));
	OUT_RING(ring, ncomp[3]);
	OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
	OUT_RING(ring, A6XX_VPC_SO_CNTL_RESET);
	for (unsigned i = 0; i < prog_count; i++) {
		OUT_RING(ring, REG_A6XX_VPC_SO_PROG);
		OUT_RING(ring, prog[i]);
	}
}

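/* Program config state that only depends on the shader variants: per-stage
 * enable bits and constlen, plus texture/sampler/IBO counts.
 */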
static void
setup_config_stateobj(struct fd_ringbuffer *ring, struct fd6_program_state *state)
{
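	/* This appears to invalidate any previously loaded per-stage shader and
	 * IBO state, so the new program's state gets (re)loaded:
	 */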
	OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(
			.vs_state = true,
			.hs_state = true,
			.ds_state = true,
			.gs_state = true,
			.fs_state = true,
			.cs_state = true,
			.gfx_ibo = true,
			.cs_ibo = true,
		));

	debug_assert(state->vs->constlen >= state->bs->constlen);

	OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4);
	OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(state->vs->constlen) |
			A6XX_HLSQ_VS_CNTL_ENABLED);
	OUT_RING(ring, COND(state->hs,
					A6XX_HLSQ_HS_CNTL_ENABLED |
					A6XX_HLSQ_HS_CNTL_CONSTLEN(state->hs->constlen)));
	OUT_RING(ring, COND(state->ds,
					A6XX_HLSQ_DS_CNTL_ENABLED |
					A6XX_HLSQ_DS_CNTL_CONSTLEN(state->ds->constlen)));
	OUT_RING(ring, COND(state->gs,
					A6XX_HLSQ_GS_CNTL_ENABLED |
					A6XX_HLSQ_GS_CNTL_CONSTLEN(state->gs->constlen)));
	OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1);
	OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(state->fs->constlen) |
			A6XX_HLSQ_FS_CNTL_ENABLED);

	OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1);
	OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) |
			A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) |
			A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) |
			A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp));

	OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1);
	OUT_RING(ring, COND(state->hs,
					A6XX_SP_HS_CONFIG_ENABLED |
					A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) |
					A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) |
					A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp)));

	OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1);
	OUT_RING(ring, COND(state->ds,
					A6XX_SP_DS_CONFIG_ENABLED |
					A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) |
					A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) |
					A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp)));

	OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1);
	OUT_RING(ring, COND(state->gs,
					A6XX_SP_GS_CONFIG_ENABLED |
					A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) |
					A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) |
					A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp)));

	OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1);
	OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) |
			A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) |
			A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) |
			A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp));

	OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
	OUT_RING(ring, ir3_shader_nibo(state->fs));
}

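/* regid(63, 0) is the "unused/invalid" register encoding, so only advance to
 * the following register if the base one is actually valid:
 */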
static inline uint32_t
next_regid(uint32_t reg, uint32_t increment)
{
	if (VALIDREG(reg))
		return reg + increment;
	else
		return regid(63,0);
}

static void
setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx,
		struct fd6_program_state *state, const struct ir3_shader_key *key,
		bool binning_pass)
{
	uint32_t pos_regid, psize_regid, color_regid[8], posz_regid;
	uint32_t clip0_regid, clip1_regid;
	uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
	uint32_t smask_in_regid, smask_regid;
	uint32_t stencilref_regid;
	uint32_t vertex_regid, instance_regid, layer_regid, primitive_regid;
	uint32_t hs_invocation_regid;
	uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_patch_regid, ds_patch_regid;
	uint32_t ij_regid[IJ_COUNT];
	uint32_t gs_header_regid;
	enum a3xx_threadsize fssz;
	uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0;
	uint8_t clip0_loc, clip1_loc;
	int i, j;

	static const struct ir3_shader_variant dummy_fs = {0};
	const struct ir3_shader_variant *vs = binning_pass ? state->bs : state->vs;
	const struct ir3_shader_variant *hs = state->hs;
	const struct ir3_shader_variant *ds = state->ds;
	const struct ir3_shader_variant *gs = state->gs;
	const struct ir3_shader_variant *fs = binning_pass ? &dummy_fs : state->fs;

	/* binning VS is wrong when GS is present, so use nonbinning VS
	 * TODO: compile both binning VS/GS variants correctly
	 */
	if (binning_pass && state->gs)
		vs = state->vs;

	bool sample_shading = fs->per_samp | key->sample_shading;

	fssz = FOUR_QUADS;

	pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS);
	psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ);
	clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0);
	clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1);
	vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
	instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);

	if (hs) {
		tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
		tess_coord_y_regid = next_regid(tess_coord_x_regid, 1);
		hs_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID);
		ds_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
		hs_invocation_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);

		pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS);
		psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ);
		clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0);
		clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1);
	} else {
		tess_coord_x_regid = regid(63, 0);
		tess_coord_y_regid = regid(63, 0);
		hs_patch_regid = regid(63, 0);
		ds_patch_regid = regid(63, 0);
		hs_invocation_regid = regid(63, 0);
	}

	if (gs) {
		gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
		primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID);
		pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS);
		psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ);
		clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0);
		clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1);
		layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER);
	} else {
		gs_header_regid = regid(63, 0);
		primitive_regid = regid(63, 0);
		layer_regid = regid(63, 0);
	}

	if (fs->color0_mrt) {
		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
			ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
	} else {
		color_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0);
		color_regid[1] = ir3_find_output_regid(fs, FRAG_RESULT_DATA1);
		color_regid[2] = ir3_find_output_regid(fs, FRAG_RESULT_DATA2);
		color_regid[3] = ir3_find_output_regid(fs, FRAG_RESULT_DATA3);
		color_regid[4] = ir3_find_output_regid(fs, FRAG_RESULT_DATA4);
		color_regid[5] = ir3_find_output_regid(fs, FRAG_RESULT_DATA5);
		color_regid[6] = ir3_find_output_regid(fs, FRAG_RESULT_DATA6);
		color_regid[7] = ir3_find_output_regid(fs, FRAG_RESULT_DATA7);
	}

	samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
	smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
	face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
	coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
	zwcoord_regid   = next_regid(coord_regid, 2);
	posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
	smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
	stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
	for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
		ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);

	/* If we have pre-dispatch texture fetches, then ij_pix should not
	 * be DCE'd, even if not actually used in the shader itself:
	 */
	if (fs->num_sampler_prefetch > 0) {
		assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
		/* also, it seems like ij_pix is *required* to be r0.x */
		assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
	}

	/* we can't write gl_SampleMask for !msaa..  if b0 is zero then we
	 * end up masking the single sample!!
	 */
	if (!key->msaa)
		smask_regid = regid(63, 0);

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

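	/* Pre-dispatch texture fetches: one CMD dword per prefetch instruction,
	 * describing the src/dst registers and the tex/samp to sample from:
	 */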
	OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
	OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
			A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
			0x7000);    // XXX
	for (int i = 0; i < fs->num_sampler_prefetch; i++) {
		const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
		OUT_RING(ring, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
				A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
				A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
				A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
				A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
				COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
				A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
	}

	OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1);
	OUT_RING(ring, 0);

	OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1);
	OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);

	bool fs_has_dual_src_color = !binning_pass &&
		fs->shader->nir->info.fs.color_is_dual_source;

	OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
	OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
			 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
			 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
			 COND(fs_has_dual_src_color,
					A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));

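	/* The VS seemingly needs to run at a smaller threadsize when it feeds
	 * the tessellation stages:
	 */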
	enum a3xx_threadsize vssz;
	if (ds || hs) {
		vssz = TWO_QUADS;
	} else {
		vssz = FOUR_QUADS;
	}

	OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1);
	OUT_RING(ring, A6XX_SP_VS_CTRL_REG0_THREADSIZE(vssz) |
			A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) |
			A6XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vs->info.max_half_reg + 1) |
			COND(vs->mergedregs, A6XX_SP_VS_CTRL_REG0_MERGEDREGS) |
			A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack) |
			COND(vs->need_pixlod, A6XX_SP_VS_CTRL_REG0_PIXLODENABLE));

	fd6_emit_shader(ctx, ring, vs);
	fd6_emit_immediates(ctx->screen, vs, ring);

	struct ir3_shader_linkage l = {0};
	const struct ir3_shader_variant *last_shader = fd6_last_shader(state);

	bool do_streamout = (last_shader->shader->stream_output.num_outputs > 0);
	uint8_t clip_mask = last_shader->clip_mask, cull_mask = last_shader->cull_mask;
	uint8_t clip_cull_mask = clip_mask | cull_mask;

	/* If we have streamout, link against the real FS, rather than the
	 * dummy FS used for binning pass state, to ensure the OUTLOC's
	 * match.  Depending on whether we end up doing sysmem or gmem,
	 * the actual streamout could happen with either the binning pass
	 * or draw pass program, but the same streamout stateobj is used
	 * in either case:
	 */
	ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true);

	bool primid_passthru = l.primid_loc != 0xff;
	clip0_loc = l.clip0_loc;
	clip1_loc = l.clip1_loc;

	OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
	OUT_RING(ring, ~l.varmask[0]);  /* VPC_VAR[0].DISABLE */
	OUT_RING(ring, ~l.varmask[1]);  /* VPC_VAR[1].DISABLE */
	OUT_RING(ring, ~l.varmask[2]);  /* VPC_VAR[2].DISABLE */
	OUT_RING(ring, ~l.varmask[3]);  /* VPC_VAR[3].DISABLE */

	/* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */
	link_stream_out(&l, last_shader);

	if (VALIDREG(layer_regid)) {
		layer_loc = l.max_loc;
		ir3_link_add(&l, layer_regid, 0x1, l.max_loc);
	}

	if (VALIDREG(pos_regid)) {
		pos_loc = l.max_loc;
		ir3_link_add(&l, pos_regid, 0xf, l.max_loc);
	}

	if (VALIDREG(psize_regid)) {
		psize_loc = l.max_loc;
		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
	}

	/* Handle the case where clip/cull distances aren't read by the FS. Make
	 * sure to avoid adding an output with an empty writemask if the user
	 * disables all the clip distances in the API so that the slot is unused.
	 */
	if (clip0_loc == 0xff && VALIDREG(clip0_regid) && (clip_cull_mask & 0xf) != 0) {
		clip0_loc = l.max_loc;
		ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc);
	}

	if (clip1_loc == 0xff && VALIDREG(clip1_regid) && (clip_cull_mask >> 4) != 0) {
		clip1_loc = l.max_loc;
		ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc);
	}

	/* If we have stream-out, we use the full shader for binning
	 * pass, rather than the optimized binning pass one, so that we
	 * have all the varying outputs available for xfb.  So streamout
	 * state should always be derived from the non-binning pass
	 * program:
	 */
	if (do_streamout && !binning_pass) {
		setup_stream_out(state, last_shader, &l);
	}

	debug_assert(l.cnt < 32);
	if (gs)
		OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2));
	else if (ds)
		OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2));
	else
		OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2));

	for (j = 0; j < l.cnt; ) {
		uint32_t reg = 0;

		reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
		reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
		j++;

		reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
		reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
		j++;

		OUT_RING(ring, reg);
	}

	if (gs)
		OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4));
	else if (ds)
		OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4));
	else
		OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4));

	for (j = 0; j < l.cnt; ) {
		uint32_t reg = 0;

		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc);
		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc);
		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc);
		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc);

		OUT_RING(ring, reg);
	}

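	/* Tessellation enabled: emit HS/DS shader setup and link maps, plus the
	 * fixed-function tessellator parameters (vertices out, input patch size,
	 * spacing and output primitive):
	 */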
	if (hs) {
		OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1);
		OUT_RING(ring, A6XX_SP_HS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A6XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(hs->info.max_reg + 1) |
			A6XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT(hs->info.max_half_reg + 1) |
			COND(hs->mergedregs, A6XX_SP_HS_CTRL_REG0_MERGEDREGS) |
			A6XX_SP_HS_CTRL_REG0_BRANCHSTACK(hs->branchstack) |
			COND(hs->need_pixlod, A6XX_SP_HS_CTRL_REG0_PIXLODENABLE));

		fd6_emit_shader(ctx, ring, hs);
		fd6_emit_immediates(ctx->screen, hs, ring);
		fd6_emit_link_map(ctx->screen, vs, hs, ring);

		OUT_PKT4(ring, REG_A6XX_SP_DS_CTRL_REG0, 1);
		OUT_RING(ring, A6XX_SP_DS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A6XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(ds->info.max_reg + 1) |
			A6XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT(ds->info.max_half_reg + 1) |
			COND(ds->mergedregs, A6XX_SP_DS_CTRL_REG0_MERGEDREGS) |
			A6XX_SP_DS_CTRL_REG0_BRANCHSTACK(ds->branchstack) |
			COND(ds->need_pixlod, A6XX_SP_DS_CTRL_REG0_PIXLODENABLE));

		fd6_emit_shader(ctx, ring, ds);
		fd6_emit_immediates(ctx->screen, ds, ring);
		fd6_emit_link_map(ctx->screen, hs, ds, ring);

		shader_info *hs_info = &hs->shader->nir->info;
		OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
		OUT_RING(ring, hs_info->tess.tcs_vertices_out);

		/* Total attribute slots in HS incoming patch. */
		OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
		OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4);

		OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
		OUT_RING(ring, vs->output_size);

		shader_info *ds_info = &ds->shader->nir->info;
		OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
		uint32_t output;
		if (ds_info->tess.point_mode)
			output = TESS_POINTS;
		else if (ds_info->tess.primitive_mode == GL_ISOLINES)
			output = TESS_LINES;
		else if (ds_info->tess.ccw)
			output = TESS_CCW_TRIS;
		else
			output = TESS_CW_TRIS;

		OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING(fd6_gl2spacing(ds_info->tess.spacing)) |
				A6XX_PC_TESS_CNTL_OUTPUT(output));

		OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1);
		OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
				A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
				A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

		OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1);
		OUT_RING(ring, 0x0000ffff);

		OUT_PKT4(ring, REG_A6XX_GRAS_DS_LAYER_CNTL, 1);
		OUT_RING(ring, 0x0);

		OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1);
		OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) |
				A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask));

		OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1);
		OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) |
				A6XX_VPC_VS_PACK_PSIZELOC(255) |
				A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc));

		OUT_PKT4(ring, REG_A6XX_VPC_DS_PACK, 1);
		OUT_RING(ring, A6XX_VPC_DS_PACK_POSITIONLOC(pos_loc) |
				A6XX_VPC_DS_PACK_PSIZELOC(psize_loc) |
				A6XX_VPC_DS_PACK_STRIDE_IN_VPC(l.max_loc));

		OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1);
		OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_OUT(l.cnt));

		OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1);
		OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
				CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) |
				A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask));

	} else {
		OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
		OUT_RING(ring, 0);
	}

	OUT_PKT4(ring, REG_A6XX_SP_VS_PRIMITIVE_CNTL, 1);
	OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(l.cnt));

	bool enable_varyings = fs->total_in > 0;

	OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1);
	OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
			 COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) |
			 A6XX_VPC_CNTL_0_PRIMIDLOC(l.primid_loc) |
			 A6XX_VPC_CNTL_0_VIEWIDLOC(0xff));

	OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1);
	OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
			CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
			A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));

	OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1);
	OUT_RING(ring, 0);

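	/* Wire up the FS sysval input registers (face, sample id/mask, frag
	 * coord, and the barycentric ij coefficients):
	 */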
	OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
	OUT_RING(ring, 0x7);                /* XXX */
	OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
			 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
			 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
			 A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE]));
	OUT_RING(ring,
			 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
			 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
			 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
			 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
	OUT_RING(ring, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
			 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
			 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
			 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
	OUT_RING(ring, 0xfc);              /* XXX */

	OUT_PKT4(ring, REG_A6XX_HLSQ_UNKNOWN_B980, 1);
	OUT_RING(ring, enable_varyings ? 3 : 1);

	OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1);
	OUT_RING(ring, A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
			COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) |
			0x1000000 |
			A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) |
			A6XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fs->info.max_half_reg + 1) |
			COND(fs->mergedregs, A6XX_SP_FS_CTRL_REG0_MERGEDREGS) |
			A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(fs->branchstack) |
			COND(fs->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE));

	OUT_PKT4(ring, REG_A6XX_VPC_VS_LAYER_CNTL, 1);
	OUT_RING(ring, 0x0000ffff);        /* XXX */

	bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
	bool need_size_persamp = false;
	if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) {
		if (sample_shading)
			need_size_persamp = true;
		else
			need_size = true;
	}
	if (VALIDREG(ij_regid[IJ_LINEAR_PIXEL]))
		need_size = true;

	/* XXX: enable bits for linear centroid and linear sample bary */

	OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1);
	OUT_RING(ring,
			CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
			CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
			CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
			COND(need_size, A6XX_GRAS_CNTL_SIZE) |
			COND(need_size_persamp, A6XX_GRAS_CNTL_SIZE_PERSAMP) |
			COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));

	OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2);
	OUT_RING(ring,
			CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
			CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
			CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
			COND(need_size, A6XX_RB_RENDER_CONTROL0_SIZE) |
			COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
			COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) |
			COND(fs->fragcoord_compmask != 0,
					A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));

	OUT_RING(ring,
			CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
			CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
			CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) |
			COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));

	OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1);
	OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));

	OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8101, 1);
	OUT_RING(ring, COND(sample_shading, 0x6));  // XXX

	OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
	OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));

	OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
	for (i = 0; i < 8; i++) {
		OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) |
				COND(color_regid[i] & HALF_REG_ID, A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
	}

	OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1);
	OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) |
			 A6XX_VPC_VS_PACK_PSIZELOC(psize_loc) |
			 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc));

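	/* Geometry shader: shader setup, link map from the previous stage, GS
	 * output packing, and primitive/invocation counts:
	 */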
	if (gs) {
		OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1);
		OUT_RING(ring, A6XX_SP_GS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
			A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) |
			A6XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT(gs->info.max_half_reg + 1) |
			COND(gs->mergedregs, A6XX_SP_GS_CTRL_REG0_MERGEDREGS) |
			A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(gs->branchstack) |
			COND(gs->need_pixlod, A6XX_SP_GS_CTRL_REG0_PIXLODENABLE));

		fd6_emit_shader(ctx, ring, gs);
		fd6_emit_immediates(ctx->screen, gs, ring);
		if (ds)
			fd6_emit_link_map(ctx->screen, ds, gs, ring);
		else
			fd6_emit_link_map(ctx->screen, vs, gs, ring);

		OUT_PKT4(ring, REG_A6XX_VPC_GS_PACK, 1);
		OUT_RING(ring, A6XX_VPC_GS_PACK_POSITIONLOC(pos_loc) |
				 A6XX_VPC_GS_PACK_PSIZELOC(psize_loc) |
				 A6XX_VPC_GS_PACK_STRIDE_IN_VPC(l.max_loc));

		OUT_PKT4(ring, REG_A6XX_VPC_GS_LAYER_CNTL, 1);
		OUT_RING(ring, A6XX_VPC_GS_LAYER_CNTL_LAYERLOC(layer_loc) | 0xff00);

		OUT_PKT4(ring, REG_A6XX_GRAS_GS_LAYER_CNTL, 1);
		OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER));

		uint32_t flags_regid = ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3);

		OUT_PKT4(ring, REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1);
		OUT_RING(ring, A6XX_SP_GS_PRIMITIVE_CNTL_OUT(l.cnt) |
				A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));

		OUT_PKT4(ring, REG_A6XX_PC_GS_OUT_CNTL, 1);
		OUT_RING(ring, A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
				CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) |
				CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) |
				CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) |
				A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask));

		uint32_t output;
		switch (gs->shader->nir->info.gs.output_primitive) {
		case GL_POINTS:
			output = TESS_POINTS;
			break;
		case GL_LINE_STRIP:
			output = TESS_LINES;
			break;
		case GL_TRIANGLE_STRIP:
			output = TESS_CW_TRIS;
			break;
		default:
			unreachable("");
		}
		OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
		OUT_RING(ring,
				A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(gs->shader->nir->info.gs.vertices_out - 1) |
				A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
				A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(gs->shader->nir->info.gs.invocations - 1));

		OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1);
		OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) |
				A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask));

		OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9100, 1);
		OUT_RING(ring, 0xff);

		OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1);
		OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
				A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
				A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

		const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs;

		/* Size of per-primitive allocation in ldlw memory in vec4s. */
		uint32_t vec4_size =
			gs->shader->nir->info.gs.vertices_in *
			DIV_ROUND_UP(prev->output_size, 4);

		OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
		OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));

		OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1);
		OUT_RING(ring, prev->output_size);
	} else {
		OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
		OUT_RING(ring, 0);
		OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1);
		OUT_RING(ring, 0);
	}

	OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1);
	OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
				   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
				   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

	OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1);
	OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) |
				   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask));

	OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1);
	OUT_RING(ring, 0);

	if (fs->instrlen)
		fd6_emit_shader(ctx, ring, fs);

	OUT_REG(ring, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));

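	/* The VFD only fetches/decodes the real vertex attributes; sysval inputs
	 * (vertex/instance id, tess/GS related regs) are wired up via
	 * VFD_CONTROL_1..6 below:
	 */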
	uint32_t non_sysval_input_count = 0;
	for (uint32_t i = 0; i < vs->inputs_count; i++)
		if (!vs->inputs[i].sysval)
			non_sysval_input_count++;

	OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1);
	OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(non_sysval_input_count) |
			A6XX_VFD_CONTROL_0_DECODE_CNT(non_sysval_input_count));

	OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), non_sysval_input_count);
	for (uint32_t i = 0; i < non_sysval_input_count; i++) {
		assert(vs->inputs[i].compmask);
		OUT_RING(ring, A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) |
				A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid));
	}

	OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
	OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
			A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) |
			A6XX_VFD_CONTROL_1_REGID4PRIMID(primitive_regid) |
			0xfc000000);
	OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSPATCHID(hs_patch_regid) |
			A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
	OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSPATCHID(ds_patch_regid) |
			A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
			A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
			0xfc);
	OUT_RING(ring, 0x000000fc);   /* VFD_CONTROL_4 */
	OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) |
			0xfc00);   /* VFD_CONTROL_5 */
	OUT_RING(ring,
			 COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU));   /* VFD_CONTROL_6 */

	if (!binning_pass)
		fd6_emit_immediates(ctx->screen, fs, ring);
}

static void emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs,
		bool rasterflat, bool sprite_origin_upper_left, bool fb_inverted, uint32_t sprite_coord_enable);

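/* Pre-bake the default interpolation state (no rasterflat and no sprite
 * coord replacement), which fd6_program_interp_state() can return directly
 * in the common case:
 */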
static struct fd_ringbuffer *
create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state)
{
	struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4);

	emit_interp_state(ring, state->fs, false, false, false, 0);

	return ring;
}

/* build the program streaming state which is not part of the pre-
 * baked stateobj because of dependency on other gl state (rasterflat
 * or sprite-coord-replacement)
 */
struct fd_ringbuffer *
fd6_program_interp_state(struct fd6_emit *emit)
{
	const struct fd6_program_state *state = fd6_emit_get_prog(emit);

	if (!unlikely(emit->rasterflat || emit->sprite_coord_enable ||
			emit->sprite_origin_upper_left)) {
		/* fastpath: */
		return fd_ringbuffer_ref(state->interp_stateobj);
	} else {
		struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
				emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING);

		emit_interp_state(ring, state->fs, emit->rasterflat,
				emit->sprite_origin_upper_left, emit->fb_inverted, emit->sprite_coord_enable);

		return ring;
	}
}

static void
emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs,
		bool rasterflat, bool sprite_origin_upper_left, bool fb_inverted,
		uint32_t sprite_coord_enable)
{
	uint32_t vinterp[8], vpsrepl[8];

	memset(vinterp, 0, sizeof(vinterp));
	memset(vpsrepl, 0, sizeof(vpsrepl));

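	/* vinterp holds two bits per varying component (interpolation mode, set
	 * below for flat-shaded inputs); vpsrepl holds two bits per component
	 * for point-sprite coord replacement:
	 */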
	for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count; ) {

		/* NOTE: varyings are packed, so if compmask is 0xb
		 * then first, third, and fourth component occupy
		 * three consecutive varying slots:
		 */
		unsigned compmask = fs->inputs[j].compmask;

		uint32_t inloc = fs->inputs[j].inloc;

		if (fs->inputs[j].flat ||
				(fs->inputs[j].rasterflat && rasterflat)) {
			uint32_t loc = inloc;

			for (int i = 0; i < 4; i++) {
				if (compmask & (1 << i)) {
					vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
					loc++;
				}
			}
		}

		if (util_varying_is_point_coord(fs->inputs[j].slot, sprite_coord_enable)) {
			bool upper_left = fs->inputs[j].slot == VARYING_SLOT_PNTC ?
				(sprite_origin_upper_left ^ fb_inverted) : sprite_origin_upper_left;

			/* mask is two 2-bit fields, where:
			 *   '01' -> S
			 *   '10' -> T
			 *   '11' -> 1 - T  (flip mode)
			 */
			unsigned mask = upper_left ? 0b1001