From 7609f83c70234725b7d4f2a618f82c197e09e4c6 Mon Sep 17 00:00:00 2001 From: Amber Date: Tue, 28 Feb 2023 14:14:35 +0100 Subject: ir3, freedreno: implement GL_ARB_shader_draw_parameters Part-of: --- src/freedreno/ir3/ir3_compiler.c | 4 +++ src/freedreno/ir3/ir3_compiler.h | 3 ++ src/freedreno/ir3/ir3_compiler_nir.c | 6 ++++ src/freedreno/ir3/ir3_context.h | 2 +- src/freedreno/ir3/ir3_nir.c | 4 +++ src/freedreno/ir3/ir3_shader.h | 7 ++-- src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc | 15 +++++++- src/gallium/drivers/freedreno/a6xx/fd6_const.cc | 6 ++-- src/gallium/drivers/freedreno/a6xx/fd6_draw.cc | 44 +++++++++++++++++------ src/gallium/drivers/freedreno/a6xx/fd6_emit.h | 1 + src/gallium/drivers/freedreno/freedreno_screen.c | 2 ++ src/gallium/drivers/freedreno/ir3/ir3_const.h | 8 +++-- src/gallium/drivers/freedreno/ir3/ir3_gallium.c | 4 +++ 13 files changed, 84 insertions(+), 22 deletions(-) diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 4636a7398c7..b55ca64ab39 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -285,6 +285,10 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->nir_options.force_indirect_unrolling = nir_var_all; } + if (options->lower_base_vertex) { + compiler->nir_options.lower_base_vertex = true; + } + /* 16-bit ALU op generation is mostly controlled by frontend compiler options, but * this core NIR option enables some optimizations of 16-bit operations. */ diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 4abd574ff80..daf966a993a 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -67,6 +67,9 @@ struct ir3_compiler_options { /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */ bool storage_16bit; + + /* If base_vertex should be lowered in nir */ + bool lower_base_vertex; }; struct ir3_compiler { diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index f8316f417a4..c457d4db575 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -2204,6 +2204,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) } dst[0] = ctx->basevertex; break; + case nir_intrinsic_load_is_indexed_draw: + if (!ctx->is_indexed_draw) { + ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_IS_INDEXED_DRAW); + } + dst[0] = ctx->is_indexed_draw; + break; case nir_intrinsic_load_draw_id: if (!ctx->draw_id) { ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID); diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h index 0085d8abdbf..22c7b1b3cdf 100644 --- a/src/freedreno/ir3/ir3_context.h +++ b/src/freedreno/ir3/ir3_context.h @@ -85,7 +85,7 @@ struct ir3_context { /* For vertex shaders, keep track of the system values sources */ struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance, - *draw_id, *view_index; + *draw_id, *view_index, *is_indexed_draw; /* For fragment shaders: */ struct ir3_instruction *samp_id, *samp_mask_in; diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index b1cf668421e..da842e01c5c 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -864,6 +864,10 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st layout->num_driver_params = MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1); break; + case nir_intrinsic_load_is_indexed_draw: + layout->num_driver_params = + MAX2(layout->num_driver_params, IR3_DP_IS_INDEXED_DRAW + 1); + break; case nir_intrinsic_load_base_instance: layout->num_driver_params = MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1); diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 25125332e7c..6c921732c1a 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -69,11 +69,12 @@ enum ir3_driver_param { IR3_DP_VTXID_BASE = 1, IR3_DP_INSTID_BASE = 2, IR3_DP_VTXCNT_MAX = 3, + IR3_DP_IS_INDEXED_DRAW = 4, /* Note: boolean, ie. 0 or ~0 */ /* user-clip-plane components, up to 8x vec4's: */ - IR3_DP_UCP0_X = 4, + IR3_DP_UCP0_X = 5, /* .... */ - IR3_DP_UCP7_W = 35, - IR3_DP_VS_COUNT = 36, /* must be aligned to vec4 */ + IR3_DP_UCP7_W = 36, + IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */ /* TCS driver params: */ IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0, diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc b/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc index bc13f7fdf27..9a3d92949fb 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc @@ -186,12 +186,25 @@ fd6_memory_barrier(struct pipe_context *pctx, unsigned flags) if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_IMAGE | - PIPE_BARRIER_INDIRECT_BUFFER | PIPE_BARRIER_UPDATE_BUFFER | PIPE_BARRIER_UPDATE_TEXTURE)) { flushes |= FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE; } + if (flags & PIPE_BARRIER_INDIRECT_BUFFER) { + flushes |= FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE; + + /* Various firmware bugs/inconsistencies mean that some indirect draw opcodes + * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if + * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's + * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's + * before draw opcodes that don't need it. + */ + if (fd_context(pctx)->screen->info->a6xx.indirect_draw_wfm_quirk) { + flushes |= FD6_WAIT_FOR_ME; + } + } + if (flags & PIPE_BARRIER_FRAMEBUFFER) { fd6_texture_barrier(pctx, PIPE_TEXTURE_BARRIER_FRAMEBUFFER); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc index ba0acf4205e..61d19428d83 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc @@ -300,12 +300,12 @@ fd6_build_driver_params(struct fd6_emit *emit) if (emit->vs->need_driver_params) { ir3_emit_driver_params(emit->vs, dpconstobj, ctx, emit->info, - emit->indirect, emit->draw); + emit->indirect, emit->draw, emit->draw_id); } if (emit->gs && emit->gs->need_driver_params) { ir3_emit_driver_params(emit->gs, dpconstobj, ctx, emit->info, - emit->indirect, emit->draw); + emit->indirect, emit->draw, 0); } if (emit->hs && emit->hs->need_driver_params) { @@ -314,7 +314,7 @@ fd6_build_driver_params(struct fd6_emit *emit) if (emit->ds && emit->ds->need_driver_params) { ir3_emit_driver_params(emit->ds, dpconstobj, ctx, emit->info, - emit->indirect, emit->draw); + emit->indirect, emit->draw, 0); } fd6_ctx->has_dp_state = true; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc b/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc index 165d0fc0955..015cde6f3f2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc @@ -74,26 +74,39 @@ draw_emit_xfb(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, } static void -draw_emit_indirect(struct fd_ringbuffer *ring, +draw_emit_indirect(struct fd_context *ctx, + struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, - unsigned index_offset) + unsigned index_offset, uint32_t driver_param) { struct fd_resource *ind = fd_resource(indirect->buffer); if (info->index_size) { + OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 9); + OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); + OUT_RING(ring, + (A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) + | A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param))); struct pipe_resource *idx = info->index.resource; unsigned max_indices = (idx->width0 - index_offset) / info->index_size; - - OUT_PKT(ring, CP_DRAW_INDX_INDIRECT, pack_CP_DRAW_INDX_OFFSET_0(*draw0), - A5XX_CP_DRAW_INDX_INDIRECT_INDX_BASE(fd_resource(idx)->bo, - index_offset), - A5XX_CP_DRAW_INDX_INDIRECT_3(.max_indices = max_indices), - A5XX_CP_DRAW_INDX_INDIRECT_INDIRECT(ind->bo, indirect->offset)); + OUT_RING(ring, indirect->draw_count); + //index va + OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0); + //max indices + OUT_RING(ring, max_indices); + OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); + OUT_RING(ring, indirect->stride); } else { - OUT_PKT(ring, CP_DRAW_INDIRECT, pack_CP_DRAW_INDX_OFFSET_0(*draw0), - A5XX_CP_DRAW_INDIRECT_INDIRECT(ind->bo, indirect->offset)); + OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 6); + OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); + OUT_RING(ring, + (A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) + | A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param))); + OUT_RING(ring, indirect->draw_count); + OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); + OUT_RING(ring, indirect->stride); } } @@ -228,6 +241,7 @@ fd6_draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, emit.state.num_groups = 0; emit.streamout_mask = 0; emit.prog = NULL; + emit.draw_id = 0; if (!(ctx->prog.vs && ctx->prog.fs)) return; @@ -365,7 +379,14 @@ fd6_draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, if (indirect->count_from_stream_output) { draw_emit_xfb(ring, &draw0, info, indirect); } else { - draw_emit_indirect(ring, &draw0, info, indirect, index_offset); + const struct ir3_const_state *const_state = ir3_const_state(emit.vs); + uint32_t dst_offset_dp = const_state->offsets.driver_param; + + /* If unused, pass 0 for DST_OFF: */ + if (dst_offset_dp > emit.vs->constlen) + dst_offset_dp = 0; + + draw_emit_indirect(ctx, ring, &draw0, info, indirect, index_offset, dst_offset_dp); } } else { draw_emit(ring, &draw0, info, &draws[0], index_offset); @@ -401,6 +422,7 @@ fd6_draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, if (emit.dirty_groups) { emit.state.num_groups = 0; emit.draw = &draws[i]; + emit.draw_id = info->increment_draw_id ? i : 0; fd6_emit_3d_state(ring, &emit); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 67572374fcc..939f7bb3af2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -183,6 +183,7 @@ struct fd6_emit { bool rasterflat : 1; bool primitive_restart : 1; uint8_t streamout_mask; + uint32_t draw_id; /* cached to avoid repeated lookups: */ const struct fd6_program_state *prog; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 1dfb78284bf..f5081a3c10a 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -213,6 +213,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_DRAW_PARAMETERS: return is_a6xx(screen); case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index 21a4ab0232b..40e86544994 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -454,17 +454,19 @@ ir3_emit_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *draw) assert_dt + const struct pipe_draw_start_count_bias *draw, + const uint32_t draw_id) assert_dt { assert(v->need_driver_params); const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t offset = const_state->offsets.driver_param; uint32_t vertex_params[IR3_DP_VS_COUNT] = { - [IR3_DP_DRAWID] = 0, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */ + [IR3_DP_DRAWID] = draw_id, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */ [IR3_DP_VTXID_BASE] = info->index_size ? draw->index_bias : draw->start, [IR3_DP_INSTID_BASE] = info->start_instance, [IR3_DP_VTXCNT_MAX] = ctx->streamout.max_tf_vtx, + [IR3_DP_IS_INDEXED_DRAW] = info->index_size != 0 ? ~0 : 0, }; if (v->key.ucp_enables) { struct pipe_clip_state *ucp = &ctx->ucp; @@ -573,7 +575,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, /* emit driver params every time: */ if (info && v->need_driver_params) { ring_wfi(ctx->batch, ring); - ir3_emit_driver_params(v, ring, ctx, info, indirect, draw); + ir3_emit_driver_params(v, ring, ctx, info, indirect, draw, 0); } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 2ff88272556..ab677e79a6b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -570,6 +570,10 @@ ir3_screen_init(struct pipe_screen *pscreen) .bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET + IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts, }; + + if (screen->gen >= 6) { + options.lower_base_vertex = true; + } screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id, &options); /* TODO do we want to limit things to # of fast cores, or just limit -- cgit v1.2.1