radv: yuuuge si_emit_cp_dma overhead
I was doing some quick profiling to check out the latest gallium and drawoverhead changes when I found this: running drawoverhead -test 7 triggers a basic draw loop where a vertex buffer is updated between draws.
RadeonSI calls si_emit_cp_dma once every N multidraws, when it runs out of CS space:
Thread 14 "drawover:gdrv0" hit Breakpoint 1, si_emit_cp_dma (sctx=sctx@entry=0x44f4d0, cs=cs@entry=0x44f930, dst_va=dst_va@entry=0, src_va=src_va@entry=0, size=size@entry=0, flags=flags@entry=1, cache_policy=L2_BYPASS) at ../src/gallium/drivers/radeonsi/si_cp_dma.c:65
65 if (sctx->chip_class >= GFX9)
(gdb) bt
#0 si_emit_cp_dma (sctx=sctx@entry=0x44f4d0, cs=cs@entry=0x44f930, dst_va=dst_va@entry=0, src_va=src_va@entry=0, size=size@entry=0, flags=flags@entry=1, cache_policy=L2_BYPASS) at ../src/gallium/drivers/radeonsi/si_cp_dma.c:65
#1 0x00007ffff6818764 in si_cp_dma_wait_for_idle (sctx=sctx@entry=0x44f4d0, cs=cs@entry=0x44f930) at ../src/gallium/drivers/radeonsi/si_cp_dma.c:141
#2 0x00007ffff68269ab in si_flush_gfx_cs (ctx=0x44f4d0, flags=2147483656, fence=0x0) at ../src/gallium/drivers/radeonsi/si_gfx_cs.c:142
#3 0x00007ffff6a34942 in si_draw_vbo<(chip_class)12, (si_has_tess)0, (si_has_gs)0, (si_has_ngg)1, (si_has_prim_discard_cs)0> (ctx=0x44f4d0, info=0x5cee80, drawid_offset=0, indirect=0x0, draws=0x7fffcd3f8ce0, num_draws=1) at ../src/gallium/drivers/radeonsi/si_state_draw.cpp:1760
#4 0x00007ffff66c4b06 in tc_call_draw_single (pipe=<optimized out>, call=0x5cee78, last_ptr=<optimized out>) at ../src/gallium/auxiliary/util/u_threaded_context.c:3006
#5 0x00007ffff66c47e2 in tc_batch_execute (job=job@entry=0x5cd5d8, gdata=gdata@entry=0x0, thread_index=thread_index@entry=0) at ../src/gallium/auxiliary/util/u_threaded_context.c:190
#6 0x00007ffff6194f7c in util_queue_thread_func (input=input@entry=0x545120) at ../src/util/u_queue.c:313
#7 0x00007ffff6194a37 in impl_thrd_routine (p=<optimized out>) at ../include/c11/threads_posix.h:87
#8 0x00007ffff741a299 in start_thread () from /lib64/libpthread.so.0
#9 0x00007ffff786d353 in clone () from /lib64/libc.so.6
RADV calls si_emit_cp_dma 3 times for every multidraw (once via radv_before_draw and twice via radv_after_draw, all through radv_emit_prefetch_L2):
Thread 10 "drawover:gdrv0" hit Breakpoint 2, si_emit_cp_dma (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, dst_va=18446603340516450304, src_va=18446603340516450304, size=480, flags=flags@entry=4) at ../src/amd/vulkan/si_cmd_buffer.c:1524
1524 {
(gdb) bt
#0 si_emit_cp_dma (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, dst_va=18446603340516450304, src_va=18446603340516450304, size=480, flags=flags@entry=4) at ../src/amd/vulkan/si_cmd_buffer.c:1524
#1 0x00007fffef431cf6 in si_cp_dma_prefetch (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, va=<optimized out>, size=<optimized out>) at ../src/amd/vulkan/si_cmd_buffer.c:1606
#2 0x00007fffef3a2bb8 in radv_emit_shader_prefetch (shader=<optimized out>, cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:1030
#3 radv_emit_shader_prefetch (shader=<optimized out>, cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:1021
#4 radv_emit_prefetch_L2 (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, pipeline=0x7fffc8011360, vertex_stage_only=vertex_stage_only@entry=true) at ../src/amd/vulkan/radv_cmd_buffer.c:1047
#5 0x00007fffef3ac955 in radv_before_draw (info=0x7fffdddf9a20, cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:5684
#6 radv_CmdDrawMultiIndexedEXT (commandBuffer=0x7fffc885c9b0, drawCount=1, pIndexInfo=0x7fffdddf9ce0, instanceCount=<optimized out>, firstInstance=<optimized out>, stride=12, pVertexOffset=0x7fffdddf9ce8) at ../src/amd/vulkan/radv_cmd_buffer.c:5815
#7 0x00007ffff69406eb in draw_indexed<(zink_multidraw)1> (needs_drawid=false, draw_id=<optimized out>, num_draws=1, draws=0x7fffdddf9ce0, dinfo=0x1048ae0, ctx=0x44f500) at ../src/gallium/drivers/zink/zink_draw.cpp:288
#8 zink_draw_vbo<(zink_multidraw)1, (zink_dynamic_state)1, (zink_dynamic_state2)1, (zink_dynamic_vertex_input)0, false, false, (zink_drawid)0, (zink_basevertex)0> (pctx=0x44f500, dinfo=0x1048ae0, drawid_offset=<optimized out>, dindirect=0x0, draws=<optimized out>, num_draws=<optimized out>) at ../src/gallium/drivers/zink/zink_draw.cpp:730
#9 0x00007ffff66c4b06 in tc_call_draw_single (pipe=<optimized out>, call=0x1048ad8, last_ptr=<optimized out>) at ../src/gallium/auxiliary/util/u_threaded_context.c:3006
#10 0x00007ffff66c47e2 in tc_batch_execute (job=job@entry=0x1045e60, gdata=gdata@entry=0x0, thread_index=thread_index@entry=0) at ../src/gallium/auxiliary/util/u_threaded_context.c:190
#11 0x00007ffff6194f7c in util_queue_thread_func (input=input@entry=0x828d00) at ../src/util/u_queue.c:313
#12 0x00007ffff6194a37 in impl_thrd_routine (p=<optimized out>) at ../include/c11/threads_posix.h:87
#13 0x00007ffff741a299 in start_thread () from /lib64/libpthread.so.0
#14 0x00007ffff786d353 in clone () from /lib64/libc.so.6
(gdb) c
Continuing.
Thread 10 "drawover:gdrv0" hit Breakpoint 2, si_emit_cp_dma (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, dst_va=18446603336226552864, src_va=18446603336226552864, size=32, flags=flags@entry=4) at ../src/amd/vulkan/si_cmd_buffer.c:1524
1524 {
(gdb) bt
#0 si_emit_cp_dma (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, dst_va=18446603336226552864, src_va=18446603336226552864, size=32, flags=flags@entry=4) at ../src/amd/vulkan/si_cmd_buffer.c:1524
#1 0x00007fffef431cf6 in si_cp_dma_prefetch (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, va=<optimized out>, size=<optimized out>) at ../src/amd/vulkan/si_cmd_buffer.c:1606
#2 0x00007fffef3a2ce8 in radv_emit_prefetch_L2 (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, pipeline=0x7fffc8011360, vertex_stage_only=vertex_stage_only@entry=false) at ../src/amd/vulkan/radv_cmd_buffer.c:1050
#3 0x00007fffef3a5153 in radv_after_draw (cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:5718
#4 0x00007ffff69406eb in draw_indexed<(zink_multidraw)1> (needs_drawid=false, draw_id=<optimized out>, num_draws=1, draws=0x7fffdddf9ce0, dinfo=0x1048ae0, ctx=0x44f500) at ../src/gallium/drivers/zink/zink_draw.cpp:288
#5 zink_draw_vbo<(zink_multidraw)1, (zink_dynamic_state)1, (zink_dynamic_state2)1, (zink_dynamic_vertex_input)0, false, false, (zink_drawid)0, (zink_basevertex)0> (pctx=0x44f500, dinfo=0x1048ae0, drawid_offset=<optimized out>, dindirect=0x0, draws=<optimized out>, num_draws=<optimized out>) at ../src/gallium/drivers/zink/zink_draw.cpp:730
#6 0x00007ffff66c4b06 in tc_call_draw_single (pipe=<optimized out>, call=0x1048ad8, last_ptr=<optimized out>) at ../src/gallium/auxiliary/util/u_threaded_context.c:3006
#7 0x00007ffff66c47e2 in tc_batch_execute (job=job@entry=0x1045e60, gdata=gdata@entry=0x0, thread_index=thread_index@entry=0) at ../src/gallium/auxiliary/util/u_threaded_context.c:190
#8 0x00007ffff6194f7c in util_queue_thread_func (input=input@entry=0x828d00) at ../src/util/u_queue.c:313
#9 0x00007ffff6194a37 in impl_thrd_routine (p=<optimized out>) at ../include/c11/threads_posix.h:87
#10 0x00007ffff741a299 in start_thread () from /lib64/libpthread.so.0
#11 0x00007ffff786d353 in clone () from /lib64/libc.so.6
(gdb) c
Continuing.
Thread 10 "drawover:gdrv0" hit Breakpoint 2, si_emit_cp_dma (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, dst_va=18446603340516450816, src_va=18446603340516450816, size=1248, flags=flags@entry=4) at ../src/amd/vulkan/si_cmd_buffer.c:1524
1524 {
(gdb) bt
#0 si_emit_cp_dma (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, dst_va=18446603340516450816, src_va=18446603340516450816, size=1248, flags=flags@entry=4) at ../src/amd/vulkan/si_cmd_buffer.c:1524
#1 0x00007fffef431cf6 in si_cp_dma_prefetch (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, va=<optimized out>, size=<optimized out>) at ../src/amd/vulkan/si_cmd_buffer.c:1606
#2 0x00007fffef3a2c70 in radv_emit_shader_prefetch (shader=<optimized out>, cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:1030
#3 radv_emit_shader_prefetch (shader=<optimized out>, cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:1021
#4 radv_emit_prefetch_L2 (cmd_buffer=cmd_buffer@entry=0x7fffc885c9b0, pipeline=0x7fffc8011360, vertex_stage_only=vertex_stage_only@entry=false) at ../src/amd/vulkan/radv_cmd_buffer.c:1065
#5 0x00007fffef3a5153 in radv_after_draw (cmd_buffer=0x7fffc885c9b0) at ../src/amd/vulkan/radv_cmd_buffer.c:5718
#6 0x00007ffff69406eb in draw_indexed<(zink_multidraw)1> (needs_drawid=false, draw_id=<optimized out>, num_draws=1, draws=0x7fffdddf9ce0, dinfo=0x1048ae0, ctx=0x44f500) at ../src/gallium/drivers/zink/zink_draw.cpp:288
#7 zink_draw_vbo<(zink_multidraw)1, (zink_dynamic_state)1, (zink_dynamic_state2)1, (zink_dynamic_vertex_input)0, false, false, (zink_drawid)0, (zink_basevertex)0> (pctx=0x44f500, dinfo=0x1048ae0, drawid_offset=<optimized out>, dindirect=0x0, draws=<optimized out>, num_draws=<optimized out>) at ../src/gallium/drivers/zink/zink_draw.cpp:730
#8 0x00007ffff66c4b06 in tc_call_draw_single (pipe=<optimized out>, call=0x1048ad8, last_ptr=<optimized out>) at ../src/gallium/auxiliary/util/u_threaded_context.c:3006
#9 0x00007ffff66c47e2 in tc_batch_execute (job=job@entry=0x1045e60, gdata=gdata@entry=0x0, thread_index=thread_index@entry=0) at ../src/gallium/auxiliary/util/u_threaded_context.c:190
#10 0x00007ffff6194f7c in util_queue_thread_func (input=input@entry=0x828d00) at ../src/util/u_queue.c:313
#11 0x00007ffff6194a37 in impl_thrd_routine (p=<optimized out>) at ../include/c11/threads_posix.h:87
#12 0x00007ffff741a299 in start_thread () from /lib64/libpthread.so.0
#13 0x00007ffff786d353 in clone () from /lib64/libc.so.6
This amounts to about 10% of total CPU time in the test case. It seems like something that should be investigated.