Scaletempo 2x playback causes high CPU load
On an Android phone that I use daily (running Android O), I have found that when playing at 2x speed, the CPU load of the entire GStreamer pipeline is relatively high, sometimes reaching around 60%. I used simpleperf to generate a call-graph hot-spot profile and found that the bottleneck is the best_overlap_offset_s16() function in gstscaletempo.c: almost 98% of the CPU time is spent in this function. I am therefore asking the experts on this forum for help. Is anyone familiar with NEON programming? Could you help me optimize this function using NEON SIMD instructions? I have no prior experience with NEON, but I do have a mobile device at hand that I can use to compare the performance before and after optimization.
Arch: arm64
Event: cpu-cycles:u (type 0, config 0)
Samples: 30624
Event count: 23246964527
Children Self Command Pid Tid Shared Object Symbol
34.42% 0.00% amcaudiodec-c 17305 17427 /apex/com.android.runtime/lib/bionic/libc.so __start_thread
|
-- __start_thread
|
-- __pthread_start(void*)
g_thread_proxy
g_thread_pool_thread_proxy
gst_task_func
|--0.04%-- [hit in function]
|
--99.96%-- gst_amc_audio_dec_loop
|--0.13%-- [hit in function]
|
|--95.13%-- gst_audio_decoder_finish_frame
| |--0.14%-- [hit in function]
| |
| |--99.21%-- gst_audio_decoder_output
| | |--0.03%-- [hit in function]
| | |
| | --99.97%-- gst_audio_decoder_push_forward
| | |
| | |--99.92%-- gst_pad_push
| | | gst_pad_push_data
| | | |--0.01%-- [hit in function]
| | | |
| | | |--99.93%-- gst_pad_chain_data_unchecked
| | | | |--0.08%-- [hit in function]
| | | | |
| | | | |--99.78%-- gst_proxy_pad_chain_default
| | | | | |--0.08%-- [hit in function]
| | | | | |
| | | | | |--99.87%-- gst_pad_push
| | | | | | |
| | | | | | |--99.95%-- gst_pad_push_data
| | | | | | | |--0.01%-- [hit in function]
| | | | | | | |
| | | | | | | |--99.95%-- gst_pad_chain_data_unchecked
| | | | | | | | |
| | | | | | | | |--99.96%-- gst_proxy_pad_chain_default
| | | | | | | | | gst_pad_push
| | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | |
| | | | | | | | | --99.98%-- gst_pad_push_data
| | | | | | | | | |
| | | | | | | | | |--99.89%-- gst_pad_chain_data_unchecked
| | | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | | |
| | | | | | | | | | |--99.95%-- gst_concat_sink_chain
| | | | | | | | | | | |
| | | | | | | | | | | |--99.95%-- gst_pad_push
| | | | | | | | | | | | gst_pad_push_data
| | | | | | | | | | | | |
| | | | | | | | | | | | |--99.97%-- gst_pad_chain_data_unchecked
| | | | | | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | | | | | |
| | | | | | | | | | | | | |--99.91%-- gst_proxy_pad_chain_default
| | | | | | | | | | | | | | |
| | | | | | | | | | | | | | |--99.92%-- gst_pad_push
| | | | | | | | | | | | | | | gst_pad_push_data
| | | | | | | | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | |--99.97%-- gst_pad_chain_data_unchecked
| | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | |--99.93%-- gst_tee_chain
| | | | | | | | | | | | | | | | | |--0.05%-- [hit in function]
| | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | --99.95%-- gst_tee_handle_data
| | | | | | | | | | | | | | | | | |--0.06%-- [hit in function]
| | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | --99.94%-- gst_pad_push
| | | | | | | | | | | | | | | | | gst_pad_push_data
| | | | | | | | | | | | | | | | | |--0.01%-- [hit in function]
| | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | |--99.93%-- gst_pad_chain_data_unchecked
| | | | | | | | | | | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | |--99.93%-- gst_stream_synchronizer_sink_chain
| | | | | | | | | | | | | | | | | | | |--0.05%-- [hit in function]
| | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | |--99.88%-- gst_pad_push
| | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | |--99.98%-- gst_pad_push_data
| | | | | | | | | | | | | | | | | | | | | |--0.03%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | |--99.93%-- gst_pad_chain_data_unchecked
| | | | | | | | | | | | | | | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | |--99.88%-- gst_proxy_pad_chain_default
| | | | | | | | | | | | | | | | | | | | | | | gst_pad_push
| | | | | | | | | | | | | | | | | | | | | | | gst_pad_push_data
| | | | | | | | | | | | | | | | | | | | | | | |--0.03%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | |--99.90%-- gst_pad_chain_data_unchecked
| | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | |--99.95%-- gst_base_transform_chain
| | | | | | | | | | | | | | | | | | | | | | | | | |--0.04%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | |--99.78%-- gst_pad_push
| | | | | | | | | | | | | | | | | | | | | | | | | | gst_pad_push_data
| | | | | | | | | | | | | | | | | | | | | | | | | | |--0.01%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | --99.99%-- gst_pad_chain_data_unchecked
| | | | | | | | | | | | | | | | | | | | | | | | | | |--0.02%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | |--99.95%-- gst_base_transform_chain
| | | | | | | | | | | | | | | | | | | | | | | | | | | |--0.01%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | |--99.47%-- default_generate_output
| | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | |--99.06%-- gst_scaletempo_transform
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--0.18%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--98.51%-- best_overlap_offset_s16
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--0.59%-- fill_queue
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--6.43%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--57.72%-- __memcpy_base_a55
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--22.51%-- gst_buffer_map
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | gst_buffer_map_range
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--26.49%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--47.69%-- gst_memory_make_mapped
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | gst_memory_map
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--48.71%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | --51.29%-- gst_mini_object_lock
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | --25.82%-- _get_merged_memory
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | gst_mini_object_ref
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | --13.34%-- gst_buffer_unmap
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--40.69%-- [hit in function]
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |--40.50%-- gst_mini_object_unlock
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | --18.81%-- gst_memory_unmap
/* Buffer padding for loop optimization: sizeof(gint32) * (loop_size - 1).
 *
 * The correlation loop in best_overlap_offset_s16() is unrolled by four
 * (loop_size == 4), so its final pass can read up to three elements past the
 * end of the data it correlates.  The value is 4 * 3 = 12, i.e. three
 * gint32-sized elements expressed in bytes.
 *
 * NOTE(review): presumably this is added to the size of the buffers that loop
 * reads; the allocation sites are not visible here -- confirm.
 */
#define UNROLL_PADDING (4*3)
/*
 * best_overlap_offset_s16:
 * @st: scaletempo state.  Reads table_window, buf_overlap, buf_queue,
 *      samples_per_frame, samples_overlap, frames_search and bytes_per_frame;
 *      overwrites buf_pre_corr.
 *
 * Searches the queued signed 16-bit samples for the position that correlates
 * best with the windowed overlap buffer.
 *
 * Step 1 fills buf_pre_corr with
 *     (table_window[k] * buf_overlap[samples_per_frame + k]) >> 15
 * for k in [0, samples_overlap - samples_per_frame).
 *
 * Step 2 evaluates, for every candidate offset in [0, frames_search), the sum
 * of buf_pre_corr[k] * queue[(offset + 1) * samples_per_frame + k] and keeps
 * the offset with the largest sum.  On ties the earliest offset wins because
 * the comparison is strict.
 *
 * All products are computed in 64 bits: multiplying a gint32 by a gint16 in
 * plain int arithmetic could overflow, which is undefined behaviour for
 * signed integers.
 *
 * Returns: the best offset multiplied by st->bytes_per_frame (0 when
 * frames_search is 0).
 */
static guint
best_overlap_offset_s16 (GstScaletempo * st)
{
  /* Loop-invariant state, read once instead of on every iteration. */
  const guint samples_per_frame = st->samples_per_frame;
  const guint frames_search = st->frames_search;
  /* Number of samples correlated per candidate offset. */
  const glong corr_len =
      (glong) st->samples_overlap - (glong) samples_per_frame;
  const gint32 *pw = st->table_window;
  const gint16 *po = st->buf_overlap;
  gint32 *pre_corr = st->buf_pre_corr;
  const gint32 *ppc_end;
  const gint16 *ps_end;
  gint64 best_corr = G_MININT64;
  guint best_off = 0;
  guint off;
  glong i;

  /* Step 1: window the overlap buffer, skipping its first frame. */
  po += samples_per_frame;
  for (i = 0; i < corr_len; i++) {
    pre_corr[i] = (gint32) (((gint64) pw[i] * po[i]) >> 15);
  }

  /* Both pointers address one-past-the-end of the span being correlated, so
   * the inner loop can count a negative index up towards zero. */
  ppc_end = pre_corr + corr_len;
  ps_end = (gint16 *) st->buf_queue + samples_per_frame + corr_len;

  /* Step 2: correlate at every candidate offset and remember the best. */
  for (off = 0; off < frames_search; off++) {
    gint64 corr = 0;

    /* Unrolled by four and executed at least once: when corr_len is not a
     * multiple of four the last pass reads up to three elements beyond the
     * span.  The buffers must provide that slack (see UNROLL_PADDING). */
    i = -corr_len;
    do {
      corr += (gint64) ppc_end[i + 0] * ps_end[i + 0];
      corr += (gint64) ppc_end[i + 1] * ps_end[i + 1];
      corr += (gint64) ppc_end[i + 2] * ps_end[i + 2];
      corr += (gint64) ppc_end[i + 3] * ps_end[i + 3];
      i += 4;
    } while (i < 0);

    if (corr > best_corr) {
      best_corr = corr;
      best_off = off;
    }

    /* Advance the search position by samples_per_frame samples. */
    ps_end += samples_per_frame;
  }

  return best_off * st->bytes_per_frame;
}