Commits on Source (33)
waypipe
build/
Doxyfile
html
latex
doc
Contributing guidelines
===============================================================================
# Formatting
## Formatting
To avoid needless time spent formatting things, this project has autoformatting
set up. Yes, it's often ugly, but after using it long enough you'll forget that
......@@ -10,7 +12,7 @@ format all source code files in the project.
[0] https://github.com/python/black
[1] https://clang.llvm.org/docs/ClangFormat.html
# Types
## Types
* Typedefs should be used only for function signatures, and never applied to
structs.
......@@ -22,3 +24,25 @@ format all source code files in the project.
* `printf` should be called with the correct format codes. For example, `%zd`
for `ssize_t`, and the `PRIu32` macro for `uint32_t`; see the sketch after this list.
* Avoid unnecessary casts.
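As a hedged illustration of the format-code rule (this snippet is not from the
Waypipe sources; the function and variable names are made up):

```c
#include <inttypes.h>
#include <stdio.h>
#include <sys/types.h>

/* Hypothetical example: each argument uses its matching format code */
void log_transfer(ssize_t nread, uint32_t serial, size_t buf_len)
{
	printf("read %zd bytes (serial %" PRIu32 ", buffer %zu)\n",
			nread, serial, buf_len);
}
```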
## Comments
Explain precisely that which is not obvious. `/* ... */` is preferred to
`// ...` for longer comments; the leading `/*` and trailing `*/` do not need
lines of their own. Use Doxygen style (`/**`) for functions and structs that
need commenting, but not to the point where it hinders source code readability.
Waypipe is not a library.
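A hedged sketch of these comment conventions (the type and field names below
are hypothetical, used only for illustration):

```c
/* Longer explanations use block comments; the opening and closing
 * delimiters share lines with the text rather than standing alone. */

/** Doxygen-style summary for a struct that genuinely needs one.
 * (Hypothetical example type, not part of Waypipe.) */
struct example_queue {
	int head; /* index of the next entry to pop */
	int tail; /* index one past the last entry */
};
```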
## Memory and errors
All error conditions should be handled, including the errors produced by
allocation failures. (It is relatively easy to test for allocation failure
by `LD_PRELOAD`ing a library that redefines malloc et al.; see for instance
"mallocfail" and "failmalloc". `ulimit -v` may be less effective.)
Some errors are unrecoverable, and for those cases Waypipe should shut down
cleanly. For instance, if Waypipe cannot replicate a file descriptor, then an
application connected through it will almost certainly crash, and it's better
to have Waypipe exit instead. Other errors can safely be ignored -- if
fine-grained damage tracking fails, a sane fallback would be to assume that an
entire surface is damaged.
......@@ -8,9 +8,13 @@ project(
'warning_level=3',
'werror=true',
],
version: '0.5.0',
version: '0.6.0',
)
# DEFAULT_SOURCE implies POSIX_C_SOURCE 200809L + extras like CMSG_LEN
# requires glibc >= 2.19 (2014), freebsd libc (since 2016?), musl >= 1.1.5 (2014)
add_project_arguments('-D_DEFAULT_SOURCE', language: 'c')
cc = meson.get_compiler('c')
config_data = configuration_data()
......@@ -59,12 +63,12 @@ endif
libavcodec = dependency('libavcodec', required: get_option('with_video'))
libavutil = dependency('libavutil', required: get_option('with_video'))
libswscale = dependency('libswscale', required: get_option('with_video'))
libva = dependency('libva', required: get_option('with_vaapi'))
if libavcodec.found() and libavutil.found() and libswscale.found()
config_data.set('HAS_VIDEO', 1, description: 'Enable video (de)compression')
endif
libva = dependency('libva', required: get_option('with_vaapi'))
if libva.found()
config_data.set('HAS_VAAPI', 1, description: 'Enable hardware video (de)compression with VAAPI')
if libva.found()
config_data.set('HAS_VAAPI', 1, description: 'Enable hardware video (de)compression with VAAPI')
endif
endif
waypipe_includes = [include_directories('protocols'), include_directories('src')]
......
......@@ -23,10 +23,6 @@
* SOFTWARE.
*/
#if !defined(__DragonFly__) && !defined(__FreeBSD__) && !defined(__OpenBSD__)
/* aligned_alloc isn't part of any X/Open version */
#define _XOPEN_SOURCE 700
#endif
#include "shadow.h"
#include "util.h"
......@@ -57,9 +53,8 @@ static const struct compression_range comp_ranges[] = {
static void *create_text_like_image(size_t size)
{
uint8_t *data = malloc(size);
size_t step = 0;
for (size_t i = 0; i < size; i++) {
step = i / 203 - i / 501;
size_t step = i / 203 - i / 501;
bool s = step % 2 == 0;
data[i] = (uint8_t)(s ? ((step >> 1) & 0x2) + 0xfe : 0x00);
}
......@@ -151,6 +146,24 @@ static int compare_timespec(const struct timespec *a, const struct timespec *b)
return 0;
}
/* requires delta >= 0 */
static struct timespec timespec_add(struct timespec base, int64_t delta_ns)
{
struct timespec ret;
ret.tv_sec = base.tv_sec + delta_ns / 1000000000LL;
ret.tv_nsec = base.tv_nsec + delta_ns % 1000000000LL;
if (ret.tv_nsec >= 1000000000LL) {
ret.tv_nsec -= 1000000000LL;
ret.tv_sec++;
}
return ret;
}
static int64_t timespec_sub(struct timespec a, struct timespec b)
{
return (a.tv_sec - b.tv_sec) * 1000000000LL + (a.tv_nsec - b.tv_nsec);
}
#define NSAMPLES 5
static struct bench_result run_sub_bench(bool first,
......@@ -185,9 +198,9 @@ static struct bench_result run_sub_bench(bool first,
render.drm_fd = 1;
render.av_disabled = true;
struct bytebuf msg = {.size = sizeof(sizeof(struct wmsg_open_file)),
struct bytebuf msg = {.size = sizeof(struct wmsg_open_file),
.data = (char *)&file_msg};
apply_update(&map, &pool, &render, WMSG_OPEN_FILE, 0, &msg);
(void)apply_update(&map, &pool, &render, WMSG_OPEN_FILE, 0, &msg);
struct shadow_fd *sfd = get_shadow_for_rid(&map, 0);
int iter = 0;
......@@ -203,13 +216,14 @@ static struct bench_result run_sub_bench(bool first,
damage_everything(&sfd->damage);
/* Create transfer queue */
struct transfer_data transfer_data;
memset(&transfer_data, 0, sizeof(struct transfer_data));
pthread_mutex_init(&transfer_data.lock, NULL);
struct transfer_queue transfer_data;
memset(&transfer_data, 0, sizeof(struct transfer_queue));
pthread_mutex_init(&transfer_data.async_recv_queue.lock, NULL);
struct timespec t0, t1;
clock_gettime(CLOCK_REALTIME, &t0);
collect_update(&pool, sfd, &transfer_data);
start_parallel_work(&pool, &transfer_data.async_recv_queue);
/* A restricted main loop, in which transfer blocks are
* instantaneously consumed when previous blocks have been
......@@ -229,17 +243,17 @@ static struct bench_result run_sub_bench(bool first,
run_task(&task, &pool.threads[0]);
pthread_mutex_lock(&pool.work_mutex);
pool.queue_in_progress--;
pool.tasks_in_progress--;
pthread_mutex_unlock(&pool.work_mutex);
}
struct timespec cur_time;
clock_gettime(CLOCK_REALTIME, &cur_time);
if (compare_timespec(&next_write_time, &cur_time) < 0) {
pthread_mutex_lock(&transfer_data.lock);
if (transfer_data.end != transfer_data.start) {
transfer_load_async(&transfer_data);
if (transfer_data.start < transfer_data.end) {
struct iovec v =
transfer_data.data
transfer_data.vecs
[transfer_data.start++];
float delay_s = (float)v.iov_len /
(bandwidth_mBps * 1e6f);
......@@ -254,39 +268,24 @@ static struct bench_result run_sub_bench(bool first,
/* Advance timer for next receipt */
int64_t delay_ns = (int64_t)(
delay_s * 1000000000LL);
next_write_time.tv_sec =
cur_time.tv_sec +
delay_ns / 1000000000LL;
next_write_time.tv_nsec =
cur_time.tv_nsec +
delay_ns % 1000000000LL;
if (next_write_time.tv_nsec >
1000000000LL) {
next_write_time.tv_nsec -=
1000000000LL;
next_write_time.tv_sec++;
}
delay_s * 1e9f);
next_write_time = timespec_add(
cur_time, delay_ns);
}
pthread_mutex_unlock(&transfer_data.lock);
} else {
/* Very short delay, for poll loop */
bool tasks_remaining = false;
pthread_mutex_lock(&pool.work_mutex);
tasks_remaining = pool.queue_end >
pool.queue_start;
tasks_remaining = pool.stack_count > 0;
pthread_mutex_unlock(&pool.work_mutex);
struct timespec delay_time;
delay_time.tv_sec = 0;
delay_time.tv_nsec = 10000;
if (!tasks_remaining) {
int64_t nsecs_left =
(next_write_time.tv_sec -
cur_time.tv_sec) *
1000000000LL +
(next_write_time.tv_nsec -
cur_time.tv_nsec);
int64_t nsecs_left = timespec_sub(
next_write_time,
cur_time);
if (nsecs_left > 1000000000LL) {
nsecs_left = 1000000000LL;
}
......@@ -297,9 +296,7 @@ static struct bench_result run_sub_bench(bool first,
nanosleep(&delay_time, NULL);
}
bool all_sent = false;
pthread_mutex_lock(&transfer_data.lock);
all_sent = transfer_data.start == transfer_data.end;
pthread_mutex_unlock(&transfer_data.lock);
if (done && all_sent) {
break;
......@@ -307,7 +304,7 @@ static struct bench_result run_sub_bench(bool first,
}
finish_update(sfd);
cleanup_transfers(&transfer_data);
cleanup_transfer_queue(&transfer_data);
clock_gettime(CLOCK_REALTIME, &t1);
struct diff_comp_results r;
......@@ -376,12 +373,9 @@ int run_bench(float bandwidth_mBps, int n_worker_threads)
calloc((size_t)ntests, sizeof(struct bench_result));
struct bench_result *iresults =
calloc((size_t)ntests, sizeof(struct bench_result));
int ntres = 0;
int nires = 0;
int ntres = 0, nires = 0;
for (int k = 0; k < 2; k++) {
bool text_like = k == 0;
struct bench_result *results = text_like ? tresults : iresults;
int *nresults = text_like ? &ntres : &nires;
int j = 0;
for (size_t c = 0;
!shutdown_flag &&
......@@ -400,8 +394,13 @@ int run_bench(float bandwidth_mBps, int n_worker_threads)
text_like, test_size,
text_like ? text_image
: vid_image);
results[j++] = res;
(*nresults)++;
if (text_like) {
tresults[j++] = res;
ntres++;
} else {
iresults[j++] = res;
nires++;
}
}
}
}
......
......@@ -23,22 +23,20 @@
* SOFTWARE.
*/
#define _XOPEN_SOURCE 700
#include "main.h"
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>
static int get_inherited_socket()
static int get_inherited_socket(void)
{
const char *fd_no = getenv("WAYLAND_SOCKET");
char *endptr = NULL;
......@@ -162,7 +160,7 @@ static int run_single_client_reconnector(
return retcode;
}
static int run_single_client(int channelsock, pid_t eol_pid,
static int run_single_client(int channelsock, pid_t *eol_pid,
const struct main_config *config, int disp_fd)
{
/* To support reconnection attempts, this mode creates a child
......@@ -300,10 +298,13 @@ static int handle_new_client_connection(int channelsock, int chanclient,
// socket
close(channelsock);
close(linkfds[0]);
for (int i = 0; i < connmap->count; i++) {
close(connmap->data[i].linkfd);
}
int dfd = connect_to_socket(disp_path);
if (dfd == -1) {
return EXIT_FAILURE;
exit(EXIT_FAILURE);
}
// ignore retcode ?
main_interface_loop(chanclient, dfd, linkfds[1], config, true);
......@@ -315,6 +316,7 @@ static int handle_new_client_connection(int channelsock, int chanclient,
goto fail_ps;
}
// Remove connection from this process
close(linkfds[1]);
close(chanclient);
connmap->data[connmap->count++] = (struct conn_addr){
.linkfd = linkfds[0], .token = conn_id, .pid = npid};
......@@ -327,7 +329,7 @@ fail_cc:
return -1;
}
static int run_multi_client(int channelsock, pid_t eol_pid,
static int run_multi_client(int channelsock, pid_t *eol_pid,
const struct main_config *config,
const char disp_path[static MAX_SOCKETPATH_LEN])
{
......@@ -342,8 +344,6 @@ static int run_multi_client(int channelsock, pid_t eol_pid,
int status = -1;
if (wait_for_pid_and_clean(
eol_pid, &status, WNOHANG, &connmap)) {
eol_pid = 0; // < in case eol_pid is recycled
wp_debug("Child (ssh) died, exiting");
// Copy the exit code
retcode = WEXITSTATUS(status);
......@@ -449,17 +449,17 @@ int run_client(const char *socket_path, const struct main_config *config,
int retcode;
if (oneshot) {
retcode = run_single_client(
channelsock, eol_pid, config, dispfd);
channelsock, &eol_pid, config, dispfd);
} else {
retcode = run_multi_client(
channelsock, eol_pid, config, disp_path);
channelsock, &eol_pid, config, disp_path);
}
unlink(socket_path);
int cleanup_type = shutdown_flag ? WNOHANG : 0;
int status = -1;
// Don't return until all child processes complete
if (wait_for_pid_and_clean(eol_pid, &status, cleanup_type, NULL)) {
if (wait_for_pid_and_clean(&eol_pid, &status, cleanup_type, NULL)) {
retcode = WEXITSTATUS(status);
}
return retcode;
......
......@@ -23,7 +23,6 @@
* SOFTWARE.
*/
#define _XOPEN_SOURCE 700
#include "dmabuf.h"
#include "util.h"
......@@ -248,7 +247,7 @@ struct gbm_bo *import_dmabuf(struct render_data *rd, int fd, size_t *size,
strerror(errno));
return NULL;
}
if (read_modifier) {
if (read_modifier && info) {
info->modifier = gbm_bo_get_modifier(bo);
const uint64_t drm_format_mod_invalid = 0x00ffffffffffffffULL;
if (info->modifier == drm_format_mod_invalid) {
......@@ -263,9 +262,9 @@ struct gbm_bo *import_dmabuf(struct render_data *rd, int fd, size_t *size,
bool is_dmabuf(int fd)
{
// Prepare an invalid request, with a dma-buf specific IOCTL
struct dma_buf_sync sync;
sync.flags = 0;
if (ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync) != -1) {
struct dma_buf_sync arg;
arg.flags = 0;
if (ioctl(fd, DMA_BUF_IOCTL_SYNC, &arg) != -1) {
wp_error("DMAbuf test ioctl succeeded when it should have errored");
return false;
}
......
......@@ -46,6 +46,7 @@ struct render_data {
VAConfigID av_copy_config;
};
/** Additional information to help serialize a dmabuf */
struct dmabuf_slice_data {
/* This information partially duplicates that of a gbm_bo. However, for
* instance with weston, it is possible for the compositor to handle
......@@ -62,16 +63,18 @@ struct dmabuf_slice_data {
uint8_t using_planes[4];
};
/* Additional information to help serialize a dmabuf */
int init_render_data(struct render_data *);
void cleanup_render_data(struct render_data *);
bool is_dmabuf(int fd);
struct gbm_bo *make_dmabuf(struct render_data *rd, size_t size,
const struct dmabuf_slice_data *info);
int export_dmabuf(struct gbm_bo *bo);
/** Import DMABUF to a GBM buffer object; if `read_modifier` is true, then
* the `info->modifier` will be overwritten with whatever the modifier is */
struct gbm_bo *import_dmabuf(struct render_data *rd, int fd, size_t *size,
struct dmabuf_slice_data *info, bool read_modifier);
void destroy_dmabuf(struct gbm_bo *bo);
/** Map a DMABUF for reading or for writing */
void *map_dmabuf(struct gbm_bo *bo, bool write, void **map_handle);
int unmap_dmabuf(struct gbm_bo *bo, void *map_handle);
/** The handle values are unique among the set of currently active buffer
......
......@@ -23,8 +23,6 @@
* SOFTWARE.
*/
#define _XOPEN_SOURCE 700
#include "main.h"
#include "parsing.h"
#include "shadow.h"
......@@ -239,38 +237,47 @@ struct wp_object *create_wp_object(uint32_t id, const struct wp_interface *type)
{
/* Note: if custom types are ever implemented for globals, they would
* need special replacement logic when the type is set */
struct wp_object *new_obj;
size_t sz;
if (type == &intf_wl_shm_pool) {
new_obj = calloc(1, sizeof(struct wp_shm_pool));
sz = sizeof(struct wp_shm_pool);
} else if (type == &intf_wl_buffer) {
new_obj = calloc(1, sizeof(struct wp_buffer));
sz = sizeof(struct wp_buffer);
} else if (type == &intf_wl_surface) {
new_obj = calloc(1, sizeof(struct wp_surface));
((struct wp_surface *)new_obj)->scale = 1;
sz = sizeof(struct wp_surface);
} else if (type == &intf_wl_keyboard) {
new_obj = calloc(1, sizeof(struct wp_keyboard));
sz = sizeof(struct wp_keyboard);
} else if (type == &intf_zwlr_screencopy_frame_v1) {
new_obj = calloc(1, sizeof(struct wp_wlr_screencopy_frame));
sz = sizeof(struct wp_wlr_screencopy_frame);
} else if (type == &intf_wp_presentation) {
new_obj = calloc(1, sizeof(struct waypipe_presentation));
sz = sizeof(struct waypipe_presentation);
} else if (type == &intf_wp_presentation_feedback) {
new_obj = calloc(1,
sizeof(struct waypipe_presentation_feedback));
sz = sizeof(struct waypipe_presentation_feedback);
} else if (type == &intf_zwp_linux_buffer_params_v1) {
new_obj = calloc(1, sizeof(struct wp_linux_dmabuf_params));
struct wp_linux_dmabuf_params *params =
(struct wp_linux_dmabuf_params *)new_obj;
for (int i = 0; i < MAX_DMABUF_PLANES; i++) {
params->add[i].fd = -1;
}
sz = sizeof(struct wp_linux_dmabuf_params);
} else if (type == &intf_zwlr_export_dmabuf_frame_v1) {
new_obj = calloc(1, sizeof(struct wp_export_dmabuf_frame));
sz = sizeof(struct wp_export_dmabuf_frame);
} else {
new_obj = calloc(1, sizeof(struct wp_object));
sz = sizeof(struct wp_object);
}
struct wp_object *new_obj = calloc(1, sz);
if (!new_obj) {
wp_error("Failed to allocate new wp_object");
return NULL;
}
new_obj->obj_id = id;
new_obj->type = type;
new_obj->is_zombie = false;
if (type == &intf_zwp_linux_buffer_params_v1) {
struct wp_linux_dmabuf_params *params =
(struct wp_linux_dmabuf_params *)new_obj;
for (int i = 0; i < MAX_DMABUF_PLANES; i++) {
params->add[i].fd = -1;
}
} else if (type == &intf_wl_surface) {
((struct wp_surface *)new_obj)->scale = 1;
}
return new_obj;
}
......@@ -474,7 +481,7 @@ static int compute_damage_coordinates(int *xlow, int *xhigh, int *ylow,
scale);
return -1;
}
if (transform < 0 || transform > 8) {
if (transform < 0 || transform >= 8) {
wp_error("Not applying damage due to invalid buffer transform (%d)",
transform);
return -1;
......@@ -674,29 +681,38 @@ void do_wl_surface_req_commit(struct context *ctx)
surface->damage_list_len = 0;
surface->damage_list_size = 0;
}
void do_wl_surface_req_damage(struct context *ctx, int32_t x, int32_t y,
int32_t width, int32_t height)
static void append_damage_record(struct wp_surface *surface, int32_t x,
int32_t y, int32_t width, int32_t height,
bool in_buffer_coordinates)
{
if (ctx->on_display_side) {
// The display side does not need to track the damage
if (buf_ensure_size(surface->damage_list_len + 1,
sizeof(struct damage_record),
&surface->damage_list_size,
(void **)&surface->damage_list) == -1) {
wp_error("Failed to allocate space for damage list, dropping damage record");
return;
}
struct wp_surface *surface = (struct wp_surface *)ctx->obj;
buf_ensure_size(surface->damage_list_len + 1,
sizeof(struct damage_record),
&surface->damage_list_size,
(void **)&surface->damage_list);
// A rectangle of the buffer was damaged, hence backing buffers
// may be updated.
struct damage_record *damage =
&surface->damage_list[surface->damage_list_len++];
damage->buffer_coordinates = false;
damage->buffer_coordinates = in_buffer_coordinates;
damage->x = x;
damage->y = y;
damage->width = width;
damage->height = height;
}
void do_wl_surface_req_damage(struct context *ctx, int32_t x, int32_t y,
int32_t width, int32_t height)
{
if (ctx->on_display_side) {
// The display side does not need to track the damage
return;
}
append_damage_record((struct wp_surface *)ctx->obj, x, y, width, height,
false);
}
void do_wl_surface_req_damage_buffer(struct context *ctx, int32_t x, int32_t y,
int32_t width, int32_t height)
{
......@@ -704,21 +720,8 @@ void do_wl_surface_req_damage_buffer(struct context *ctx, int32_t x, int32_t y,
// The display side does not need to track the damage
return;
}
struct wp_surface *surface = (struct wp_surface *)ctx->obj;
buf_ensure_size(surface->damage_list_len + 1,
sizeof(struct damage_record),
&surface->damage_list_size,
(void **)&surface->damage_list);
// A rectangle of the buffer was damaged, hence backing buffers
// may be updated.
struct damage_record *damage =
&surface->damage_list[surface->damage_list_len++];
damage->buffer_coordinates = true;
damage->x = x;
damage->y = y;
damage->width = width;
damage->height = height;
append_damage_record((struct wp_surface *)ctx->obj, x, y, width, height,
true);
}
void do_wl_surface_req_set_buffer_transform(
struct context *ctx, int32_t transform)
......@@ -736,7 +739,7 @@ void do_wl_keyboard_evt_keymap(
struct context *ctx, uint32_t format, int fd, uint32_t size)
{
size_t fdsz = 0;
fdcat_t fdtype = get_fd_type(fd, &fdsz);
enum fdcat fdtype = get_fd_type(fd, &fdsz);
if (fdtype != FDC_FILE || fdsz != size) {
wp_error("keymap candidate fd %d was not file-like (type=%s), and with size=%zu did not match %u",
fd, fdcat_to_str(fdtype), fdsz, size);
......@@ -744,7 +747,7 @@ void do_wl_keyboard_evt_keymap(
}
struct shadow_fd *sfd = translate_fd(&ctx->g->map, &ctx->g->render, fd,
fdtype, fdsz, NULL, false);
fdtype, fdsz, NULL, false, false);
struct wp_keyboard *keyboard = (struct wp_keyboard *)ctx->obj;
keyboard->owned_buffer = shadow_incref_protocol(sfd);
(void)format;
......@@ -761,7 +764,7 @@ void do_wl_shm_req_create_pool(
}
size_t fdsz = 0;
fdcat_t fdtype = get_fd_type(fd, &fdsz);
enum fdcat fdtype = get_fd_type(fd, &fdsz);
/* It may be valid for the file descriptor size to be larger
* than the immediately advertised size, since the call to
* wl_shm.create_pool may be followed by wl_shm_pool.resize,
......@@ -775,7 +778,7 @@ void do_wl_shm_req_create_pool(
}
struct shadow_fd *sfd = translate_fd(&ctx->g->map, &ctx->g->render, fd,
fdtype, fdsz, NULL, false);
fdtype, fdsz, NULL, false, false);
the_shm_pool->owned_buffer = shadow_incref_protocol(sfd);
}
......@@ -1017,7 +1020,7 @@ void do_wl_drm_req_create_prime_buffer(struct context *ctx,
#if !defined(__DragonFly__) && !defined(__FreeBSD__)
size_t fdsz = 0;
fdcat_t fdtype = get_fd_type(name, &fdsz);
enum fdcat fdtype = get_fd_type(name, &fdsz);
if (fdtype != FDC_DMABUF) {
wp_error("create_prime_buffer candidate fd %d was not a dmabuf (type=%s)",
name, fdcat_to_str(fdtype));
......@@ -1026,7 +1029,7 @@ void do_wl_drm_req_create_prime_buffer(struct context *ctx,
#endif
struct shadow_fd *sfd = translate_fd(&ctx->g->map, &ctx->g->render,
name, FDC_DMABUF, 0, &info, true);
name, FDC_DMABUF, 0, &info, true, false);
buf->type = BUF_DMA;
buf->dmabuf_nplanes = 1;
buf->dmabuf_buffers[0] = shadow_incref_protocol(sfd);
......@@ -1239,7 +1242,7 @@ void do_zwp_linux_buffer_params_v1_req_create(struct context *ctx,
#if !defined(__DragonFly__) && !defined(__FreeBSD__)
size_t fdsz = 0;
fdcat_t fdtype = get_fd_type(params->add[i].fd, &fdsz);
enum fdcat fdtype = get_fd_type(params->add[i].fd, &fdsz);
if (fdtype != FDC_DMABUF) {
wp_error("fd #%d for linux-dmabuf request wasn't a dmabuf, instead %s",
i, fdcat_to_str(fdtype));
......@@ -1247,7 +1250,7 @@ void do_zwp_linux_buffer_params_v1_req_create(struct context *ctx,
}
#endif
fdcat_t res_type = FDC_DMABUF;
enum fdcat res_type = FDC_DMABUF;
if (ctx->g->config->video_if_possible) {
// TODO: multibuffer support
if (all_same_fds && video_supports_dmabuf_format(format,
......@@ -1259,7 +1262,7 @@ void do_zwp_linux_buffer_params_v1_req_create(struct context *ctx,
struct shadow_fd *sfd = translate_fd(&ctx->g->map,
&ctx->g->render, params->add[i].fd, res_type, 0,
&info, false);
&info, false, false);
/* increment for each extra time this fd will be sent */
if (sfd->has_owner) {
shadow_incref_transfer(sfd);
......@@ -1348,7 +1351,7 @@ void do_zwlr_export_dmabuf_frame_v1_evt_object(struct context *ctx,
#if !defined(__DragonFly__) && !defined(__FreeBSD__)
size_t fdsz = 0;
fdcat_t fdtype = get_fd_type(fd, &fdsz);
enum fdcat fdtype = get_fd_type(fd, &fdsz);
if (fdtype != FDC_DMABUF) {
wp_error("fd %d, #%d for wlr-export-dmabuf frame wasn't a dmabuf, instead %s",
fd, index, fdcat_to_str(fdtype));
......@@ -1357,7 +1360,7 @@ void do_zwlr_export_dmabuf_frame_v1_evt_object(struct context *ctx,
#endif
struct shadow_fd *sfd = translate_fd(&ctx->g->map, &ctx->g->render, fd,
FDC_DMABUF, 0, &info, false);
FDC_DMABUF, 0, &info, false, false);
if (sfd->buffer_size < size) {
wp_error("Frame object %u has a dmabuf with less (%u) than the advertised (%u) size",
index, (uint32_t)sfd->buffer_size, size);
......@@ -1397,8 +1400,8 @@ static void translate_data_transfer_fd(struct context *context, int32_t fd)
* socketpair, with additional properties. The fd being sent
* around should be, according to the protocol, only written into and
* closed */
translate_fd(&context->g->map, &context->g->render, fd, FDC_PIPE_IW, 0,
NULL, false);
translate_fd(&context->g->map, &context->g->render, fd, FDC_PIPE, 0,
NULL, false, true);
}
void do_gtk_primary_selection_offer_req_receive(
struct context *ctx, const char *mime_type, int fd)
......
......@@ -23,7 +23,6 @@
* SOFTWARE.
*/
#define _XOPEN_SOURCE 700
#include "shadow.h"
#include <stdlib.h>
......@@ -91,9 +90,12 @@ static int fix_merge_stack_property(int size, struct merge_stack_elem *stack,
return size;
}
buf_ensure_size(top.count + nxt.count + 1,
sizeof(struct interval), &temp->size,
(void **)&temp->data);
if (buf_ensure_size(top.count + nxt.count + 1,
sizeof(struct interval), &temp->size,
(void **)&temp->data) == -1) {
wp_error("Failed to resize a merge buffer, some damage intervals may be lost");
return size;
}
int xs = stream_merge(top.count, &base->data[top.offset],
nxt.count, &base->data[nxt.offset], temp->data,
......@@ -116,6 +118,33 @@ static int fix_merge_stack_property(int size, struct merge_stack_elem *stack,
return size;
}
static int unpack_ext_interval(struct interval *vec,
const struct ext_interval e, int alignment_bits)
{
int iw = 0;
int last_end = INT32_MIN;
for (int ir = 0; ir < e.rep; ir++) {
int start = e.start + ir * e.stride;
int end = start + e.width;
start = (start >> alignment_bits) << alignment_bits;
end = ((end + (1 << alignment_bits) - 1) >> alignment_bits)
<< alignment_bits;
if (start > last_end) {
vec[iw].start = start;
vec[iw].end = end;
last_end = end;
iw++;
} else {
vec[iw - 1].end = end;
last_end = end;
}
}
/* end sentinel */
vec[iw] = (struct interval){.start = INT32_MAX, .end = INT32_MAX};
return iw;
}
/* By writing a mergesort by hand, we can detect duplicates early.
*
* TODO: optimize output with run-length-encoded segments
......@@ -164,9 +193,9 @@ void merge_mergesort(const int old_count, struct interval *old_list,
bool force_combine = (absorbed > 30000) ||
10 * remaining < src_count;
int64_t end = e.start + e.stride * (int64_t)(e.rep - 1) +
e.width;
if (end >= INT32_MAX) {
int64_t intv_end = e.start + e.stride * (int64_t)(e.rep - 1) +
e.width;
if (intv_end >= INT32_MAX) {
/* overflow protection */
e.width = INT32_MAX - 1 - e.start;
e.rep = 1;
......@@ -179,34 +208,15 @@ void merge_mergesort(const int old_count, struct interval *old_list,
e.rep = 1;
}
buf_ensure_size(base.count + e.rep + 1, sizeof(struct interval),
&base.size, (void **)&base.data);
struct interval *vec = &base.data[base.count];
int iw = 0;
int last_end = INT32_MIN;
for (int ir = 0; ir < e.rep; ir++) {
int start = e.start + ir * e.stride;
int end = start + e.width;
start = (start >> alignment_bits) << alignment_bits;
end = ((end + (1 << alignment_bits) - 1) >>
alignment_bits)
<< alignment_bits;
if (start > last_end) {
vec[iw].start = start;
vec[iw].end = end;
last_end = end;
iw++;
} else {
vec[iw - 1].end = end;
last_end = end;
}
if (buf_ensure_size(base.count + e.rep + 1,
sizeof(struct interval), &base.size,
(void **)&base.data) == -1) {
wp_error("Failed to resize a merge buffer, some damage intervals may be lost");
continue;
}
/* end sentinel */
vec[iw] = (struct interval){
.start = INT32_MAX, .end = INT32_MAX};
struct interval *vec = &base.data[base.count];
int iw = unpack_ext_interval(vec, e, alignment_bits);
src_count += iw;
substack[substack_size] = (struct merge_stack_elem){
......@@ -250,27 +260,6 @@ void merge_damage_records(struct damage *base, int nintervals,
alignment_bits);
}
int get_damage_area(const struct damage *base)
{
if (base->damage == DAMAGE_EVERYTHING) {
return INT32_MAX;
} else if (base->damage == NULL || base->ndamage_intvs == 0) {
return 0;
} else {
int low = INT32_MAX;
int high = INT32_MIN;
int tca = 0;
for (int i = 0; i < base->ndamage_intvs; i++) {
tca += base->damage[i].end - base->damage[i].start;
}
float cover_fraction =
(float)base->acc_damage_stat / (float)tca;
wp_debug("Damage interval: {%d(%d)} -> [%d, %d) [%d], %f",
base->ndamage_intvs, base->acc_count, low, high,
tca, cover_fraction);
return tca;
}
}
void reset_damage(struct damage *base)
{
if (base->damage != DAMAGE_EVERYTHING) {
......
......@@ -27,39 +27,39 @@
#include <stdint.h>
/** A slight modification of the standard 'damage' rectangle
* formulation, written to be agnostic of whatever buffers
* underlie the system.
*
* [start,start+width),[start+stride,start+stride+width),
* ... [start+(rep-1)*stride,start+(rep-1)*stride+width) */
struct ext_interval {
/* A slight modification of the standard 'damage' rectangle
* formulation, written to be agnostic of whatever buffers
* underlie the system.
*
* [start,start+width),[start+stride,start+stride+width),
* ... [start+(rep-1)*stride,start+(rep-1)*stride+width) */
int32_t start;
/* Subinterval width */
/** Subinterval width */
int32_t width;
/* Number of distinct subinterval start positions. For a single
/** Number of distinct subinterval start positions. For a single
* interval, this is one. */
int32_t rep;
/* Spacing between start positions, should be > width, unless
/** Spacing between start positions, should be > width, unless
* there is only one subinterval, in which case the value shouldn't
* matter and is conventionally set to 0. */
int32_t stride;
};
/** [start, end). (This is better than {start,width}, since width computations
* are rare and trivial, while merging code branches frequently off of
* endpoints) */
struct interval {
/* start+end is better than start+width, since the limits are used
* repeatedly by merge operations, while width is only needed for
* e.g. streaming area estimates which are very fast anyway */
int32_t start;
int32_t end;
};
#define DAMAGE_EVERYTHING ((struct interval *)-1)
/** Interval-based damage tracking. If damage is NULL, there is
* no recorded damage. If damage is DAMAGE_EVERYTHING, the entire
* region should be updated. If ndamage_intvs > 0, then
* damage points to an array of struct interval objects. */
struct damage {
/* Interval-based damage tracking. If damage is NULL, there is
* no recorded damage. If damage is DAMAGE_EVERYTHING, the entire
* region should be updated. If ndamage_rects > 0, then
* damage points to an array of struct damage_interval objects. */
struct interval *damage;
int ndamage_intvs;
......@@ -74,8 +74,6 @@ struct damage {
* `1 << alignment_bits`. */
void merge_damage_records(struct damage *base, int nintervals,
const struct ext_interval *const new_list, int alignment_bits);
/** Return the total area covered by the damage region */
int get_damage_area(const struct damage *base);
/** Set damage to empty */
void reset_damage(struct damage *base);
/** Expand damage to cover everything */
......
......@@ -32,13 +32,6 @@
#include <stdint.h>
#include <string.h>
#if defined(__linux__) && defined(__arm__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#elif defined(__FreeBSD__) && defined(__arm__)
#include <sys/auxv.h>
#endif
size_t run_interval_diff_C(const int diff_window_size,
const void *__restrict__ imod, void *__restrict__ ibase,
uint32_t *__restrict__ idiff, size_t i, const size_t i_end)
......@@ -121,26 +114,15 @@ size_t run_interval_diff_avx2(const int diff_window_size,
#endif
#ifdef HAVE_NEON
static bool neon_available(void)
{
/* The actual methods are platform-dependent */
#if defined(__linux__) && defined(__arm__)
return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
#elif defined(__FreeBSD__) && defined(__arm__)
unsigned long hwcap = 0;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
return (hwcap & HWCAP_NEON) != 0;
#endif
return true;
}
bool neon_available(void); // in platform.c
size_t run_interval_diff_neon(const int diff_window_size,
const void *__restrict__ imod, void *__restrict__ ibase,
uint32_t *__restrict__ idiff, size_t i, const size_t i_end);
#endif
#ifdef HAVE_SSE41
static bool sse41_available(void) { return __builtin_cpu_supports("sse4.1"); }
size_t run_interval_diff_sse41(const int diff_window_size,
#ifdef HAVE_SSE3
static bool sse3_available(void) { return __builtin_cpu_supports("sse3"); }
size_t run_interval_diff_sse3(const int diff_window_size,
const void *__restrict__ imod, void *__restrict__ ibase,
uint32_t *__restrict__ idiff, size_t i, const size_t i_end);
#endif
......@@ -166,10 +148,10 @@ interval_diff_fn_t get_diff_function(enum diff_type type, int *alignment_bits)
return run_interval_diff_neon;
}
#endif
#ifdef HAVE_SSE41
if ((type == DIFF_FASTEST || type == DIFF_SSE41) && sse41_available()) {
#ifdef HAVE_SSE3
if ((type == DIFF_FASTEST || type == DIFF_SSE3) && sse3_available()) {
*alignment_bits = 5;
return run_interval_diff_sse41;
return run_interval_diff_sse3;
}
#endif
if ((type == DIFF_FASTEST || type == DIFF_C)) {
......
......@@ -37,7 +37,7 @@ enum diff_type {
DIFF_FASTEST,
DIFF_AVX512F,
DIFF_AVX2,
DIFF_SSE41,
DIFF_SSE3,
DIFF_NEON,
DIFF_C,
};
......@@ -45,13 +45,18 @@ enum diff_type {
/** Returns a function pointer to a diff construction kernel, and indicates
* the alignment of the data which is to be passed in */
interval_diff_fn_t get_diff_function(enum diff_type type, int *alignment_bits);
/** Given intervals aligned to 1<<alignment_bits, create a diff of changed
* over base, and update base to match changed. */
size_t construct_diff_core(interval_diff_fn_t idiff_fn, int alignment_bits,
const struct interval *__restrict__ damaged_intervals,
int n_intervals, void *__restrict__ base,
const void *__restrict__ changed, void *__restrict__ diff);
/** If the bytes after the last multiple of 1<<alignment_bits differ, copy
them over base and append them to the diff */
size_t construct_diff_trailing(size_t size, int alignment_bits,
char *__restrict__ base, const char *__restrict__ changed,
char *__restrict__ diff);
/** Apply a diff to both target buffers */
void apply_diff(size_t size, char *__restrict__ target1,
char *__restrict__ target2, size_t diffsize, size_t ntrailing,
const char *__restrict__ diff);
......
......@@ -88,15 +88,15 @@ size_t run_interval_diff_avx2(const int diff_window_size,
size_t ncom = (size_t)tzcnt(~mask) >> 2;
size_t block_shift = (ncom & 7);
uint64_t mask = 0xffffffffuLL
<< (block_shift * 4);
uint64_t esmask = 0xffffffffuLL
<< (block_shift * 4);
__m128i halfsize = _mm_set_epi64x(
0uLL, (long long)mask);
__m256i storemask =
0uLL, (long long)esmask);
__m256i estoremask =
_mm256_cvtepi8_epi64(halfsize);
_mm256_maskstore_epi32(
(int *)&diff[dc - block_shift],
storemask, ncom < 8 ? m0 : m1);
estoremask, ncom < 8 ? m0 : m1);
if (ncom < 8) {
_mm256_storeu_si256(
(__m256i *)&diff[dc +
......
......@@ -46,8 +46,7 @@ size_t run_interval_diff_avx512f(const int diff_window_size,
for (; i < i_end; i++) {
__m512i m = _mm512_load_si512(&mod[i]);
__m512i b = _mm512_load_si512(&base[i]);
uint32_t mask = _cvtmask16_u32(
_mm512_cmpeq_epi32_mask(m, b));
uint32_t mask = (uint32_t)_mm512_cmpeq_epi32_mask(m, b);
if (mask != 0xffff) {
_mm512_store_si512(&base[i], m);
......@@ -90,8 +89,7 @@ size_t run_interval_diff_avx512f(const int diff_window_size,
for (; i < i_end; i++) {
__m512i m = _mm512_load_si512(&mod[i]);
__m512i b = _mm512_load_si512(&base[i]);
uint32_t mask = _cvtmask16_u32(
_mm512_cmpeq_epi32_mask(m, b));
uint32_t mask = (uint32_t)_mm512_cmpeq_epi32_mask(m, b);
/* Reset trailing counter if anything changed */
uint32_t amask = ~(mask << 16);
......
......@@ -29,11 +29,9 @@
#include <emmintrin.h> // sse2
#include <pmmintrin.h> // sse3
#include <smmintrin.h> // sse4.1
#include <tmmintrin.h> // ssse3
#include <xmmintrin.h> // sse
size_t run_interval_diff_sse41(const int diff_window_size,
size_t run_interval_diff_sse3(const int diff_window_size,
const void *__restrict__ imod, void *__restrict__ ibase,
uint32_t *__restrict__ diff, size_t i, const size_t i_end)
{
......@@ -64,12 +62,17 @@ size_t run_interval_diff_sse41(const int diff_window_size,
_mm_storeu_si128(&base[2 * i + 1], m1);
/* Write the changed bytes, starting at the
* first modified term, and set the
* unchanged counter */
* first modified term, and set the unchanged
* counter. */
size_t ncom = (size_t)__builtin_ctz(~mask) >> 2;
uint32_t *mod = (uint32_t *)&base[2 * i];
union {
__m128i s[2];
uint32_t v[8];
} tmp;
tmp.s[0] = m0;
tmp.s[1] = m1;
for (size_t z = ncom; z < 8; z++) {
diff[dc++] = mod[z];
diff[dc++] = tmp.v[z];
}
trailing_unchanged = __builtin_clz(~mask) >> 2;
ctrl_blocks[0] = (uint32_t)(8 * i + ncom);
......
......@@ -47,17 +47,22 @@ struct globals {
struct thread_pool threads;
};
/* chanfd: connected socket to channel
/** Main processing loop
*
* chanfd: connected socket to channel
* progfd: connected socket to Wayland program
* linkfd: optional connected socket providing new chanfds */
int main_interface_loop(int chanfd, int progfd, int linkfd,
const struct main_config *config, bool display_side);
/** Act as a Wayland server */
int run_server(const char *socket_path, const char *display_path,
const char *control_path, const struct main_config *config,
bool oneshot, bool unlink_at_end, const char *application,
char *const app_argv[]);
/** Act as a Wayland client */
int run_client(const char *socket_path, const struct main_config *config,
bool oneshot, bool via_socket, pid_t eol_pid);
/** Run benchmarking tool; n_worker_threads defined as with \ref main_config */
int run_bench(float bandwidth_mBps, int n_worker_threads);
#endif // WAYPIPE_MAIN_H
This diff is collapsed.
waypipe_source_files = ['bench.c', 'client.c', 'dmabuf.c', 'handlers.c', 'kernel.c', 'mainloop.c', 'parsing.c', 'server.c', 'shadow.c', 'interval.c', 'util.c', 'video.c']
waypipe_dependencies = [
libgbm, # General GPU buffer creation, aligned with dmabuf proto
liblz4, # Fast compression option
libzstd, # Slow compression option
libavcodec,libavutil,libswscale, # Video encoding
waypipe_source_files = ['dmabuf.c', 'handlers.c', 'kernel.c', 'mainloop.c', 'parsing.c', 'platform.c', 'shadow.c', 'interval.c', 'util.c', 'video.c']
waypipe_deps = [
pthreads, # To run expensive computations in parallel
protos, # Wayland protocol data
rt, # For shared memory
libva, # For NV12->RGB conversions
]
if config_data.has('HAS_DMABUF')
# General GPU buffer creation, aligned with dmabuf proto
waypipe_deps += [libgbm]
endif
if config_data.has('HAS_LZ4')
waypipe_deps += [liblz4] # Fast compression option
endif
if config_data.has('HAS_ZSTD')
waypipe_deps += [libzstd] # Slow compression option
endif
if config_data.has('HAS_VIDEO')
waypipe_deps += [libavcodec,libavutil,libswscale]
endif
if config_data.has('HAS_VAAPI')
waypipe_deps += [libva] # For NV12->RGB conversions
endif
# Conditionally compile SIMD-optimized code.
# (The meson simd module is a bit too limited for this)
......@@ -22,9 +33,9 @@ if cc.has_argument('-mavx2') and cc.has_argument('-mlzcnt') and cc.has_argument(
kernel_libs += static_library('kernel_avx2', 'kernel_avx2.c', c_args:['-mavx2', '-mlzcnt', '-mbmi'])
config_data.set('HAVE_AVX2', 1, description: 'Compiler supports AVX2')
endif
if cc.has_argument('-msse4.1')
kernel_libs += static_library('kernel_sse41', 'kernel_sse41.c', c_args:['-msse4.1'])
config_data.set('HAVE_SSE41', 1, description: 'Compiler supports SSE 4.1')
if cc.has_argument('-msse3')
kernel_libs += static_library('kernel_sse3', 'kernel_sse3.c', c_args:['-msse3'])
config_data.set('HAVE_SSE3', 1, description: 'Compiler supports SSE 3')
endif
if host_machine.cpu_family() == 'aarch64' or cc.has_argument('-mfpu=neon')
neon_args = host_machine.cpu_family() == 'aarch64' ? [] : ['-mfpu=neon']
......@@ -53,12 +64,12 @@ lib_waypipe_src = static_library(
waypipe_source_files,
include_directories: waypipe_includes,
link_with: kernel_libs,
dependencies: waypipe_dependencies,
dependencies: waypipe_deps,
)
waypipe_prog = executable(
'waypipe',
['waypipe.c'],
['waypipe.c', 'bench.c', 'client.c', 'server.c'],
link_with: lib_waypipe_src,
install: true
)
......@@ -212,6 +212,7 @@ static bool build_new_objects(const struct msg_data *data,
pos += gap_no == 0 ? data->base_gap
: data->trail_gap[gap_no - 1];
pos += (payload[pos - 1] + 3) / 4;
gap_no++;
} else {
uint32_t new_id = payload[pos + data->new_obj_idxs[k]];
if (new_id == caller_obj->obj_id) {
......
......@@ -36,20 +36,20 @@ struct main_config;
struct wp_interface;
struct msg_handler {
const struct wp_interface *interface;
// these are structs packed densely with function pointers
/** These are structs packed densely with function pointers */
const void *event_handlers;
const void *request_handlers;
// can the type be produced via wl_registry::bind ?
/** Can the type be produced via wl_registry::bind ? */
bool is_global;
};
/** An object used by the wayland protocol. Specific types may extend
* this struct, using the following data as a header */
struct wp_object {
/* An object used by the wayland protocol. Specific types may extend
* this struct, using the following data as a header */
const struct wp_interface *type; // Use to lookup the message handler
uint32_t obj_id;
bool is_zombie; // object deleted but not yet acknowledged remotely
};
/** List of all Wayland protocol objects */
struct obj_list {
struct wp_object **objs;
int nobj;
......@@ -61,6 +61,7 @@ struct message_tracker {
// registry. each type produces 'callbacks'
struct obj_list objects;
};
/** Context object, to be passed to the protocol handler functions */
struct context {
struct globals *const g;
struct obj_list *const obj_list;
......@@ -80,6 +81,8 @@ struct context {
struct int_window *const fds;
};
/** Add a protocol object to the list, replacing any preceding object with
* the same id */
void listset_insert(struct fd_translation_map *map, struct obj_list *lst,
struct wp_object *obj);
void listset_remove(struct obj_list *lst, struct wp_object *obj);
......@@ -91,6 +94,7 @@ void cleanup_message_tracker(
/** Read message size from header; the 8 bytes beyond data must exist */
int peek_message_size(const void *data);
enum parse_state { PARSE_KNOWN, PARSE_UNKNOWN, PARSE_ERROR };
/**
* The return value is false iff the given message should be dropped.
* The flag `unidentified_changes` is set to true if the message does
......@@ -107,16 +111,19 @@ int peek_message_size(const void *data);
* The end of `fds` may be moved if any fds are inserted or discarded.
* The start of fds will be moved, depending on how many fds were consumed.
*/
enum parse_state { PARSE_KNOWN, PARSE_UNKNOWN, PARSE_ERROR };
enum parse_state handle_message(struct globals *g, bool on_display_side,
bool from_client, struct char_window *chars,
struct int_window *fds);
// handlers.c
/** Create a new Wayland protocol object of the given type; some types
* produce structs extending from wp_object */
struct wp_object *create_wp_object(
uint32_t it, const struct wp_interface *type);
/** Type-specific destruction routines, also dereferencing linked shadow_fds */
void destroy_wp_object(
struct fd_translation_map *map, struct wp_object *object);
extern const struct msg_handler handlers[];
extern const struct wp_interface *the_display_interface;
......