From bb7dc7c20dbe52e56005ee2c71187974cd0fa284 Mon Sep 17 00:00:00 2001
From: Phillip Burgess <paintyourdragon@dslextreme.com>
Date: Thu, 9 Mar 2023 15:00:54 -0800
Subject: [PATCH] Remove soft link to libdvi (copy full directory instead)

For Arduino Library Manager compliance
---
 Readme.md                          |   3 +-
 src/libdvi                         |   1 -
 src/libdvi/CMakeLists.txt          |  33 ++
 src/libdvi/dvi.c                   | 255 ++++++++++++
 src/libdvi/dvi.h                   |  81 ++++
 src/libdvi/dvi_config_defs.h       | 151 +++++++
 src/libdvi/dvi_serialiser.c        |  73 ++++
 src/libdvi/dvi_serialiser.h        |  22 +
 src/libdvi/dvi_serialiser.pio      |  53 +++
 src/libdvi/dvi_serialiser.pio.h    | 101 +++++
 src/libdvi/dvi_timing.c            | 324 +++++++++++++++
 src/libdvi/dvi_timing.h            |  99 +++++
 src/libdvi/tmds_encode.S           | 623 +++++++++++++++++++++++++++++
 src/libdvi/tmds_encode.c           | 305 ++++++++++++++
 src/libdvi/tmds_encode.h           |  46 +++
 src/libdvi/tmds_encode_1bpp.pio    |  46 +++
 src/libdvi/tmds_table.h            |  76 ++++
 src/libdvi/tmds_table_fullres.h    | 139 +++++++
 src/libdvi/tmds_table_gen.py       | 150 +++++++
 src/libdvi/util_queue_u32_inline.h |  83 ++++
 20 files changed, 2661 insertions(+), 3 deletions(-)
 delete mode 120000 src/libdvi
 create mode 100644 src/libdvi/CMakeLists.txt
 create mode 100644 src/libdvi/dvi.c
 create mode 100644 src/libdvi/dvi.h
 create mode 100644 src/libdvi/dvi_config_defs.h
 create mode 100644 src/libdvi/dvi_serialiser.c
 create mode 100644 src/libdvi/dvi_serialiser.h
 create mode 100644 src/libdvi/dvi_serialiser.pio
 create mode 100644 src/libdvi/dvi_serialiser.pio.h
 create mode 100644 src/libdvi/dvi_timing.c
 create mode 100644 src/libdvi/dvi_timing.h
 create mode 100644 src/libdvi/tmds_encode.S
 create mode 100644 src/libdvi/tmds_encode.c
 create mode 100644 src/libdvi/tmds_encode.h
 create mode 100644 src/libdvi/tmds_encode_1bpp.pio
 create mode 100644 src/libdvi/tmds_table.h
 create mode 100644 src/libdvi/tmds_table_fullres.h
 create mode 100755 src/libdvi/tmds_table_gen.py
 create mode 100644 src/libdvi/util_queue_u32_inline.h

diff --git a/Readme.md b/Readme.md
index 4193f5b..b57676c 100644
--- a/Readme.md
+++ b/Readme.md
@@ -19,8 +19,7 @@ RP2040 core).
 Changes vs main PicoDVI repo:
 - Add library.properties file, src and examples directories per Arduino
 requirements.
-- software/libdvi is soft-linked into src so Arduino IDE can compile these
-parts.
+- src/libdvi is a full copy of software/libdvi (it was originally a soft link, but Arduino Library Manager does not accept that). If the original PicoDVI software/libdvi directory is updated, copy the changes here!
 - The file dvi_serialiser.pio.h, normally not part of the distribution and
 generated during the Pico SDK build process, is provided here for Arduino
 build to work. If any changes are made in dvi_serialiser.pio (either here
diff --git a/src/libdvi b/src/libdvi
deleted file mode 120000
index b457413..0000000
--- a/src/libdvi
+++ /dev/null
@@ -1 +0,0 @@
-../software/libdvi
\ No newline at end of file
diff --git a/src/libdvi/CMakeLists.txt b/src/libdvi/CMakeLists.txt
new file mode 100644
index 0000000..7c52661
--- /dev/null
+++ b/src/libdvi/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Note we are using INTERFACE so that the library can be configured per-app
+# with compile-time defines
+
+add_library(libdvi INTERFACE)
+
+target_sources(libdvi INTERFACE
+	${CMAKE_CURRENT_LIST_DIR}/dvi.c
+	${CMAKE_CURRENT_LIST_DIR}/dvi.h
+	${CMAKE_CURRENT_LIST_DIR}/dvi_config_defs.h
+	${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.c
+	${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.h
+	${CMAKE_CURRENT_LIST_DIR}/dvi_timing.c
+	${CMAKE_CURRENT_LIST_DIR}/dvi_timing.h
+	${CMAKE_CURRENT_LIST_DIR}/tmds_encode.S
+	${CMAKE_CURRENT_LIST_DIR}/tmds_encode.c
+	${CMAKE_CURRENT_LIST_DIR}/tmds_encode.h
+	${CMAKE_CURRENT_LIST_DIR}/tmds_table.h
+	${CMAKE_CURRENT_LIST_DIR}/tmds_table_fullres.h
+	${CMAKE_CURRENT_LIST_DIR}/util_queue_u32_inline.h
+	) # The two .pio files are not listed here; they are passed to pico_generate_pio_header() below
+
+target_include_directories(libdvi INTERFACE ${CMAKE_CURRENT_LIST_DIR})
+target_link_libraries(libdvi INTERFACE
+	pico_base_headers
+	pico_util
+	hardware_dma
+	hardware_interp
+	hardware_pio
+	hardware_pwm
+	)
+
+pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.pio) # A pre-generated dvi_serialiser.pio.h is also kept in this directory for builds that skip this step
+pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/tmds_encode_1bpp.pio)
diff --git a/src/libdvi/dvi.c b/src/libdvi/dvi.c
new file mode 100644
index 0000000..07ff5b6
--- /dev/null
+++ b/src/libdvi/dvi.c
@@ -0,0 +1,255 @@
+#include <stdlib.h>
+#include "hardware/dma.h"
+#include "hardware/irq.h"
+
+#include "dvi.h"
+#include "dvi_timing.h"
+#include "dvi_serialiser.h"
+#include "tmds_encode.h"
+
+// Adafruit PicoDVI fork: these two settings are run-time variables, each starting at its compile-time default.
+uint8_t dvi_vertical_repeat = DVI_VERTICAL_REPEAT; // Used as a modulus on v_ctr in the DMA IRQ handler, so it must stay non-zero
+bool    dvi_monochrome_tmds = DVI_MONOCHROME_TMDS; // Chooses the 1x vs 3x TMDS buffer size in dvi_init(); also declared extern in dvi_timing.c
+
+// Time-critical functions pulled into RAM but each in a unique section to
+// allow garbage collection
+#define __dvi_func(f) __not_in_flash_func(f)
+#define __dvi_func_x(f) __scratch_x(__STRING(f)) f // Same idea, but the section passed to __scratch_x is named after f
+
+// We require exclusive use of a DMA IRQ line. (you wouldn't want to share
+// anyway). It's possible in theory to hook both IRQs and have two DVI outs.
+static struct dvi_inst *dma_irq_privdata[2]; // [0] is read by dvi_dma0_irq, [1] by dvi_dma1_irq; written in dvi_register_irqs_this_core()
+static void dvi_dma0_irq(); // Forward declarations: both handlers are defined at the end of this file
+static void dvi_dma1_irq();
+
+void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue) { // Sets up timing state, serialiser, DMA channels, queues, DMA lists and TMDS buffers for inst
+	dvi_timing_state_init(&inst->timing_state);
+	dvi_serialiser_init(&inst->ser_cfg); // Reads inst->ser_cfg, so the caller must have populated it before calling dvi_init()
+	for (int i = 0; i < N_TMDS_LANES; ++i) {
+		inst->dma_cfg[i].chan_ctrl = dma_claim_unused_channel(true); // Each lane gets a control channel and a data channel
+		inst->dma_cfg[i].chan_data = dma_claim_unused_channel(true);
+		inst->dma_cfg[i].tx_fifo = (void*)&inst->ser_cfg.pio->txf[inst->ser_cfg.sm_tmds[i]]; // TX FIFO register of this lane's state machine
+		inst->dma_cfg[i].dreq = pio_get_dreq(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i], true);
+	}
+	inst->late_scanline_ctr = 0;
+	inst->tmds_buf_release_next = NULL;
+	inst->tmds_buf_release = NULL;
+	queue_init_with_spinlock(&inst->q_tmds_valid,   sizeof(void*),  8, spinlock_tmds_queue); // All four queues carry pointer-sized elements; each valid/free pair shares one spinlock number
+	queue_init_with_spinlock(&inst->q_tmds_free,    sizeof(void*),  8, spinlock_tmds_queue);
+	queue_init_with_spinlock(&inst->q_colour_valid, sizeof(void*),  8, spinlock_colour_queue);
+	queue_init_with_spinlock(&inst->q_colour_free,  sizeof(void*),  8, spinlock_colour_queue);
+
+	dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, true, &inst->dma_list_vblank_sync); // true/false builds the list stored as the _sync / _nosync variant
+	dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, false, &inst->dma_list_vblank_nosync);
+#if defined(ARDUINO) // The two branches differ only in the pointer cast applied to SRAM_BASE
+	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (uint32_t*)SRAM_BASE, &inst->dma_list_active);
+#else
+	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (void*)SRAM_BASE, &inst->dma_list_active); // SRAM_BASE is a placeholder: the IRQ handler calls dvi_update_scanline_data_dma() with the real buffer before loading this list
+#endif
+	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, NULL, &inst->dma_list_error); // List the IRQ handler loads when no TMDS buffer is ready
+
+	for (int i = 0; i < DVI_N_TMDS_BUFFERS; ++i) { // Allocates nothing when DVI_N_TMDS_BUFFERS is 0
+		void *tmdsbuf;
+		if (dvi_monochrome_tmds)
+			tmdsbuf = malloc(inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t)); // One channel's worth of words
+		else
+			tmdsbuf = malloc(3 * inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t)); // Three channels' worth of words
+		if (!tmdsbuf)
+			panic("TMDS buffer allocation failed");
+		queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf); // The queue element is the buffer pointer itself
+	}
+}
+
+// The IRQs will run on whichever core calls this function (this is why it's
+// called separately from dvi_init). irq_num: DMA_IRQ_0, otherwise handled as DMA_IRQ_1.
+void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num) {
+	uint32_t mask_sync_channel = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data; // Bit for the sync lane's data channel
+	uint32_t mask_all_channels = 0; // Bits for every control and data channel claimed in dvi_init()
+	for (int i = 0; i < N_TMDS_LANES; ++i)
+		mask_all_channels |= 1u << inst->dma_cfg[i].chan_ctrl | 1u << inst->dma_cfg[i].chan_data;
+
+	dma_hw->ints0 = mask_sync_channel; // Written for either irq_num; NOTE(review): presumably clears a stale pending interrupt -- confirm against the RP2040 datasheet
+	if (irq_num == DMA_IRQ_0) {
+		hw_write_masked(&dma_hw->inte0, mask_sync_channel, mask_all_channels); // Among our channels, only the sync lane's data channel is written as enabled
+		dma_irq_privdata[0] = inst;
+		irq_set_exclusive_handler(DMA_IRQ_0, dvi_dma0_irq);
+	}
+	else { // No check that irq_num is actually DMA_IRQ_1
+		hw_write_masked(&dma_hw->inte1, mask_sync_channel, mask_all_channels);
+		dma_irq_privdata[1] = inst;
+		irq_set_exclusive_handler(DMA_IRQ_1, dvi_dma1_irq);
+	}
+	irq_set_enabled(irq_num, true); // Uses irq_num as passed by the caller
+}
+
+// Set up control channels to make transfers to data channels' control
+// registers (but don't trigger the control channels -- this is done either by
+// data channel CHAIN_TO or an initial write to MULTI_CHAN_TRIGGER)
+static inline void __attribute__((always_inline)) _dvi_load_dma_op(const struct dvi_lane_dma_cfg dma_cfg[], struct dvi_scanline_dma_list *l) {
+	for (int i = 0; i < N_TMDS_LANES; ++i) { // One control channel per lane
+		dma_channel_config cfg = dma_channel_get_default_config(dma_cfg[i].chan_ctrl);
+		channel_config_set_ring(&cfg, true, 4); // 16-byte write wrap
+		channel_config_set_read_increment(&cfg, true);
+		channel_config_set_write_increment(&cfg, true);
+		dma_channel_configure(
+			dma_cfg[i].chan_ctrl,
+			&cfg,
+			&dma_hw->ch[dma_cfg[i].chan_data], // Destination: the register block of this lane's data channel
+			dvi_lane_from_list(l, i), // Source: lane i's entries within list l
+			4, // Configure all 4 registers then halt until next CHAIN_TO
+			false // Do not start the control channel here
+		);
+	}
+}
+
+// Setup first set of control block lists, configure the control channels, and
+// trigger them. Control channels will subsequently be triggered only by DMA
+// CHAIN_TO on data channel completion. IRQ handler *must* be prepared before
+// calling this (see dvi_register_irqs_this_core(), which accepts either DMA IRQ).
+void dvi_start(struct dvi_inst *inst) {
+	_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync); // The first list output is the vblank (no-sync) one
+	dma_start_channel_mask(
+		(1u << inst->dma_cfg[0].chan_ctrl) | // Lanes 0..2 are spelled out here rather than looping to N_TMDS_LANES
+		(1u << inst->dma_cfg[1].chan_ctrl) |
+		(1u << inst->dma_cfg[2].chan_ctrl));
+
+	// We really don't want the FIFOs to bottom out, so wait for full before
+	// starting the shift-out.
+	for (int i = 0; i < N_TMDS_LANES; ++i)
+		while (!pio_sm_is_tx_fifo_full(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i]))
+			tight_loop_contents();
+	dvi_serialiser_enable(&inst->ser_cfg, true); // Enables the lane state machines and the PWM pixel clock
+}
+
+static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *inst, uint32_t *scanbuf) { // Encodes one 8bpp scanline into a free TMDS buffer and queues it as valid
+	uint32_t *tmdsbuf;
+	queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf); // Take a buffer from the free queue
+	uint pixwidth = inst->timing->h_active_pixels;
+	uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD; // tmdsbuf is written as three consecutive planes of this many words
+	// Scanline buffers are half-resolution; the functions take the number of *input* pixels as parameter.
+	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_8BPP_BLUE_MSB,  DVI_8BPP_BLUE_LSB ); // Plane 0: blue
+	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_8BPP_GREEN_MSB, DVI_8BPP_GREEN_LSB); // Plane 1: green
+	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_8BPP_RED_MSB,   DVI_8BPP_RED_LSB  ); // Plane 2: red; needs the 3x (non-monochrome) buffer size from dvi_init()
+	queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf); // Hand the encoded line to the DMA IRQ handler
+}
+
+static inline void __dvi_func_x(_dvi_prepare_scanline_16bpp)(struct dvi_inst *inst, uint32_t *scanbuf) { // 16bpp counterpart of _dvi_prepare_scanline_8bpp: same flow, different encoder and bit fields
+	uint32_t *tmdsbuf;
+	queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf); // Take a buffer from the free queue
+	uint pixwidth = inst->timing->h_active_pixels;
+	uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD; // tmdsbuf is written as three consecutive planes of this many words
+	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_16BPP_BLUE_MSB,  DVI_16BPP_BLUE_LSB ); // Plane 0: blue; pixwidth / 2 is passed as the pixel count, as in the 8bpp version
+	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_16BPP_GREEN_MSB, DVI_16BPP_GREEN_LSB); // Plane 1: green
+	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_16BPP_RED_MSB,   DVI_16BPP_RED_LSB  ); // Plane 2: red; needs the 3x (non-monochrome) buffer size from dvi_init()
+	queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf); // Hand the encoded line to the DMA IRQ handler
+}
+
+// "Worker threads" for TMDS encoding (core enters and never returns, but still handles IRQs)
+
+// Version where each record in q_colour_valid is one scanline (8bpp). Never returns.
+void __dvi_func(dvi_scanbuf_main_8bpp)(struct dvi_inst *inst) {
+	uint y = 0; // Line counter; incremented and wrapped below but not otherwise read in this function
+	while (1) {
+		uint32_t *scanbuf;
+		queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf); // Wait for the next rendered scanline
+		_dvi_prepare_scanline_8bpp(inst, scanbuf);
+		queue_add_blocking_u32(&inst->q_colour_free, &scanbuf); // Return the colour buffer to its producer
+		++y;
+		if (y == inst->timing->v_active_lines) {
+			y = 0;
+		}
+	}
+	__builtin_unreachable();
+}
+
+// Ugh copy/paste but it lets us garbage collect the TMDS stuff that is not being used from .scratch_x. Never returns.
+void __dvi_func(dvi_scanbuf_main_16bpp)(struct dvi_inst *inst) {
+	uint y = 0; // Line counter; incremented and wrapped below but not otherwise read in this function
+	while (1) {
+		uint32_t *scanbuf;
+		queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf); // Wait for the next rendered scanline
+		_dvi_prepare_scanline_16bpp(inst, scanbuf);
+		queue_add_blocking_u32(&inst->q_colour_free, &scanbuf); // Return the colour buffer to its producer
+		++y;
+		if (y == inst->timing->v_active_lines) {
+			y = 0;
+		}
+	}
+	__builtin_unreachable();
+}
+
+static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) { // Shared body of dvi_dma0_irq and dvi_dma1_irq
+	// Every fourth interrupt marks the start of the horizontal active region. We
+	// now have until the end of this region to generate DMA blocklist for next
+	// scanline.
+	dvi_timing_state_advance(inst->timing, &inst->timing_state);
+	if (inst->tmds_buf_release && !queue_try_add_u32(&inst->q_tmds_free, &inst->tmds_buf_release)) // Free the buffer that was staged one IRQ earlier
+		panic("TMDS free queue full in IRQ!");
+	inst->tmds_buf_release = inst->tmds_buf_release_next; // Advance the two-stage release pipeline
+	inst->tmds_buf_release_next = NULL;
+
+	// Make sure all three channels have definitely loaded their last block
+	// (should be within a few cycles of one another)
+	for (int i = 0; i < N_TMDS_LANES; ++i) {
+		while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD) // Spin until the data channel's debug TCR equals one active line's word count
+			tight_loop_contents();
+	}
+
+	uint32_t *tmdsbuf;
+	while (inst->late_scanline_ctr > 0 && queue_try_remove_u32(&inst->q_tmds_valid, &tmdsbuf)) { // Drop one queued buffer per scanline that was previously missed
+		// If we displayed this buffer then it would be in the wrong vertical
+		// position on-screen. Just pass it back.
+		queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf); // NOTE(review): blocking add in IRQ context -- relies on q_tmds_free having room; confirm
+		--inst->late_scanline_ctr;
+	}
+
+	if (inst->timing_state.v_state != DVI_STATE_ACTIVE) {
+		// Don't care
+		tmdsbuf = NULL;
+	}
+	else if (queue_try_peek_u32(&inst->q_tmds_valid, &tmdsbuf)) {
+		if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) { // Last repeat of this line: dequeue for real; earlier repeats only peek
+			queue_remove_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
+			inst->tmds_buf_release_next = tmdsbuf; // Stage the buffer for release on a later IRQ
+		}
+	}
+	else {
+		// No valid scanline was ready (generates solid red scanline)
+		tmdsbuf = NULL;
+		if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) // Count the miss once per source line, not once per repeat
+			++inst->late_scanline_ctr;
+	}
+
+	switch (inst->timing_state.v_state) {
+		case DVI_STATE_ACTIVE:
+			if (tmdsbuf) {
+				dvi_update_scanline_data_dma(inst->timing, tmdsbuf, &inst->dma_list_active); // Point the active list at this buffer
+				_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_active);
+			}
+			else {
+				_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_error); // No buffer available: use the list built with a NULL buffer in dvi_init()
+			}
+			if (inst->scanline_callback && inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) { // Optional user hook, called on the last repeat of each active line
+				inst->scanline_callback();
+			}
+			break;
+		case DVI_STATE_SYNC:
+			_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_sync);
+			break;
+		default: // Every remaining v_state value gets the vblank list without sync
+			_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);
+			break;
+	}
+}
+
+static void __dvi_func(dvi_dma0_irq)() { // Handler installed on DMA_IRQ_0 by dvi_register_irqs_this_core()
+	struct dvi_inst *inst = dma_irq_privdata[0];
+	dma_hw->ints0 = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data; // Write the sync lane data channel's bit to INTS0 before doing the work
+	dvi_dma_irq_handler(inst);
+}
+
+static void __dvi_func(dvi_dma1_irq)() { // Handler installed on DMA_IRQ_1 by dvi_register_irqs_this_core()
+	struct dvi_inst *inst = dma_irq_privdata[1];
+	dma_hw->ints1 = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data; // Write the sync lane data channel's bit to INTS1 before doing the work
+	dvi_dma_irq_handler(inst);
+}
diff --git a/src/libdvi/dvi.h b/src/libdvi/dvi.h
new file mode 100644
index 0000000..ee4a1a1
--- /dev/null
+++ b/src/libdvi/dvi.h
@@ -0,0 +1,81 @@
+#ifndef _DVI_H
+#define _DVI_H
+
+#define N_TMDS_LANES 3
+#define TMDS_SYNC_LANE 0 // blue!
+
+#include "pico/util/queue.h"
+
+#include "dvi_config_defs.h"
+#include "dvi_timing.h"
+#include "dvi_serialiser.h"
+#include "util_queue_u32_inline.h"
+
+typedef void (*dvi_callback_t)(void);
+
+struct dvi_inst {
+	// Config --- dvi_init() reads timing and ser_cfg, and fills in dma_cfg and timing_state
+	const struct dvi_timing *timing;
+	struct dvi_lane_dma_cfg dma_cfg[N_TMDS_LANES];
+	struct dvi_timing_state timing_state;
+	struct dvi_serialiser_cfg ser_cfg;
+	// Called in the DMA IRQ on the last vertical repeat of each active scanline -- careful with the run time!
+	dvi_callback_t scanline_callback; // Checked for NULL before the call; dvi_init() does not assign it
+
+	// State ---
+	struct dvi_scanline_dma_list dma_list_vblank_sync;
+	struct dvi_scanline_dma_list dma_list_vblank_nosync;
+	struct dvi_scanline_dma_list dma_list_active; // Updated with the current TMDS buffer before each active line is loaded
+	struct dvi_scanline_dma_list dma_list_error; // Loaded for an active line when no TMDS buffer is ready
+
+	// After a TMDS buffer has been enqueued via a control block for the last
+	// time, two IRQs must go by before freeing. The first indicates the control
+	// block for this buf has been loaded, and the second occurs some time after
+	// the actual data DMA transfer has completed.
+	uint32_t *tmds_buf_release_next;
+	uint32_t *tmds_buf_release;
+	// Remember how far behind the source is on TMDS scanlines, so we can output
+	// solid colour until they catch up (rather than dying spectacularly)
+	uint late_scanline_ctr;
+
+	// Encoded scanlines:
+	queue_t q_tmds_valid;
+	queue_t q_tmds_free;
+
+	// Either scanline buffers or frame buffers:
+	queue_t q_colour_valid;
+	queue_t q_colour_free;
+
+};
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+// Set up data structures and hardware for DVI. inst->timing and inst->ser_cfg must be filled in first; TMDS buffers are malloc()ed here.
+void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue);
+
+// Call this after calling dvi_init(). DVI DMA interrupts will be routed to
+// whichever core called this function. Registers an exclusive IRQ handler on DMA_IRQ_0 or DMA_IRQ_1.
+void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num);
+
+// Start actually wiggling TMDS pairs. Call this once you have initialised the
+// DVI, have registered the IRQs, and are producing rendered scanlines.
+void dvi_start(struct dvi_inst *inst);
+
+// TMDS encode worker function: core enters and doesn't leave, but still
+// responds to IRQs. Repeatedly pop a scanline buffer from q_colour_valid,
+// TMDS encode it, pass it to the tmds valid queue, and return the scanline buffer via q_colour_free.
+void dvi_scanbuf_main_8bpp(struct dvi_inst *inst);
+void dvi_scanbuf_main_16bpp(struct dvi_inst *inst);
+
+// Same as above, but each q_colour_valid entry is a framebuffer. NOTE(review): dvi.c as added by this patch defines only the dvi_scanbuf_main_* workers -- confirm these are defined elsewhere.
+void dvi_framebuf_main_8bpp(struct dvi_inst *inst);
+void dvi_framebuf_main_16bpp(struct dvi_inst *inst);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/libdvi/dvi_config_defs.h b/src/libdvi/dvi_config_defs.h
new file mode 100644
index 0000000..66c1e58
--- /dev/null
+++ b/src/libdvi/dvi_config_defs.h
@@ -0,0 +1,151 @@
+#ifndef _DVI_CONFIG_DEFS_H
+#define _DVI_CONFIG_DEFS_H
+
+// Compile-time configuration definitions for libdvi. This file provides
+// defaults -- you can override using a board header, or setting compile
+// definitions directly from the commandline (e.g. using CMake
+// target_compile_definitions())
+
+// Pull in base headers to make sure board definitions override the
+// definitions provided here. Note this file is included in asm and C.
+#include "hardware/platform_defs.h"
+#include "pico/config.h"
+
+// ----------------------------------------------------------------------------
+// General DVI defines
+
+// How many times to output the same TMDS buffer before recycling it onto the
+// free queue. Pixels are repeated vertically if this is >1.
+#ifndef DVI_VERTICAL_REPEAT
+#define DVI_VERTICAL_REPEAT 2
+#endif
+
+// Number of TMDS buffers to allocate (malloc()) in DVI init. You can set this
+// to 0 if you want to allocate your own (e.g. if you want static buffers)
+#ifndef DVI_N_TMDS_BUFFERS
+#define DVI_N_TMDS_BUFFERS 3
+#endif
+
+// If 1, replace the DVI serialiser with a 10n1 UART (1 start bit, 10 data
+// bits, 1 stop bit) so the stream can be dumped and analysed easily.
+#ifndef DVI_SERIAL_DEBUG
+#define DVI_SERIAL_DEBUG 0
+#endif
+
+// If 1, the same TMDS symbols are sent to all 3 lanes during the horizontal
+// active period. This means only monochrome colour is available, but the TMDS
+// buffers are 3 times smaller as a result, and the performance requirements
+// for encode are also cut by 3.
+#ifndef DVI_MONOCHROME_TMDS
+#define DVI_MONOCHROME_TMDS 0
+#endif
+
+// By default, we assume each 32-bit word written to a PIO FIFO contains 2x
+// 10-bit TMDS symbols, concatenated into the lower 20 bits, least-significant
+// first. This is convenient if you are generating two or more pixels at once,
+// e.g. using the pixel-doubling TMDS encode. You can change this value to 1
+// (so each word contains 1 symbol) for e.g. full resolution RGB encode. Note
+// that this value needs to divide the DVI horizontal timings, so is limited
+// to 1 or 2.
+#ifndef DVI_SYMBOLS_PER_WORD
+#define DVI_SYMBOLS_PER_WORD 2
+#endif
+
+#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
+#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
+#endif
+
+// ----------------------------------------------------------------------------
+// Pixel component layout
+
+// By default we go R, G, B from MSB -> LSB. Override to e.g. swap RGB <-> BGR
+
+// Default 8bpp layout: RGB332, {r[2:0], g[2:0], b[1:0]}
+
+#ifndef DVI_8BPP_RED_MSB
+#define DVI_8BPP_RED_MSB 7
+#endif
+
+#ifndef DVI_8BPP_RED_LSB
+#define DVI_8BPP_RED_LSB 5
+#endif
+
+#ifndef DVI_8BPP_GREEN_MSB
+#define DVI_8BPP_GREEN_MSB 4
+#endif
+
+#ifndef DVI_8BPP_GREEN_LSB
+#define DVI_8BPP_GREEN_LSB 2
+#endif
+
+#ifndef DVI_8BPP_BLUE_MSB
+#define DVI_8BPP_BLUE_MSB 1
+#endif
+
+#ifndef DVI_8BPP_BLUE_LSB
+#define DVI_8BPP_BLUE_LSB 0
+#endif
+
+// Default 16bpp layout: RGB565, {r[4:0], g[5:0], b[4:0]}
+
+#ifndef DVI_16BPP_RED_MSB
+#define DVI_16BPP_RED_MSB 15
+#endif
+
+#ifndef DVI_16BPP_RED_LSB
+#define DVI_16BPP_RED_LSB 11
+#endif
+
+#ifndef DVI_16BPP_GREEN_MSB
+#define DVI_16BPP_GREEN_MSB 10
+#endif
+
+#ifndef DVI_16BPP_GREEN_LSB
+#define DVI_16BPP_GREEN_LSB 5
+#endif
+
+#ifndef DVI_16BPP_BLUE_MSB
+#define DVI_16BPP_BLUE_MSB 4
+#endif
+
+#ifndef DVI_16BPP_BLUE_LSB
+#define DVI_16BPP_BLUE_LSB 0
+#endif
+
+// Default 1bpp layout: bitwise little-endian, i.e. least significant bit of
+// each word is the first (leftmost) of a block of 32 pixels.
+
+// If 1, reverse the order of pixels within each byte. Order of bytes within
+// each word is still little-endian.
+#ifndef DVI_1BPP_BIT_REVERSE
+#define DVI_1BPP_BIT_REVERSE 1 // Adafruit_GFX GFXcanvas1 requires this 1
+#endif
+
+// ----------------------------------------------------------------------------
+// TMDS encode controls
+
+// Number of TMDS loop bodies between branches. cmp + branch costs 3 cycles,
+// so you can easily save 10% of encode time by bumping this. Note that body
+// will *already* produce multiple pixels, and total symbols per iteration
+// must cleanly divide symbols per scanline, else the loop won't terminate.
+// Point gun away from foot.
+#ifndef TMDS_ENCODE_UNROLL
+#define TMDS_ENCODE_UNROLL 1
+#endif
+
+// If 1, don't save/restore the interpolators on full-resolution TMDS encode.
+// Speed hack. The TMDS code uses both interpolators, for each of the 3 data
+// channels, so this define avoids 6 save/restores per scanline.
+#ifndef TMDS_FULLRES_NO_INTERP_SAVE
+#define TMDS_FULLRES_NO_INTERP_SAVE 0
+#endif
+
+// If 1, don't DC-balance the output of full resolution encode. Hilariously
+// noncompliant, but Dell Ultrasharp -- the honey badger of computer monitors
+// -- does not seem to mind (it helps that we DC-couple). Another speed hack,
+// useful when you are trying to get everything else up to speed.
+#ifndef TMDS_FULLRES_NO_DC_BALANCE
+#define TMDS_FULLRES_NO_DC_BALANCE 0
+#endif
+
+#endif
diff --git a/src/libdvi/dvi_serialiser.c b/src/libdvi/dvi_serialiser.c
new file mode 100644
index 0000000..308f23f
--- /dev/null
+++ b/src/libdvi/dvi_serialiser.c
@@ -0,0 +1,73 @@
+#include "pico.h"
+#include "hardware/pio.h"
+#include "hardware/gpio.h"
+#include "hardware/pwm.h"
+#include "hardware/structs/padsbank0.h"
+
+#include "dvi.h"
+#include "dvi_serialiser.h"
+#include "dvi_serialiser.pio.h"
+
+static void dvi_configure_pad(uint gpio, bool invert) { // Applies pad settings and optional output inversion to one GPIO
+	// 2 mA drive, enable slew rate limiting (this seems fine even at 720p30, and
+	// the 3V3 LDO doesn't get warm like when turning all the GPIOs up to 11).
+	// Also disable digital receiver.
+	hw_write_masked(
+		&padsbank0_hw->io[gpio],
+		(0 << PADS_BANK0_GPIO0_DRIVE_LSB), // Value: every field covered by the mask below is written as 0
+		PADS_BANK0_GPIO0_DRIVE_BITS | PADS_BANK0_GPIO0_SLEWFAST_BITS | PADS_BANK0_GPIO0_IE_BITS // Mask: drive strength, fast-slew and input-enable fields
+	);
+	gpio_set_outover(gpio, invert ? GPIO_OVERRIDE_INVERT : GPIO_OVERRIDE_NORMAL); // Invert the pad output when requested
+}
+
+void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg) { // Loads the PIO program, sets up one state machine per TMDS lane and the PWM pixel clock; nothing is started here
+#if DVI_SERIAL_DEBUG
+	uint offset = pio_add_program(cfg->pio, &dvi_serialiser_debug_program);
+#else
+	uint offset = pio_add_program(cfg->pio, &dvi_serialiser_program); // This program declares .origin 0, so offset is expected to be 0
+#endif
+	cfg->prog_offs = offset; // Recorded in the config for the caller
+
+	for (int i = 0; i < N_TMDS_LANES; ++i) {
+		pio_sm_claim(cfg->pio, cfg->sm_tmds[i]);
+		dvi_serialiser_program_init(
+			cfg->pio,
+			cfg->sm_tmds[i],
+			offset,
+			cfg->pins_tmds[i], // First pin of the lane's pair; the second is pins_tmds[i] + 1
+			DVI_SERIAL_DEBUG
+		);
+		dvi_configure_pad(cfg->pins_tmds[i], cfg->invert_diffpairs);
+		dvi_configure_pad(cfg->pins_tmds[i] + 1, cfg->invert_diffpairs);
+	}
+
+	// Use a PWM slice to drive the pixel clock. Both GPIOs must be on the same
+	// slice (lower-numbered GPIO must be even).
+	assert(cfg->pins_clk % 2 == 0);
+	uint slice = pwm_gpio_to_slice_num(cfg->pins_clk);
+	// 5 cycles high, 5 low. Invert one channel so that we get complementary outputs.
+	pwm_config pwm_cfg = pwm_get_default_config();
+	pwm_config_set_output_polarity(&pwm_cfg, true, false);
+	pwm_config_set_wrap(&pwm_cfg, 9); // Counter wraps at 9, i.e. a 10-count period
+	pwm_init(slice, &pwm_cfg, false); // Configured only; dvi_serialiser_enable() turns the slice on
+	pwm_set_both_levels(slice, 5, 5);
+
+	for (uint i = cfg->pins_clk; i <= cfg->pins_clk + 1; ++i) { // Both pins of the clock pair
+		gpio_set_function(i, GPIO_FUNC_PWM);
+		dvi_configure_pad(i, cfg->invert_diffpairs);
+	}
+}
+
+void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable) { // Starts or stops all lane state machines together with the PWM pixel clock
+	uint mask = 0; // Enable bits of every lane's state machine in the PIO CTRL register
+	for (int i = 0; i < N_TMDS_LANES; ++i)
+		mask |= 1u << (cfg->sm_tmds[i] + PIO_CTRL_SM_ENABLE_LSB);
+	if (enable) {
+		hw_set_bits(&cfg->pio->ctrl, mask); // One register write covers all lanes
+		pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), true);
+	}
+	else {
+		hw_clear_bits(&cfg->pio->ctrl, mask);
+		pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), false);
+	}
+}
diff --git a/src/libdvi/dvi_serialiser.h b/src/libdvi/dvi_serialiser.h
new file mode 100644
index 0000000..d978f60
--- /dev/null
+++ b/src/libdvi/dvi_serialiser.h
@@ -0,0 +1,22 @@
+#ifndef _DVI_SERIALISER_H
+#define _DVI_SERIALISER_H
+
+#include "hardware/pio.h"
+#include "dvi_config_defs.h"
+
+#define N_TMDS_LANES 3
+
+struct dvi_serialiser_cfg { // All fields except prog_offs are inputs to dvi_serialiser_init()
+	PIO pio; // PIO instance that runs the serialiser program
+	uint sm_tmds[N_TMDS_LANES]; // State machine index for each TMDS lane
+	uint pins_tmds[N_TMDS_LANES]; // First GPIO of each lane's pair; the pair is this pin and the next one up
+	uint pins_clk; // First GPIO of the clock pair; asserted to be even in dvi_serialiser_init()
+	bool invert_diffpairs; // When true, every data and clock pad gets GPIO_OVERRIDE_INVERT
+	uint prog_offs; // Written by dvi_serialiser_init(): offset returned by pio_add_program()
+};
+
+void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg); // Load the program and configure state machines, pads and PWM clock (nothing is enabled)
+void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable); // Start or stop the lane state machines and the pixel clock
+uint32_t dvi_single_to_diff(uint32_t in); // NOTE(review): no definition in dvi_serialiser.c as added by this patch -- confirm it exists elsewhere
+
+#endif
diff --git a/src/libdvi/dvi_serialiser.pio b/src/libdvi/dvi_serialiser.pio
new file mode 100644
index 0000000..520c8e0
--- /dev/null
+++ b/src/libdvi/dvi_serialiser.pio
@@ -0,0 +1,53 @@
+.program dvi_serialiser
+.side_set 2
+.origin 0
+
+; Single-ended -> differential serial: each data bit is shifted into PC and so selects instruction 0 or 1 (hence .origin 0)
+
+	out pc, 1    side 0b10 ; Address 0: side-set pair = 0b10
+	out pc, 1    side 0b01 ; Address 1: side-set pair = 0b01, the complement
+
+.program dvi_serialiser_debug
+.side_set 1 opt
+
+; The debug variant behaves as a UART with 1 start bit, 10 data bits, 1 stop
+; bit, and 5/6ths the data throughput of the TMDS version (12 instructions per 10 data bits).
+
+	pull ifempty  side 1 ; Extend stop bit with FIFO stall
+	nop           side 0 ; Start bit
+	out pins, 1          ; Unrolled because we require 1 bit / clk
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	out pins, 1
+	
+% c-sdk {
+#include "dvi_config_defs.h"
+
+static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) { // Configures (but does not start) one state machine for the pin pair data_pins, data_pins + 1
+    pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins); // Initial pair state: 0b10
+    pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins); // Both pins of the pair become outputs
+    pio_gpio_init(pio, data_pins);
+    pio_gpio_init(pio, data_pins + 1);
+
+    pio_sm_config c;
+    if (debug) {
+        c = dvi_serialiser_debug_program_get_default_config(offset);
+    }
+    else {
+        c = dvi_serialiser_program_get_default_config(offset);
+    }
+    sm_config_set_sideset_pins(&c, data_pins);
+    if (debug)
+	    sm_config_set_out_pins(&c, data_pins, 1); // Only the debug program uses "out pins"
+    sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD); // Autopull is off for debug, whose program pulls explicitly; threshold is 10 bits per symbol in a word
+    sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
+    pio_sm_init(pio, sm, offset, &c);
+    pio_sm_set_enabled(pio, sm, false); // Left disabled; dvi_serialiser_enable() starts it later
+}
+%}
diff --git a/src/libdvi/dvi_serialiser.pio.h b/src/libdvi/dvi_serialiser.pio.h
new file mode 100644
index 0000000..d1275fe
--- /dev/null
+++ b/src/libdvi/dvi_serialiser.pio.h
@@ -0,0 +1,101 @@
+// -------------------------------------------------- //
+// This file is autogenerated by pioasm; do not edit! //
+// -------------------------------------------------- //
+
+#pragma once
+
+#if !PICO_NO_HARDWARE
+#include "hardware/pio.h"
+#endif
+
+// -------------- //
+// dvi_serialiser //
+// -------------- //
+
+#define dvi_serialiser_wrap_target 0
+#define dvi_serialiser_wrap 1
+
+static const uint16_t dvi_serialiser_program_instructions[] = {
+            //     .wrap_target
+    0x70a1, //  0: out    pc, 1           side 2     
+    0x68a1, //  1: out    pc, 1           side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program dvi_serialiser_program = {
+    .instructions = dvi_serialiser_program_instructions,
+    .length = 2,
+    .origin = 0,
+};
+
+static inline pio_sm_config dvi_serialiser_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + dvi_serialiser_wrap_target, offset + dvi_serialiser_wrap);
+    sm_config_set_sideset(&c, 2, false, false);
+    return c;
+}
+#endif
+
+// -------------------- //
+// dvi_serialiser_debug //
+// -------------------- //
+
+#define dvi_serialiser_debug_wrap_target 0
+#define dvi_serialiser_debug_wrap 11
+
+static const uint16_t dvi_serialiser_debug_program_instructions[] = {
+            //     .wrap_target
+    0x98e0, //  0: pull   ifempty block   side 1     
+    0xb042, //  1: nop                    side 0     
+    0x6001, //  2: out    pins, 1                    
+    0x6001, //  3: out    pins, 1                    
+    0x6001, //  4: out    pins, 1                    
+    0x6001, //  5: out    pins, 1                    
+    0x6001, //  6: out    pins, 1                    
+    0x6001, //  7: out    pins, 1                    
+    0x6001, //  8: out    pins, 1                    
+    0x6001, //  9: out    pins, 1                    
+    0x6001, // 10: out    pins, 1                    
+    0x6001, // 11: out    pins, 1                    
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program dvi_serialiser_debug_program = {
+    .instructions = dvi_serialiser_debug_program_instructions,
+    .length = 12,
+    .origin = -1,
+};
+
+static inline pio_sm_config dvi_serialiser_debug_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + dvi_serialiser_debug_wrap_target, offset + dvi_serialiser_debug_wrap);
+    sm_config_set_sideset(&c, 2, true, false);
+    return c;
+}
+
+#include "dvi_config_defs.h"
+static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
+    pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
+    pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
+    pio_gpio_init(pio, data_pins);
+    pio_gpio_init(pio, data_pins + 1);
+    pio_sm_config c;
+    if (debug) {
+        c = dvi_serialiser_debug_program_get_default_config(offset);
+    }
+    else {
+        c = dvi_serialiser_program_get_default_config(offset);
+    }
+    sm_config_set_sideset_pins(&c, data_pins);
+    if (debug)
+	    sm_config_set_out_pins(&c, data_pins, 1);
+    sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
+    sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
+    pio_sm_init(pio, sm, offset, &c);
+    pio_sm_set_enabled(pio, sm, false);
+}
+
+#endif
+
diff --git a/src/libdvi/dvi_timing.c b/src/libdvi/dvi_timing.c
new file mode 100644
index 0000000..54ba8e1
--- /dev/null
+++ b/src/libdvi/dvi_timing.c
@@ -0,0 +1,324 @@
+#include "dvi.h"
+#include "dvi_timing.h"
+#include "hardware/dma.h"
+
+// This file contains:
+// - Timing parameters for DVI modes (horizontal + vertical counts, best
+//   achievable bit clock from 12 MHz crystal)
+// - Helper functions for generating DMA lists based on these timings
+
+extern bool dvi_monochrome_tmds; // In dvi.c
+
+// Pull into RAM but apply unique section suffix to allow linker GC
+#define __dvi_func(x) __not_in_flash_func(x)
+#define __dvi_const(x) __not_in_flash_func(x)
+
+// VGA -- we do this mode properly, with a pretty comfortable clk_sys (252 MHz)
+const struct dvi_timing __dvi_const(dvi_timing_640x480p_60hz) = {
+	.h_sync_polarity   = false,
+	.h_front_porch     = 16,
+	.h_sync_width      = 96,
+	.h_back_porch      = 48,
+	.h_active_pixels   = 640, // 16 + 96 + 48 + 640 = 800 pixels per line
+
+	.v_sync_polarity   = false,
+	.v_front_porch     = 10,
+	.v_sync_width      = 2,
+	.v_back_porch      = 33,
+	.v_active_lines    = 480, // 10 + 2 + 33 + 480 = 525 lines per frame
+
+	.bit_clk_khz       = 252000 // 10 bits per pixel -> 25.2 MHz pixel clock, 60.00 Hz refresh
+};
+
+// SVGA -- completely by-the-book but requires 400 MHz clk_sys
+const struct dvi_timing __dvi_const(dvi_timing_800x600p_60hz) = {
+	.h_sync_polarity   = false,
+	.h_front_porch     = 44, // NOTE(review): VESA DMT lists 40 here (1056 total) -- confirm 44 is intentional
+	.h_sync_width      = 128,
+	.h_back_porch      = 88,
+	.h_active_pixels   = 800, // 44 + 128 + 88 + 800 = 1060 pixels per line
+
+	.v_sync_polarity   = false,
+	.v_front_porch     = 1,
+	.v_sync_width      = 4,
+	.v_back_porch      = 23,
+	.v_active_lines    = 600, // 1 + 4 + 23 + 600 = 628 lines per frame
+
+	.bit_clk_khz       = 400000 // 10 bits per pixel -> 40.0 MHz pixel clock, 60.09 Hz refresh
+};
+
+// 800x480p 60 Hz (note this doesn't seem to be a CEA mode, I just used the
+// output of `cvt 800 480 60`), 295 MHz bit clock
+const struct dvi_timing __dvi_const(dvi_timing_800x480p_60hz) = {
+	.h_sync_polarity = false,
+	.h_front_porch   = 24,
+	.h_sync_width    = 72,
+	.h_back_porch    = 96,
+	.h_active_pixels = 800, // 24 + 72 + 96 + 800 = 992 pixels per line
+
+	.v_sync_polarity = true,
+	.v_front_porch   = 3,
+	.v_sync_width    = 10,
+	.v_back_porch    = 7,
+	.v_active_lines  = 480, // 3 + 10 + 7 + 480 = 500 lines per frame
+
+	.bit_clk_khz     = 295200 // 10 bits per pixel -> 29.52 MHz pixel clock, 59.52 Hz refresh
+};
+
+// SVGA reduced blanking (355 MHz bit clock; 354 MHz is used below) -- valid CVT mode, less common
+// than fully-blanked SVGA, but doesn't require such a high system clock
+const struct dvi_timing __dvi_const(dvi_timing_800x600p_reduced_60hz) = {
+	.h_sync_polarity   = true,
+	.h_front_porch     = 48,
+	.h_sync_width      = 32,
+	.h_back_porch      = 80,
+	.h_active_pixels   = 800, // 48 + 32 + 80 + 800 = 960 pixels per line
+
+	.v_sync_polarity   = false,
+	.v_front_porch     = 3,
+	.v_sync_width      = 4,
+	.v_back_porch      = 11,
+	.v_active_lines    = 600, // 3 + 4 + 11 + 600 = 618 lines per frame
+
+	.bit_clk_khz       = 354000 // 10 bits per pixel -> 35.4 MHz pixel clock, 59.67 Hz refresh
+};
+
+// Also known as qHD; a bit uncommon, but it's a nice modest-resolution 16:9
+// aspect mode. Pixel clock 37.3 MHz (37.2 MHz with the bit clock used here)
+const struct dvi_timing __dvi_const(dvi_timing_960x540p_60hz) = {
+	.h_sync_polarity   = true,
+	.h_front_porch     = 16,
+	.h_sync_width      = 32,
+	.h_back_porch      = 96,
+	.h_active_pixels   = 960, // 16 + 32 + 96 + 960 = 1104 pixels per line
+
+	.v_sync_polarity   = true,
+	.v_front_porch     = 2,
+	.v_sync_width      = 6,
+	.v_back_porch      = 15,
+	.v_active_lines    = 540, // 2 + 6 + 15 + 540 = 563 lines per frame
+
+	.bit_clk_khz       = 372000 // 10 bits per pixel -> 37.2 MHz pixel clock, 59.85 Hz refresh
+};
+
+// Note this is NOT the correct 720p30 CEA mode, but rather 720p60 run at half
+// pixel clock. Seems to be commonly accepted (and is a valid CVT mode). The
+// actual CEA mode is the same pixel clock as 720p60 but with >50% blanking,
+// which would require a clk_sys of 742 MHz!
+const struct dvi_timing __dvi_const(dvi_timing_1280x720p_30hz) = {
+	.h_sync_polarity   = true,
+	.h_front_porch     = 110,
+	.h_sync_width      = 40,
+	.h_back_porch      = 220,
+	.h_active_pixels   = 1280, // 110 + 40 + 220 + 1280 = 1650 pixels per line
+
+	.v_sync_polarity   = true,
+	.v_front_porch     = 5,
+	.v_sync_width      = 5,
+	.v_back_porch      = 20,
+	.v_active_lines    = 720, // 5 + 5 + 20 + 720 = 750 lines per frame
+
+	.bit_clk_khz       = 372000 // 10 bits per pixel -> 37.2 MHz pixel clock, 30.06 Hz refresh
+};
+
+// Reduced-blanking (CVT) 720p. You aren't supposed to use reduced blanking
+// modes below 60 Hz, but I won't tell anyone (and it works on the monitors
+// I've tried). This nets a lower system clock than regular 720p30 (319 MHz)
+const struct dvi_timing __dvi_const(dvi_timing_1280x720p_reduced_30hz) = {
+	.h_sync_polarity   = true,
+	.h_front_porch     = 48,
+	.h_sync_width      = 32,
+	.h_back_porch      = 80,
+	.h_active_pixels   = 1280, // 48 + 32 + 80 + 1280 = 1440 pixels per line
+
+	.v_sync_polarity   = false,
+	.v_front_porch     = 3,
+	.v_sync_width      = 5,
+	.v_back_porch      = 13,
+	.v_active_lines    = 720, // 3 + 5 + 13 + 720 = 741 lines per frame
+
+	.bit_clk_khz       = 319200 // 10 bits per pixel -> 31.92 MHz pixel clock, 29.91 Hz refresh
+};
+
+// This requires a spicy 488 MHz system clock and is illegal in most countries
+// (you need to have a very lucky piece of silicon to run this at 1.3 V, or
+// connect an external supply and give it a bit more juice)
+const struct dvi_timing __dvi_const(dvi_timing_1600x900p_reduced_30hz) = {
+	.h_sync_polarity   = true,
+	.h_front_porch     = 48,
+	.h_sync_width      = 32,
+	.h_back_porch      = 80,
+	.h_active_pixels   = 1600, // 48 + 32 + 80 + 1600 = 1760 pixels per line
+
+	.v_sync_polarity   = false,
+	.v_front_porch     = 3,
+	.v_sync_width      = 5,
+	.v_back_porch      = 18,
+	.v_active_lines    = 900, // 3 + 5 + 18 + 900 = 926 lines per frame
+
+	.bit_clk_khz       = 488000 // 10 bits per pixel -> 48.8 MHz pixel clock, 29.94 Hz refresh
+};
+
+// ----------------------------------------------------------------------------
+
+// The DMA scheme is:
+//
+// - One channel transferring data to each of the three PIO state machines
+//   performing TMDS serialisation
+//
+// - One channel programming the registers of each of these data channels,
+//   triggered (CHAIN_TO) each time the corresponding data channel completes
+//
+// - Lanes 1 and 2 have one block for blanking and one for video data
+//
+// - Lane 0 has one block for each horizontal region (front porch, hsync, back
+//   porch, active)
+//
+// - The IRQ_QUIET flag is used to select which data block on the sync lane is
+//   allowed to generate an IRQ upon completion. This is the block immediately
+//   before the horizontal active region. The IRQ is entered at ~the same time
+//   as the last data transfer starts
+//
+// - The IRQ points the control channels at new blocklists for next scanline.
+//   The DMA starts the new list automatically at end-of-scanline, via
+//   CHAIN_TO.
+//
+// The horizontal active region is the longest continuous transfer, so this
+// gives the most time to handle the IRQ and load new blocklists.
+//
+// Note a null trigger IRQ is not suitable because we get that *after* the
+// last data transfer finishes, and the FIFOs bottom out very shortly
+// afterward. For pure DVI (four blocks per scanline), it works ok to take
+// four regular IRQs per scanline and return early from 3 of them, but this
+// breaks down when you have very short scanline sections like guard bands.
+
+// Each symbol appears twice, concatenated in one word. Note these must be in
+// RAM because they see a lot of DMA traffic
+const uint32_t __dvi_const(dvi_ctrl_syms)[4] = { // indexed by (vsync << 1) | hsync, see get_ctrl_sym()
+	0xd5354, // [0] vsync=0 hsync=0: symbol 0x354 in bits 9:0 and again in bits 19:10
+	0x2acab, // [1] vsync=0 hsync=1: symbol 0x0ab, twice
+	0x55154, // [2] vsync=1 hsync=0: symbol 0x154, twice
+	0xaaeab  // [3] vsync=1 hsync=1: symbol 0x2ab, twice
+};
+
+// Output solid red scanline if we are given NULL for tmdsbuf
+#if DVI_SYMBOLS_PER_WORD == 2
+static uint32_t __dvi_const(empty_scanline_tmds)[3] = {
+	0x7fd00u, // 0x00, 0x00
+	0x7fd00u, // 0x00, 0x00
+	0xbfa01u  // 0xfc, 0xfc
+};
+#else
+static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[6] = {
+	0x100u, 0x1ffu, // 0x00, 0x00
+	0x100u, 0x1ffu, // 0x00, 0x00
+	0x201u, 0x2feu  // 0xfc, 0xfc
+};
+#endif
+
+void dvi_timing_state_init(struct dvi_timing_state *t) { // Reset the vertical state machine to the start of a frame
+	t->v_ctr = 0; // no scanlines counted yet in the current vertical region
+	t->v_state = DVI_STATE_FRONT_PORCH; // first state of enum dvi_line_state
+}
+
+void __dvi_func(dvi_timing_state_advance)(const struct dvi_timing *t, struct dvi_timing_state *s) { // Step the vertical state by one scanline
+		s->v_ctr++; // lines completed in the current region, including this one
+		if ((s->v_state == DVI_STATE_FRONT_PORCH && s->v_ctr == t->v_front_porch) || 
+		    (s->v_state == DVI_STATE_SYNC && s->v_ctr == t->v_sync_width) ||
+		    (s->v_state == DVI_STATE_BACK_PORCH && s->v_ctr == t->v_back_porch) ||
+		    (s->v_state == DVI_STATE_ACTIVE && s->v_ctr == t->v_active_lines)) { // compared after the increment, so a region length of 0 never matches
+
+			s->v_state = (s->v_state + 1) % DVI_STATE_COUNT; // regions follow enum order; ACTIVE wraps to FRONT_PORCH
+			s->v_ctr = 0;
+		}
+}
+
+void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list) { // Zero every control block of every lane
+	*dma_list = (struct dvi_scanline_dma_list){0}; // {0}, not {}: an empty initializer list is not ISO C before C23
+}
+
+static const uint32_t *get_ctrl_sym(bool vsync, bool hsync) { // Control-symbol word for the given vsync/hsync levels
+	return dvi_ctrl_syms + ((vsync ? 2 : 0) | (hsync ? 1 : 0)); // table index: bit 1 = vsync, bit 0 = hsync
+}
+
+// Make a sequence of paced transfers to the relevant FIFO
+static void _set_data_cb(dma_cb_t *cb, const struct dvi_lane_dma_cfg *dma_cfg,
+		const void *read_addr, uint transfer_count, uint read_ring, bool irq_on_finish) {
+	cb->read_addr = read_addr;
+	cb->write_addr = dma_cfg->tx_fifo; // every data block of a lane writes to that lane's tx_fifo address
+	cb->transfer_count = transfer_count; // callers pass symbol counts divided by DVI_SYMBOLS_PER_WORD
+	cb->c = dma_channel_get_default_config(dma_cfg->chan_data);
+	channel_config_set_ring(&cb->c, false, read_ring); // wrap the read address every 2^read_ring bytes; 0 = no wrap
+	channel_config_set_dreq(&cb->c, dma_cfg->dreq); // pace transfers with the lane's DREQ
+	// Call back to control channel for reconfiguration:
+	channel_config_set_chain_to(&cb->c, dma_cfg->chan_ctrl);
+	// Note we never send a null trigger, so IRQ_QUIET is an IRQ suppression flag
+	channel_config_set_irq_quiet(&cb->c, !irq_on_finish);
+}
+
+void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
+		bool vsync_asserted, struct dvi_scanline_dma_list *l) {
+	// Fill l with the control blocks for one scanline of vertical blanking: control symbols only, on every lane.
+	bool vsync = t->v_sync_polarity == vsync_asserted; // vsync level for this line: v_sync_polarity when asserted, its inverse otherwise
+	const uint32_t *sym_hsync_off = get_ctrl_sym(vsync, !t->h_sync_polarity);
+	const uint32_t *sym_hsync_on  = get_ctrl_sym(vsync,  t->h_sync_polarity);
+	const uint32_t *sym_no_sync   = get_ctrl_sym(false,  false             );
+
+	dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
+	// The symbol table contains each control symbol *twice*, concatenated into 20 LSBs of table word, so we can always do word-repeat.
+	_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch   / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on,  t->h_sync_width    / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch    / DVI_SYMBOLS_PER_WORD, 2, true); // the only block allowed to raise an IRQ
+	_set_data_cb(&synclist[3], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false); // active region carries the hsync-off symbol during vblank
+
+	for (int i = 0; i < N_TMDS_LANES; ++i) {
+		if (i == TMDS_SYNC_LANE)
+			continue;
+		dma_cb_t *cblist = dvi_lane_from_list(l, i); // non-sync lanes: one blanking block, one active-width block, same symbol in both
+		_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false);
+		_set_data_cb(&cblist[1], &dma_cfg[i], sym_no_sync, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false);
+	}
+}
+
+void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
+		uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
+	// Fill l with the control blocks for one scanline of active video; a NULL tmdsbuf selects the built-in empty_scanline_tmds pattern.
+	const uint32_t *sym_hsync_off = get_ctrl_sym(!t->v_sync_polarity, !t->h_sync_polarity); // vsync is held at its deasserted level on active lines
+	const uint32_t *sym_hsync_on  = get_ctrl_sym(!t->v_sync_polarity,  t->h_sync_polarity);
+	const uint32_t *sym_no_sync   = get_ctrl_sym(false,                false             );
+
+	dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
+	_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on,  t->h_sync_width  / DVI_SYMBOLS_PER_WORD, 2, false);
+	_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch  / DVI_SYMBOLS_PER_WORD, 2, true); // the only block allowed to raise an IRQ
+
+	for (int i = 0; i < N_TMDS_LANES; ++i) {
+		dma_cb_t *cblist = dvi_lane_from_list(l, i);
+		if (i != TMDS_SYNC_LANE) {
+			_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,
+				(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false);
+		}
+		int target_block = i == TMDS_SYNC_LANE ? DVI_SYNC_LANE_CHUNKS - 1 :  DVI_NOSYNC_LANE_CHUNKS - 1; // last block of the lane carries the pixel data
+		if (tmdsbuf) {
+			// Non-repeating DMA for the freshly-encoded TMDS buffer: lane i starts i * (h_active_pixels / DVI_SYMBOLS_PER_WORD) words in.
+			// NOTE(review): dvi_monochrome_tmds is not consulted here, unlike dvi_update_scanline_data_dma -- confirm this is intended.
+			_set_data_cb(&cblist[target_block], &dma_cfg[i], tmdsbuf + i * (t->h_active_pixels / DVI_SYMBOLS_PER_WORD),
+				t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 0, false);
+		}
+		else {
+			// Use read ring to repeat the correct DC-balanced symbol pair on blank scanlines (4 or 8 byte period)
+			_set_data_cb(&cblist[target_block], &dma_cfg[i], &empty_scanline_tmds[2 * i / DVI_SYMBOLS_PER_WORD],
+				t->h_active_pixels / DVI_SYMBOLS_PER_WORD, DVI_SYMBOLS_PER_WORD == 2 ? 2 : 3, false);
+		}
+	}
+}
+
+void __dvi_func(dvi_update_scanline_data_dma)(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
+	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
+		const uint32_t *src = tmdsbuf; // monochrome mode: every lane reads the same symbols
+		if (!dvi_monochrome_tmds)
+			src += lane * t->h_active_pixels / DVI_SYMBOLS_PER_WORD; // otherwise each lane has its own run of words
+		// Repoint only the last control block of the lane (the active-region transfer); the blanking blocks are left alone
+		dvi_lane_from_list(l, lane)[lane == TMDS_SYNC_LANE ? DVI_SYNC_LANE_CHUNKS - 1 : DVI_NOSYNC_LANE_CHUNKS - 1].read_addr = src;
+	}
+}
+
diff --git a/src/libdvi/dvi_timing.h b/src/libdvi/dvi_timing.h
new file mode 100644
index 0000000..bf34937
--- /dev/null
+++ b/src/libdvi/dvi_timing.h
@@ -0,0 +1,99 @@
+#ifndef _DVI_TIMING_H
+#define _DVI_TIMING_H
+
+#include "hardware/dma.h"
+#include "pico/util/queue.h"
+
+#include "dvi.h"
+
+struct dvi_timing { // One display mode: horizontal/vertical geometry plus the TMDS bit clock
+	bool h_sync_polarity; // hsync level while the sync pulse is asserted
+	uint h_front_porch; // horizontal fields are in pixels (one TMDS symbol each)
+	uint h_sync_width;
+	uint h_back_porch;
+	uint h_active_pixels;
+
+	bool v_sync_polarity; // vsync level while the sync pulse is asserted
+	uint v_front_porch; // vertical fields are in scanlines
+	uint v_sync_width;
+	uint v_back_porch;
+	uint v_active_lines;
+
+	uint bit_clk_khz; // TMDS bit clock in kHz
+};
+
+enum dvi_line_state {
+	DVI_STATE_FRONT_PORCH = 0,
+	DVI_STATE_SYNC,
+	DVI_STATE_BACK_PORCH,
+	DVI_STATE_ACTIVE,
+	DVI_STATE_COUNT
+};
+
+struct dvi_timing_state {
+	uint v_ctr;
+	enum dvi_line_state v_state;
+};
+
+// This should map directly to DMA register layout, but more convenient types
+// (also this really shouldn't be here... we don't have a dma_cb in the SDK
+// because there are many valid formats due to aliases)
+typedef struct dma_cb {
+	const void *read_addr;
+	void *write_addr;
+	uint32_t transfer_count;
+	dma_channel_config c;
+} dma_cb_t;
+
+static_assert(sizeof(dma_cb_t) == 4 * sizeof(uint32_t), "bad dma layout");
+static_assert(__builtin_offsetof(dma_cb_t, c.ctrl) == __builtin_offsetof(dma_channel_hw_t, ctrl_trig), "bad dma layout");
+
+#define DVI_SYNC_LANE_CHUNKS DVI_STATE_COUNT
+#define DVI_NOSYNC_LANE_CHUNKS 2
+
+struct dvi_scanline_dma_list {
+	dma_cb_t l0[DVI_SYNC_LANE_CHUNKS];
+	dma_cb_t l1[DVI_NOSYNC_LANE_CHUNKS];
+	dma_cb_t l2[DVI_NOSYNC_LANE_CHUNKS];
+};
+
+static inline dma_cb_t* dvi_lane_from_list(struct dvi_scanline_dma_list *l, int i) { // Control-block array for TMDS lane i
+	return (i != 0 && i != 1) ? l->l2 : i ? l->l1 : l->l0; // any index other than 0 or 1 selects l2
+}
+
+// Each TMDS lane uses one DMA channel to transfer data to a PIO state
+// machine, and another channel to load control blocks into this channel.
+struct dvi_lane_dma_cfg {
+	uint chan_ctrl; // control channel; every data block chains to it on completion
+	uint chan_data; // data channel; control blocks start from its default channel config
+	void *tx_fifo; // write address used by every data block of this lane
+	uint dreq; // DREQ that paces the data channel
+};
+
+// Note these are already converted to pseudo-differential representation
+extern const uint32_t dvi_ctrl_syms[4];
+
+extern const struct dvi_timing dvi_timing_640x480p_60hz;
+extern const struct dvi_timing dvi_timing_800x480p_60hz;
+extern const struct dvi_timing dvi_timing_800x600p_60hz;
+extern const struct dvi_timing dvi_timing_960x540p_60hz;
+extern const struct dvi_timing dvi_timing_1280x720p_30hz;
+
+extern const struct dvi_timing dvi_timing_800x600p_reduced_60hz;
+extern const struct dvi_timing dvi_timing_1280x720p_reduced_30hz;
+extern const struct dvi_timing dvi_timing_1600x900p_reduced_30hz; // defined in dvi_timing.c; was missing from this header
+void dvi_timing_state_init(struct dvi_timing_state *t);
+
+void dvi_timing_state_advance(const struct dvi_timing *t, struct dvi_timing_state *s);
+
+void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list);
+
+void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
+		bool vsync_asserted, struct dvi_scanline_dma_list *l);
+
+void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
+		uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
+
+void dvi_update_scanline_data_dma(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
+
+#endif
diff --git a/src/libdvi/tmds_encode.S b/src/libdvi/tmds_encode.S
new file mode 100644
index 0000000..065061d
--- /dev/null
+++ b/src/libdvi/tmds_encode.S
@@ -0,0 +1,623 @@
+#include "hardware/regs/addressmap.h"
+#include "hardware/regs/sio.h"
+#include "dvi_config_defs.h"
+
+// Offsets suitable for ldr/str (must be <= 0x7c):
+#define ACCUM0_OFFS     (SIO_INTERP0_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
+#define ACCUM1_OFFS     (SIO_INTERP0_ACCUM1_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
+#define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+#define PEEK0_OFFS      (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+#define PEEK1_OFFS      (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
+#define PEEK2_OFFS      (SIO_INTERP0_PEEK_FULL_OFFSET  - SIO_INTERP0_ACCUM0_OFFSET)
+#define INTERP1         (SIO_INTERP1_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
+// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
+// word-addressed space... almost as though it were intentional! :)
+
+.syntax unified
+.cpu cortex-m0plus
+.thumb
+
+.macro decl_func_x name
+.section .scratch_x.\name, "ax"
+.global \name
+.type \name,%function
+.thumb_func
+\name:
+.endm
+
+.macro decl_func_y name
+.section .scratch_y.\name, "ax"
+.global \name
+.type \name,%function
+.thumb_func
+\name:
+.endm
+
+#define decl_func decl_func_x
+
+// ----------------------------------------------------------------------------
+// Pixel-doubling encoders for RGB
+
+// r0: Input buffer (word-aligned)
+// r1: Output buffer (word-aligned)
+// r2: Input size (pixels)
+
+.macro do_channel_16bpp r_ibase r_inout0 r_out1
+	str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
+	ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
+	ldr \r_inout0, [\r_inout0]
+	ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
+	ldr \r_out1, [\r_out1]
+.endm
+
+decl_func tmds_encode_loop_16bpp
+	push {r4, r5, r6, r7, lr}
+	lsls r2, #2 // 4 output bytes per unit of r2
+	add r2, r1
+	mov ip, r2 // ip = end-of-output pointer
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) // r2 now holds the interpolator base
+	b 2f // enter at the loop test, so a count of 0 writes nothing
+.align 2
+1:
+.rept TMDS_ENCODE_UNROLL
+	ldmia r0!, {r4, r6} // two input words per repetition
+	do_channel_16bpp r2, r4, r5
+	do_channel_16bpp r2, r6, r7
+	stmia r1!, {r4, r5, r6, r7} // four output words (16 bytes) per repetition
+.endr
+2:
+	cmp r1, ip
+	bne 1b // exact-match exit: the output size must be a multiple of 16 * TMDS_ENCODE_UNROLL bytes
+	pop {r4, r5, r6, r7, pc}
+
+// Same as above, but scale data to make up for lack of left shift
+// in interpolator (costs 1 cycle per 2 pixels)
+//
+// r0: Input buffer (word-aligned)
+// r1: Output buffer (word-aligned)
+// r2: Input size (pixels)
+// r3: Left shift amount
+
+decl_func tmds_encode_loop_16bpp_leftshift
+	push {r4, r5, r6, r7, lr}
+	lsls r2, #2
+	add r2, r1
+	mov ip, r2
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	b 2f
+.align 2
+1:
+.rept TMDS_ENCODE_UNROLL
+	ldmia r0!, {r4, r6}
+	lsls r4, r3
+	do_channel_16bpp r2, r4, r5
+	lsls r6, r3
+	do_channel_16bpp r2, r6, r7
+	stmia r1!, {r4, r5, r6, r7}
+.endr
+2:
+	cmp r1, ip
+	bne 1b
+	pop {r4, r5, r6, r7, pc}
+
+// r0: Input buffer (word-aligned)
+// r1: Output buffer (word-aligned)
+// r2: Input size (pixels)
+
+decl_func tmds_encode_loop_8bpp
+	push {r4, r5, r6, r7, lr}
+	lsls r2, #2
+	add r2, r1
+	mov ip, r2
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	b 2f
+.align 2
+1:
+.rept TMDS_ENCODE_UNROLL
+	ldmia  r0!, {r4}
+	str r4, [r2, #ACCUM0_OFFS + INTERP1]
+	str r4, [r2, #ACCUM0_OFFS]
+	ldr r4, [r2, #PEEK0_OFFS]
+	ldr r4, [r4]
+	ldr r5, [r2, #PEEK1_OFFS]
+	ldr r5, [r5]
+	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
+	ldr r6, [r6]
+	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
+	ldr r7, [r7]
+	stmia r1!, {r4, r5, r6, r7}
+.endr
+2:
+	cmp r1, ip
+	bne 1b
+	pop {r4, r5, r6, r7, pc}
+
+// r0: Input buffer (word-aligned)
+// r1: Output buffer (word-aligned)
+// r2: Input size (pixels)
+// r3: Left shift amount
+//
+// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
+// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
+// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
+// since its channel MSBs are no greater than 7.
+
+decl_func tmds_encode_loop_8bpp_leftshift
+	push {r4, r5, r6, r7, lr}
+	lsls r2, #3 // NOTE(review): 8 output bytes per unit of r2, vs #2 in tmds_encode_loop_8bpp with the same loop shape -- confirm what callers pass in r2
+	add r2, r1
+	mov ip, r2 // ip = end-of-output pointer
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) // r2 now holds the interpolator base
+	b 2f // enter at the loop test, so a count of 0 writes nothing
+.align 2
+1:
+.rept TMDS_ENCODE_UNROLL
+	ldmia  r0!, {r4} // one input word per repetition
+	str r4, [r2, #ACCUM0_OFFS + INTERP1] // interp1 gets the unshifted word
+	lsls r4, r3
+	str r4, [r2, #ACCUM0_OFFS] // interp0 gets the left-shifted word
+	ldr r4, [r2, #PEEK0_OFFS]
+	ldr r4, [r4]
+	ldr r5, [r2, #PEEK1_OFFS]
+	ldr r5, [r5]
+	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
+	ldr r6, [r6]
+	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
+	ldr r7, [r7]
+	stmia r1!, {r4, r5, r6, r7} // four output words (16 bytes) per repetition
+.endr
+2:
+	cmp r1, ip
+	bne 1b // exact-match exit: the output size must be a multiple of 16 * TMDS_ENCODE_UNROLL bytes
+	pop {r4, r5, r6, r7, pc}
+
+// ----------------------------------------------------------------------------
+// Fast 1bpp black/white encoder (full res)
+
+// Taking the encoder from DVI spec, with initial balance 0:
+// 
+// - Encoding either 0x00 or 0xff will produce a running balance of -8, with
+//   output symbol of 0x100 or 0x200
+// 
+// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
+//  output symbol of 0x1ff or 0x2ff
+// 
+// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
+// colour bit. If we process pixels in even-sized blocks, only the colour
+// lookup is needed.
+
+// Encode 8 pixels @ 1bpp (using two table lookups)
+// r3 contains lookup mask (preshifted)
+// r8 contains pointer to encode table
+// 2.125 cyc/pix
+.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
+	\shift_instr0 r4, r2, #\shamt0
+	ands r4, r3
+	add r4, r8
+	ldmia r4, {r4, r5}
+	\shift_instr1 r6, r2, #\shamt1
+	ands r6, r3
+	add r6, r8
+	ldmia r6, {r6, r7}
+	stmia r1!, {r4, r5, r6, r7}
+.endm
+
+// r0: input buffer (word-aligned)
+// r1: output buffer (word-aligned)
+// r2: output pixel count
+decl_func tmds_encode_1bpp
+	push {r4-r7, lr}
+	mov r7, r8
+	push {r7}
+	lsls r2, #1
+	add r2, r1
+	mov ip, r2
+	adr r4, tmds_1bpp_table
+	mov r8, r4
+	// Mask: 4 bit index, 8 bytes per entry
+	movs r3, #0x78
+	b 2f
+1:
+	ldmia r0!, {r2}
+#if !DVI_1BPP_BIT_REVERSE
+	tmds_encode_1bpp_body lsls 3  lsrs 1
+	tmds_encode_1bpp_body lsrs 5  lsrs 9
+	tmds_encode_1bpp_body lsrs 13 lsrs 17
+	tmds_encode_1bpp_body lsrs 21 lsrs 25
+#else
+	tmds_encode_1bpp_body lsrs 1   lsls 3
+	tmds_encode_1bpp_body lsrs 9   lsrs 5
+	tmds_encode_1bpp_body lsrs 17  lsrs 13
+	tmds_encode_1bpp_body lsrs 25  lsrs 21
+#endif
+2:
+	cmp r1, ip
+	blo 1b
+
+	pop {r7}
+	mov r8, r7
+	pop {r4-r7, pc}
+
+.align 2
+tmds_1bpp_table:
+#if !DVI_1BPP_BIT_REVERSE
+	.word 0x7fd00, 0x7fd00  // 0000
+	.word 0x7fe00, 0x7fd00  // 0001
+	.word 0xbfd00, 0x7fd00  // 0010
+	.word 0xbfe00, 0x7fd00  // 0011
+	.word 0x7fd00, 0x7fe00  // 0100
+	.word 0x7fe00, 0x7fe00  // 0101
+	.word 0xbfd00, 0x7fe00  // 0110
+	.word 0xbfe00, 0x7fe00  // 0111
+	.word 0x7fd00, 0xbfd00  // 1000
+	.word 0x7fe00, 0xbfd00  // 1001
+	.word 0xbfd00, 0xbfd00  // 1010
+	.word 0xbfe00, 0xbfd00  // 1011
+	.word 0x7fd00, 0xbfe00  // 1100
+	.word 0x7fe00, 0xbfe00  // 1101
+	.word 0xbfd00, 0xbfe00  // 1110
+	.word 0xbfe00, 0xbfe00  // 1111
+#else
+	.word 0x7fd00, 0x7fd00  // 0000
+	.word 0x7fd00, 0xbfd00  // 1000
+	.word 0x7fd00, 0x7fe00  // 0100
+	.word 0x7fd00, 0xbfe00  // 1100
+	.word 0xbfd00, 0x7fd00  // 0010
+	.word 0xbfd00, 0xbfd00  // 1010
+	.word 0xbfd00, 0x7fe00  // 0110
+	.word 0xbfd00, 0xbfe00  // 1110
+	.word 0x7fe00, 0x7fd00  // 0001
+	.word 0x7fe00, 0xbfd00  // 1001
+	.word 0x7fe00, 0x7fe00  // 0101
+	.word 0x7fe00, 0xbfe00  // 1101
+	.word 0xbfe00, 0x7fd00  // 0011
+	.word 0xbfe00, 0xbfd00  // 1011
+	.word 0xbfe00, 0x7fe00  // 0111
+	.word 0xbfe00, 0xbfe00  // 1111
+#endif
+
+
+// ----------------------------------------------------------------------------
+// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
+
+// Even-x-position pixels are encoded as symbols with imbalance -4, and odd
+// pixels with +4, so that we can mix-and-match our even/odd codewords and
+// always get a properly balanced sequence:
+//
+// level 0: (05 -> 103), then (04 -> 1fc)  (decimal 5, 4)
+// level 1: (50 -> 130), then (51 -> 1cf)  (decimal 80, 81)
+// level 2: (af -> 230), then (ae -> 2cf)  (decimal 175, 174)
+// level 3: (fa -> 203), then (fb -> 2fc)  (decimal 250, 251)
+//
+// These correspond to roughly 255 times (0, 1/3, 2/3, 1).
+//
+// Alternatively we could use symbols with 0 balance, which results in lower
+// contrast but avoids the LSB bobble:
+//
+// level 0: (10 -> 1f0) always
+// level 1: (5a -> 263) always
+// level 2: (a5 -> 163) always
+// level 3: (ef -> 2f0) always
+
+// Table base pointer in r0. Input pixels in r2.
+.macro encode_2bpp_body shift_instr shamt rd
+	\shift_instr \rd, r2, #\shamt
+	ands \rd, r3
+	ldr \rd, [r0, \rd]
+.endm
+
+// r0: input buffer (word-aligned)
+// r1: output buffer (word-aligned)
+// r2: output pixel count
+decl_func tmds_encode_2bpp
+	push {r4-r7, lr}
+	mov r7, r8
+	push {r7}
+	mov r8, r0
+	adr r0, tmds_2bpp_table
+	// Mask: 4-bit index into 4-byte entries.
+	movs r3, #0x3c
+	// Limit pointer: 1 word per 2 pixels
+	lsls r2, #1
+	add r2, r1
+	mov ip, r2
+	b 2f
+1:
+	mov r4, r8
+	ldmia r4!, {r2}
+	mov r8, r4
+	encode_2bpp_body lsls 2  r4
+	encode_2bpp_body lsrs 2  r5
+	encode_2bpp_body lsrs 6  r6
+	encode_2bpp_body lsrs 10 r7
+	stmia r1!, {r4-r7}
+	encode_2bpp_body lsrs 14 r4
+	encode_2bpp_body lsrs 18 r5
+	encode_2bpp_body lsrs 22 r6
+	encode_2bpp_body lsrs 26 r7
+	stmia r1!, {r4-r7}
+2:
+	cmp r1, ip
+	blo 1b
+	pop {r7}
+	mov r8, r7
+	pop {r4-r7, pc}
+
+.align 2
+tmds_2bpp_table:
+	.word 0x7f103 // 00, 00
+	.word 0x7f130 // 01, 00
+	.word 0x7f230 // 10, 00
+	.word 0x7f203 // 11, 00
+	.word 0x73d03 // 00, 01
+	.word 0x73d30 // 01, 01
+	.word 0x73e30 // 10, 01
+	.word 0x73e03 // 11, 01
+	.word 0xb3d03 // 00, 10
+	.word 0xb3d30 // 01, 10
+	.word 0xb3e30 // 10, 10
+	.word 0xb3e03 // 11, 10
+	.word 0xbf103 // 00, 11
+	.word 0xbf130 // 01, 11
+	.word 0xbf230 // 10, 11
+	.word 0xbf203 // 11, 11
+
+// ----------------------------------------------------------------------------
+// Full-resolution RGB encode (not very practical)
+
+// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
+// taking horizontal blanking (at VGA) and dual core into account, and
+// assuming the 3 channels are encoded individually.)
+//
+// Here is an idea
+// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
+// ACCUM0), concatenated with the sign bit of our running disparity (from
+// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
+// with the symbol's disparity stored left-justified in the upper 12 bits, as
+// e.g. a 6 bit signed integer.
+//
+// - Load pixel data.                        cyc: 0.75 (ldmia 2 words, every 4 pixels)
+// - Write pixel to ACCUM0.                  cyc: 1
+// - Read address from PEEK2.                cyc: 1
+// - Load encoded pixel from address.        cyc: 2
+// - Write disparity data to ACCUM1_ADD      cyc: 1
+// - Write encoded data to output buffer.    cyc: 1.25 (stmia 4 words, every 4 pixels)
+//
+// With decent register allocation we may be able to load 4 pixels at
+// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
+//
+// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
+// overflow and affect the running disparity, but with 16 zeroes in between,
+// this would take much longer than one scanline, so everything is fine if
+// we clear the accumulator at the start of the scanline.
+//
+// Note that we need to use two interpolators to get the bits from both pixels
+// -- we are not outputting a single DC-balanced stream, but rather two
+// interleaved streams which are each DC-balanced. This is fine electrically,
+// but our output here will *NOT* match the TMDS encoder given in the DVI
+// spec.
+
+// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
+// feedback. With the feedback enabled (default), the output is DC balanced,
+// but there are just barely enough CPU cycles to do all the encode, so it's
+// essentially a party trick. If you disable DC balancing, the performance is
+// much better, and many monitors will still accept the signals as long as you
+// DC couple your DVI signals.
+
+.macro tmds_fullres_encode_loop_body ra rb
+	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
+	str \ra, [r2, #ACCUM0_OFFS]
+	ldr \ra, [r2, #PEEK2_OFFS]
+	ldr \ra, [\ra]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str \ra, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
+	ldr \rb, [\rb]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+.endm
+
+// r0: Input buffer (word-aligned)
+// r1: Output buffer (word-aligned)
+// r2: Pixel count
+
+.macro tmds_fullres_encode_loop_16bpp
+	push {r4-r7, lr}
+	mov r4, r8
+	push {r4}
+
+
+	lsls r2, #2
+	add r2, r1
+	mov ip, r2
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	// DC balance defined to be 0 at start of scanline:
+	movs r4, #0
+	str r4, [r2, #ACCUM1_OFFS]
+#if TMDS_FULLRES_NO_DC_BALANCE
+	// Alternate parity between odd/even symbols if no feedback
+	mvns r4, r4
+#endif
+	str r4, [r2, #ACCUM1_OFFS + INTERP1]
+
+	// Keep loop start pointer in r8 so we can get a longer backward branch
+	adr r4, 1f
+	adds r4, #1 // god damn thumb bit why is this a thing
+	mov r8, r4
+	b 2f
+	.align 2
+1:
+.rept 16
+	ldmia r0!, {r4, r6}
+	tmds_fullres_encode_loop_body r4 r5
+	tmds_fullres_encode_loop_body r6 r7
+	stmia r1!, {r4, r5, r6, r7}
+.endr
+2:
+	cmp r1, ip
+	beq 1f
+	bx r8
+1:
+	pop {r4}
+	mov r8, r4
+	pop {r4-r7, pc}
+.endm
+
+// One copy each in X and Y, so the two cores don't step on each other
+decl_func_x tmds_fullres_encode_loop_16bpp_x
+	tmds_fullres_encode_loop_16bpp
+decl_func_y tmds_fullres_encode_loop_16bpp_y
+	tmds_fullres_encode_loop_16bpp
+
+
+.macro tmds_fullres_encode_loop_body_leftshift ra rb
+	// Note we apply the leftshift for INTERP0 only
+	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
+	lsls \ra, r3
+	str \ra, [r2, #ACCUM0_OFFS]
+	ldr \ra, [r2, #PEEK2_OFFS]
+	ldr \ra, [\ra]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str \ra, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
+	ldr \rb, [\rb]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+.endm
+
+// r0: Input buffer (word-aligned)
+// r1: Output buffer (word-aligned)
+// r2: Pixel count
+// r3: Left shift amount
+
+.macro tmds_fullres_encode_loop_16bpp_leftshift
+	push {r4-r7, lr}
+	mov r4, r8
+	mov r5, r9
+	push {r4-r5}
+
+	lsls r2, #2
+	add r2, r1
+	mov ip, r2
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	// DC balance defined to be 0 at start of scanline:
+	movs r4, #0
+	str r4, [r2, #ACCUM1_OFFS]
+#if TMDS_FULLRES_NO_DC_BALANCE
+	// Alternate parity between odd/even symbols if there's no balance feedback
+	mvns r4, r4
+#endif
+	str r4, [r2, #ACCUM1_OFFS + INTERP1]
+
+	adr r4, 1f
+	adds r4, #1
+	mov r8, r4
+	b 2f
+	.align 2
+1:
+.rept 16 // 64 pixels per iteration
+	ldmia r0!, {r4, r6}
+	tmds_fullres_encode_loop_body_leftshift r4 r5
+	tmds_fullres_encode_loop_body_leftshift r6 r7
+	stmia r1!, {r4, r5, r6, r7}
+.endr
+2:
+	cmp r1, ip
+	beq 1f
+	bx r8
+1:
+	pop {r4-r5}
+	mov r8, r4
+	mov r9, r5
+	pop {r4-r7, pc}
+.endm
+
+decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
+	tmds_fullres_encode_loop_16bpp_leftshift
+decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
+	tmds_fullres_encode_loop_16bpp_leftshift
+
+
+// ----------------------------------------------------------------------------
+// Full-resolution 8bpp paletted encode
+
+// Variant of tmds_fullres_encode_loop_16bpp that reads
+// 8-bit wide pixels packed 4 per word.  The interpolator
+// base is set to a reordered list of TMDS symbols based
+// on a user colour palette.
+
+// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
+// interp base pointer. r7 used as temporary.
+.macro tmds_palette_encode_loop_body rd
+	str \rd, [r2, #ACCUM0_OFFS]
+	str \rd, [r2, #ACCUM0_OFFS + INTERP1]
+	ldr \rd, [r2, #PEEK2_OFFS]
+	ldr \rd, [\rd]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str \rd, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr r7, [r7]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+	lsls r7, #10
+	orrs \rd, r7
+.endm
+
+.macro tmds_palette_encode_loop
+	push {r4-r7, lr}
+	mov r4, r8
+	push {r4}
+
+
+	lsls r2, #1
+	add r2, r1
+	mov ip, r2
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
+	// DC balance defined to be 0 at start of scanline:
+	movs r4, #0
+	str r4, [r2, #ACCUM1_OFFS]
+#if TMDS_FULLRES_NO_DC_BALANCE
+	// Alternate parity between odd/even symbols if there's no balance feedback
+	mvns r4, r4
+#endif
+	str r4, [r2, #ACCUM1_OFFS + INTERP1]
+
+	// Keep loop start pointer in r8 so we can get a longer backward branch
+	adr r4, 1f
+	adds r4, #1 // god damn thumb bit why is this a thing
+	mov r8, r4
+	b 2f
+	.align 2
+1:
+.rept 10
+	ldmia r0!, {r3, r5}
+	lsrs r4, r3, #14
+	lsls r3, #2
+	lsrs r6, r5, #14
+	lsls r5, #2
+	tmds_palette_encode_loop_body r3
+	tmds_palette_encode_loop_body r4
+	tmds_palette_encode_loop_body r5
+	tmds_palette_encode_loop_body r6
+	stmia r1!, {r3, r4, r5, r6}
+.endr
+2:
+	cmp r1, ip
+	beq 1f
+	bx r8
+1:
+	pop {r4}
+	mov r8, r4
+	pop {r4-r7, pc}
+.endm
+
+decl_func_x tmds_palette_encode_loop_x
+	tmds_palette_encode_loop
+decl_func_y tmds_palette_encode_loop_y
+	tmds_palette_encode_loop
diff --git a/src/libdvi/tmds_encode.c b/src/libdvi/tmds_encode.c
new file mode 100644
index 0000000..472b1a9
--- /dev/null
+++ b/src/libdvi/tmds_encode.c
@@ -0,0 +1,305 @@
+#include "hardware/interp.h"
+#include "tmds_encode.h"
+#include "hardware/gpio.h"
+#include "hardware/sync.h"
+
+static const uint32_t __scratch_x("tmds_table") tmds_table[] = { // LUT used by the 16bpp/8bpp (non-fullres) encoders below
+#include "tmds_table.h"
+};
+
+// Fullres table is bandwidth-critical, so gets one copy for each scratch
+// memory (X and Y). NOTE(review): this comment used to mention a third copy
+// in flash for generating palette LUTs; no such copy is defined in this
+// file. The ones we don't use will get garbage collected during linking.
+const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
+#include "tmds_table_fullres.h"
+};
+
+const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
+#include "tmds_table_fullres.h"
+};
+
+// Configure an interpolator to pull one colour channel out of each pixel of a
+// packed pair and turn it into a LUT address. The first pixel has its lsb at
+// pixel_lsb and pixels are pixel_width wide; the channel spans bits
+// channel_msb..channel_lsb of a pixel. LANE0 produces the LUT address for the
+// first pixel and LANE1 the address for the second pixel.
+// The interpolator can only shift right. If the channel has to move left to
+// reach the LUT index field, the required left shift is returned and the
+// *_leftshift variant of the encoder loop must be used; otherwise returns 0.
+
+static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint pixel_lsb, uint pixel_width, uint lut_index_width, const uint32_t *lutbase) {
+	const uint index_shift = 2; // LUT entries are 4 bytes wide
+	const uint index_msb = index_shift + lut_index_width - 1;
+	const uint mask_lsb = index_msb - (channel_msb - channel_lsb);
+
+	int right_shift = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
+	int left_shift_needed = 0;
+	if (right_shift < 0) {
+		// Hardware cannot do this; the encoder loop shifts in software instead.
+		left_shift_needed = -right_shift;
+		right_shift = 0;
+	}
+
+	// LANE0: first pixel of the pair.
+	interp_config lane0 = interp_default_config();
+	interp_config_set_shift(&lane0, right_shift);
+	interp_config_set_mask(&lane0, mask_lsb, index_msb);
+	interp_set_config(interp, 0, &lane0);
+
+	// LANE1: second pixel, pixel_width bits further up, with cross-input enabled.
+	interp_config lane1 = interp_default_config();
+	interp_config_set_shift(&lane1, pixel_width + right_shift);
+	interp_config_set_mask(&lane1, mask_lsb, index_msb);
+	interp_config_set_cross_input(&lane1, true);
+	interp_set_config(interp, 1, &lane1);
+
+	interp->base[0] = (uint32_t)lutbase;
+	interp->base[1] = (uint32_t)lutbase;
+
+	return left_shift_needed;
+}
+
+// Encode one colour channel (up to 6 bits, channel_msb..channel_lsb) from a
+// word-aligned buffer of 16 bit pixels into a buffer of TMDS symbols. n_pix
+// must be even. interp0 is saved on entry and restored before returning.
+void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
+	interp_hw_save_t saved_interp0;
+	interp_save(interp0_hw, &saved_interp0);
+	const int lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
+	if (lshift == 0) {
+		tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
+	} else {
+		tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, lshift);
+	}
+	interp_restore(interp0_hw, &saved_interp0);
+}
+
+// As tmds_encode_data_channel_16bpp, but for 8 bit pixels: n_pix must be a multiple of 4, pixbuf word-aligned.
+void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
+	interp_hw_save_t saved_interp0, saved_interp1;
+	interp_save(interp0_hw, &saved_interp0);
+	interp_save(interp1_hw, &saved_interp1);
+	// At 8bpp pixel 0 always needs some left shift (any channel), which destroys
+	// MSBs of pixel 3, so the pixel data sent to interp1 is *not left-shifted*.
+	const int lshift_lower = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
+	const int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
+	assert(!lshift_upper); (void)lshift_upper;
+	if (lshift_lower) {
+		tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, lshift_lower);
+	} else {
+		tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
+	}
+	interp_restore(interp0_hw, &saved_interp0);
+	interp_restore(interp1_hw, &saved_interp1);
+}
+
+// ----------------------------------------------------------------------------
+// Code for full-resolution TMDS encode (barely possible, utterly impractical):
+
+// Different scheme used for full res as the fun pixel-doubling DC balance
+// trick doesn't work, so we need to actually do running disparity. ACCUM0 has
+// pixel data, ACCUM1 has running disparity. INTERP0 is used to process even
+// pixels, and INTERP1 for odd pixels. Note this means that even and odd
+// symbols have their DC balance handled separately, which is not to spec.
+
+static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
+	// Returns the left shift the caller has to apply in software (0 if none).
+	const uint index_shift = 2; // LUT entries are 4 bytes wide
+	const uint index_msb = index_shift + lut_index_width - 1;
+
+	int right_shift = channel_msb - (lut_index_width - 1) - index_shift;
+	int left_shift_needed = 0;
+	if (right_shift < 0) {
+		// Channel sits below the index field; the interpolator only shifts right.
+		left_shift_needed = -right_shift;
+		right_shift = 0;
+	}
+
+	// LANE0: shift and mask the colour channel into the low bits of the LUT
+	// index (note lut_index_width excludes the disparity sign).
+	interp_config lane0 = interp_default_config();
+	interp_config_set_shift(&lane0, right_shift);
+	interp_config_set_mask(&lane0, index_msb - (channel_msb - channel_lsb), index_msb);
+	interp_set_config(interp, 0, &lane0);
+
+	// LANE1: bring the sign bit (bit 31) of the running disparity in ACCUM1
+	// down to the bit just above the index, concatenating it onto the index.
+	interp_config lane1 = interp_default_config();
+	interp_config_set_shift(&lane1, 30 - index_msb);
+	interp_config_set_mask(&lane1, index_msb + 1, index_msb + 1);
+	interp_set_config(interp, 1, &lane1);
+
+	interp->base[2] = (uint32_t)lutbase;
+	return left_shift_needed;
+}
+
+void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
+	const uint core = get_core_num();
+#if !TMDS_FULLRES_NO_INTERP_SAVE
+	interp_hw_save_t saved_interp0, saved_interp1;
+	interp_save(interp0_hw, &saved_interp0);
+	interp_save(interp1_hw, &saved_interp1);
+#endif
+
+	// Scratch X and scratch Y each hold a copy of the inner loop and of the
+	// LUT. Core 1 uses the X copies and core 0 the Y copies so the cores
+	// don't tread on each other's toes too much.
+	const uint32_t *lutbase = core ? tmds_table_fullres_x : tmds_table_fullres_y;
+	// interp0 takes the lower 16-bit pixel of each word, interp1 the upper one (hence + 16).
+	const int lshift_lower = configure_interp_for_addrgen_fullres(interp0_hw, channel_msb, channel_lsb, 6, lutbase);
+	const int lshift_upper = configure_interp_for_addrgen_fullres(interp1_hw, channel_msb + 16, channel_lsb + 16, 6, lutbase);
+	assert(!lshift_upper); (void)lshift_upper;
+
+	// Pick the loop variant: leftshift or plain, X copy or Y copy.
+	if (lshift_lower && core) {
+		tmds_fullres_encode_loop_16bpp_leftshift_x(pixbuf, symbuf, n_pix, lshift_lower);
+	} else if (lshift_lower) {
+		tmds_fullres_encode_loop_16bpp_leftshift_y(pixbuf, symbuf, n_pix, lshift_lower);
+	} else if (core) {
+		tmds_fullres_encode_loop_16bpp_x(pixbuf, symbuf, n_pix);
+	} else {
+		tmds_fullres_encode_loop_16bpp_y(pixbuf, symbuf, n_pix);
+	}
+#if !TMDS_FULLRES_NO_INTERP_SAVE
+	interp_restore(interp0_hw, &saved_interp0);
+	interp_restore(interp1_hw, &saved_interp1);
+#endif
+}
+
+// Number of one bits minus number of zero bits in each 4-bit value.
+static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };
+
+// Ones minus zeroes over the 8 bits of x. x must be <= 0xFF.
+static inline int byte_imbalance(uint32_t x)
+{
+	return imbalance_lookup[x >> 4] + imbalance_lookup[x & 0xF];
+}
+
+// Encode pixel into its two candidate table entries (10-bit symbol in bits 9:0,
+// 6-bit signed disparity in bits 31:26): one with disparity <= 0, one with >= 0.
+static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym)
+{
+	int pixel_imbalance = byte_imbalance(pixel);
+	uint32_t sym = pixel & 1;
+	if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {
+		for (int i = 0; i < 7; ++i) {
+			sym |= (~((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
+		}
+	} else {
+		for (int i = 0; i < 7; ++i) {
+			sym |= ( ((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
+		}
+		sym |= 0x100; // bit 8 marks the XOR (rather than XNOR) encoding
+	}
+
+	int imbalance = byte_imbalance(sym & 0xFF);
+	if (imbalance == 0) {
+		if ((sym & 0x100) == 0) sym ^= 0x2ff; // 0x2ff: invert data bits 7:0 and set bit 9
+		*positive_balance_sym = sym;
+		*negative_balance_sym = sym;
+	} else if (imbalance > 0) { // disparity is cast to uint32_t before shifting: (int) << 26 overflows for values >= 32
+		*negative_balance_sym = (sym ^ 0x2ff) | (((uint32_t)(-imbalance + imbalance_lookup[2 ^ (sym >> 8)] + 2) & 0x3F) << 26);
+		*positive_balance_sym = sym | ((uint32_t)(imbalance + imbalance_lookup[sym >> 8] + 2) << 26);
+	} else {
+		*negative_balance_sym = sym | (((uint32_t)(imbalance + imbalance_lookup[sym >> 8] + 2) & 0x3F) << 26);
+		*positive_balance_sym = (sym ^ 0x2ff) | ((uint32_t)(-imbalance + imbalance_lookup[2 ^ (sym >> 8)] + 2) << 26);
+	}
+}
+
+// Build the TMDS symbol palette used by the fullres palette encoder from a 16-bit
+// (RGB 565) colour palette. tmds_palette must hold 6 * n_palette words: blue, green
+// then red, each as n_palette negative-disparity entries followed by n_palette
+// positive-disparity entries. n_palette must be a power of 2 <= 256.
+void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette) {
+	uint32_t* tmds_palette_blue = tmds_palette;
+	uint32_t* tmds_palette_green = tmds_palette + 2 * n_palette;
+	uint32_t* tmds_palette_red = tmds_palette + 4 * n_palette;
+	for (size_t i = 0; i < n_palette; ++i) { // size_t index: same type as n_palette
+		uint8_t blue = (palette[i] << 3) & 0xf8; // 5 bits, left-justified in 8
+		uint8_t green = (palette[i] >> 3) & 0xfc; // 6 bits, left-justified in 8
+		uint8_t red = (palette[i] >> 8) & 0xf8; // 5 bits, left-justified in 8
+		tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
+		tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
+		tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
+	}
+}
+
+// Build the TMDS symbol palette used by the fullres palette encoder from a 24-bit
+// (RGB 888) colour palette: blue in bits 7:0, green in bits 15:8, red in bits 23:16.
+// tmds_palette must hold 6 * n_palette words, laid out per channel as blue, green,
+// red (2 * n_palette words each). n_palette must be a power of 2 <= 256.
+void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *tmds_palette, size_t n_palette) {
+	uint32_t* tmds_palette_blue = tmds_palette;
+	uint32_t* tmds_palette_green = tmds_palette + 2 * n_palette;
+	uint32_t* tmds_palette_red = tmds_palette + 4 * n_palette;
+	for (size_t i = 0; i < n_palette; ++i) { // size_t index: same type as n_palette
+		uint8_t blue = palette[i] & 0xff;
+		uint8_t green = (palette[i] >> 8) & 0xff;
+		uint8_t red = (palette[i] >> 16) & 0xff;
+		tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
+		tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
+		tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
+	}
+}
+
+// Encode palette data for all 3 channels.
+// pixbuf: n_pix 8-bit wide pixels containing palette values (32-bit word aligned).
+//   n_pix must be a multiple of 80: the assembly loop only tests for the end of
+//   the buffer once every 80 pixels.
+// tmds_palette: palette of TMDS symbols produced by tmds_setup_palette_symbols.
+// palette_bits: log2 of the n_palette that tmds_palette was built with.
+// symbuf: each channel writes n_pix / 2 words (two symbols per word) -- blue at
+//   symbuf, green at symbuf + n_pix / 2, red at symbuf + n_pix.
+void __not_in_flash_func(tmds_encode_palette_data)(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits) {
+	// Core 1 runs the _x copy of the loop and core 0 the _y copy.
+	void (*const encode_loop)(const uint32_t *, uint32_t *, size_t) =
+		get_core_num() ? tmds_palette_encode_loop_x : tmds_palette_encode_loop_y;
+
+#if !TMDS_FULLRES_NO_INTERP_SAVE
+	interp_hw_save_t interp0_save, interp1_save;
+	interp_save(interp0_hw, &interp0_save);
+	interp_save(interp1_hw, &interp1_save);
+#endif
+
+	interp0_hw->base[2] = (uint32_t)tmds_palette;
+	interp1_hw->base[2] = (uint32_t)tmds_palette;
+
+	// Lane 0 on both interpolators masks the palette bits, starting at bit 2,
+	// The second interpolator also shifts to read the 2nd or 4th byte of the word.
+	interp0_hw->ctrl[0] =
+		(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
+		((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
+	interp1_hw->ctrl[0] =
+		(8 << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
+		(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
+		((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
+
+	// Lane 1 shifts and masks the sign bit into the right position to add to the symbol
+	// table index to choose the negative disparity symbols if the sign is negative.
+	const uint32_t ctrl_lane_1 =
+		((31 - (palette_bits + 2)) << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
+		(palette_bits + 2) * ((1 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | (1 << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB));
+	interp0_hw->ctrl[1] = ctrl_lane_1;
+	interp1_hw->ctrl[1] = ctrl_lane_1;
+
+	// Each channel's table is 2 << palette_bits words long: blue, then green, then red.
+	const uint32_t *const green_palette = tmds_palette + (2 << palette_bits);
+	const uint32_t *const red_palette = tmds_palette + (4 << palette_bits);
+
+	// Blue first (base[2] already points at the start of tmds_palette), then green, then red.
+	encode_loop(pixbuf, symbuf, n_pix);
+
+	interp0_hw->base[2] = (uint32_t)green_palette;
+	interp1_hw->base[2] = (uint32_t)green_palette;
+	encode_loop(pixbuf, symbuf + (n_pix >> 1), n_pix);
+
+	interp0_hw->base[2] = (uint32_t)red_palette;
+	interp1_hw->base[2] = (uint32_t)red_palette;
+	encode_loop(pixbuf, symbuf + n_pix, n_pix);
+
+#if !TMDS_FULLRES_NO_INTERP_SAVE
+	interp_restore(interp0_hw, &interp0_save);
+	interp_restore(interp1_hw, &interp1_save);
+#endif
+}
diff --git a/src/libdvi/tmds_encode.h b/src/libdvi/tmds_encode.h
new file mode 100644
index 0000000..633d630
--- /dev/null
+++ b/src/libdvi/tmds_encode.h
@@ -0,0 +1,46 @@
+#ifndef _TMDS_ENCODE_H_
+#define _TMDS_ENCODE_H_
+
+#include "hardware/interp.h"
+#include "dvi_config_defs.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+// Functions from tmds_encode.c
+void tmds_encode_data_channel_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
+void tmds_encode_data_channel_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
+void tmds_encode_data_channel_fullres_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
+void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *symbuf, size_t n_palette);
+void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *symbuf, size_t n_palette);
+void tmds_encode_palette_data(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits);
+
+// Functions from tmds_encode.S
+
+void tmds_encode_1bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_2bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+
+// Uses interp0:
+void tmds_encode_loop_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_loop_16bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
+
+// Uses interp0 and interp1:
+void tmds_encode_loop_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_encode_loop_8bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
+
+// Uses interp0 and interp1:
+// (Note a copy is provided in scratch memories X and Y)
+void tmds_fullres_encode_loop_16bpp_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_fullres_encode_loop_16bpp_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_fullres_encode_loop_16bpp_leftshift_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
+void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
+void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/libdvi/tmds_encode_1bpp.pio b/src/libdvi/tmds_encode_1bpp.pio
new file mode 100644
index 0000000..4ca31dc
--- /dev/null
+++ b/src/libdvi/tmds_encode_1bpp.pio
@@ -0,0 +1,46 @@
+.program tmds_encode_1bpp
+
+; 1bpp black/white pixels go in, TMDS symbols come out.
+; Each output word contains two output symbols, each 10 bits in size,
+; right-justified. The least-significant symbol is displayed first.
+;
+; We can encode using the following LUT: (yes this is compliant)
+;
+; x % 2 | colour | symbol
+; ------+--------+-------
+; 0     | 0      | 0x100
+; 0     | 1      | 0x200
+; 1     | 0      | 0x1ff
+; 1     | 1      | 0x2ff
+;
+; OSR: shift to right, autopull, threshold 32
+; ISR: shift to right, autopush, threshold 24
+;
+; Note the ISR needs to be shifted to *right* so that we can get the first
+; pixel in the less-significant position. Threshold 24 so we can get 8x 0-bits
+; at the LSBs for free :)
+
+even_pixel:
+    out x, 1     ; x = colour bit of the even pixel
+    mov y, ~x    ; y = inverted colour
+    in y, 1      ; ends up as bit 8 of the output word (~colour)
+    in x, 1      ; ends up as bit 9 of the output word (colour)
+
+odd_pixel:
+    mov x, ~null ; x = all ones
+    in x, 8      ; odd symbol data bits: eight 1s, bits 17:10 of the word
+    out x, 1     ; x = colour bit of the odd pixel
+    mov y, ~x    ; y = inverted colour
+    in y, 1      ; ends up as bit 18 of the output word (~colour)
+    in x, 13     ; Bring total shift to 24, triggering push.
+
+% c-sdk {
+static inline void tmds_encode_1bpp_init(PIO pio, uint sm) {
+    const uint prog_offset = pio_add_program(pio, &tmds_encode_1bpp_program);
+    pio_sm_config cfg = tmds_encode_1bpp_program_get_default_config(prog_offset);
+    sm_config_set_in_shift(&cfg, true, true, 24);   // ISR: shift right, autopush, threshold 24
+    sm_config_set_out_shift(&cfg, true, true, 32);  // OSR: shift right, autopull, threshold 32
+    pio_sm_init(pio, sm, prog_offset, &cfg);
+    pio_sm_set_enabled(pio, sm, true);
+}
+%}
diff --git a/src/libdvi/tmds_table.h b/src/libdvi/tmds_table.h
new file mode 100644
index 0000000..48ddf20
--- /dev/null
+++ b/src/libdvi/tmds_table.h
@@ -0,0 +1,76 @@
+// Generated from tmds_table_gen.py
+//
+// This table converts a 6 bit data input into a pair of TMDS data symbols
+// with data content *almost* equal (1 LSB off) to input value left shifted by
+// two. The pairs of symbols have a net DC balance of 0.
+//
+// The two symbols are concatenated in the 20 LSBs of a data word, with the
+// first symbol in least-significant position.
+//
+// Note the declaration isn't included here, just the table body. This is in
+// case you want multiple copies of the table in different SRAMs (particularly
+// scratch X/Y).
+0x7fd00u,
+0x40dfcu,
+0x41df8u,
+0x7ed04u,
+0x43df0u,
+0x7cd0cu,
+0x7dd08u,
+0x42df4u,
+0x47de0u,
+0x78d1cu,
+0x79d18u,
+0x46de4u,
+0x7bd10u,
+0x44decu,
+0x45de8u,
+0xafa41u,
+0x4fdc0u,
+0x70d3cu,
+0x71d38u,
+0x4edc4u,
+0x73d30u,
+0x4cdccu,
+0x4ddc8u,
+0xa7a61u,
+0x77d20u,
+0x48ddcu,
+0x49dd8u,
+0xa3a71u,
+0x4bdd0u,
+0xa1a79u,
+0xa0a7du,
+0x9fa81u,
+0x5fd80u,
+0x60d7cu,
+0x61d78u,
+0x5ed84u,
+0x63d70u,
+0x5cd8cu,
+0x5dd88u,
+0xb7a21u,
+0x67d60u,
+0x58d9cu,
+0x59d98u,
+0xb3a31u,
+0x5bd90u,
+0xb1a39u,
+0xb0a3du,
+0x8fac1u,
+0x6fd40u,
+0x50dbcu,
+0x51db8u,
+0xbba11u,
+0x53db0u,
+0xb9a19u,
+0xb8a1du,
+0x87ae1u,
+0x57da0u,
+0xbda09u,
+0xbca0du,
+0x83af1u,
+0xbea05u,
+0x81af9u,
+0x80afdu,
+0xbfa01u,
diff --git a/src/libdvi/tmds_table_fullres.h b/src/libdvi/tmds_table_fullres.h
new file mode 100644
index 0000000..872d7ff
--- /dev/null
+++ b/src/libdvi/tmds_table_fullres.h
@@ -0,0 +1,139 @@
+// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
+// (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6
+// MSBs). There is a 16 bit gap in between them, which is actually vital for
+// the way the TMDS encode works!
+//
+// There are 128 1-word entries. The lookup index should be the concatenation
+// of the sign bit of current running disparity, with 6 bits of colour channel
+// data.
+
+// Non-negative running disparity:
+0xe0000100,
+0xf8000303,
+0x00000307,
+0xe8000104,
+0x000001f0,
+0xf000010c,
+0xe8000108,
+0x0000030b,
+0xf80001e0,
+0xf800011c,
+0xf0000118,
+0x000001e4,
+0xe8000110,
+0x00000313,
+0x000001e8,
+0xf0000241,
+0xf00001c0,
+0x0000013c,
+0xf8000138,
+0xf80001c4,
+0xf0000130,
+0x000001cc,
+0xf80001c8,
+0xf8000261,
+0xe8000120,
+0x00000323,
+0x000001d8,
+0x00000271,
+0xf80001d0,
+0xf0000086,
+0xe8000082,
+0xf0000281,
+0xe8000180,
+0x00000383,
+0x00000178,
+0xf0000184,
+0xf8000170,
+0xf800018c,
+0xf0000188,
+0xf0000221,
+0xf0000160,
+0x0000019c,
+0xf8000198,
+0xf8000231,
+0xf0000190,
+0x00000239,
+0xf00000c2,
+0xf80002c1,
+0xe8000140,
+0x00000343,
+0x000001b8,
+0xf0000211,
+0xf80001b0,
+0xf8000219,
+0x0000021d,
+0x000002e1,
+0xf00001a0,
+0xf0000209,
+0xf800020d,
+0xf000000e,
+0xf0000205,
+0xe8000006,
+0xe0000002,
+0xe8000201,
+// Negative running disparity:
+0x280003ff,
+0x100001fc,
+0x080001f8,
+0x200003fb,
+0x000001f0,
+0x180003f3,
+0x200003f7,
+0x080001f4,
+0x1000031f,
+0x100003e3,
+0x180003e7,
+0x000001e4,
+0x200003ef,
+0x080001ec,
+0x000001e8,
+0x080000be,
+0x1800033f,
+0x0000013c,
+0x100003c7,
+0x1000033b,
+0x180003cf,
+0x000001cc,
+0x10000337,
+0x0000009e,
+0x200003df,
+0x080001dc,
+0x000001d8,
+0x00000271,
+0x1000032f,
+0x08000279,
+0x1000027d,
+0x0800007e,
+0x2000037f,
+0x0800017c,
+0x00000178,
+0x1800037b,
+0x1000038f,
+0x10000373,
+0x18000377,
+0x080000de,
+0x1800039f,
+0x0000019c,
+0x10000367,
+0x000000ce,
+0x1800036f,
+0x00000239,
+0x0800023d,
+0x0000003e,
+0x200003bf,
+0x080001bc,
+0x000001b8,
+0x080000ee,
+0x1000034f,
+0x000000e6,
+0x0000021d,
+0x000002e1,
+0x1800035f,
+0x080000f6,
+0x000000f2,
+0x080002f1,
+0x080000fa,
+0x100002f9,
+0x180002fd,
+0x100000fe,
diff --git a/src/libdvi/tmds_table_gen.py b/src/libdvi/tmds_table_gen.py
new file mode 100755
index 0000000..0ad554e
--- /dev/null
+++ b/src/libdvi/tmds_table_gen.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+
+# The key fact is that, if x is even, and the encoder currently has a running
+# imbalance of 0, encoding x followed by x + 1 produces a symbol pair with a
+# net balance of 0.
+#
+# This is a reasonable constraint, because we only want RGB565 (so 6 valid
+# channel data bits -> data is multiple of 4), and can probably tolerate
+# 0.25LSB of noise :)
+#
+# This means that encoding a half-horizontal-resolution scanline buffer is a
+# simple LUT operation for each colour channel, because we have made the
+# encoding process stateless by guaranteeing 0 balance.
+
+def popcount(x):  # number of set bits in x (x must be >= 0)
+	count = 0
+	while x != 0:
+		x &= x - 1  # clears the lowest set bit
+		count += 1
+	return count
+
+# N1(q) - N0(q) from the DVI spec: ones minus zeroes across 8 bits
+def byteimbalance(x):
+	return popcount(x) * 2 - 8
+
+# This is a direct translation of "Figure 3-5. T.M.D.S. Encode Algorithm" on
+# page 29 of DVI 1.0 spec
+
+class TMDSEncode:
+	ctrl_syms = {  # 10-bit symbols returned when de is false, keyed by c
+		0b00: 0b1101010100,
+		0b01: 0b0010101011,
+		0b10: 0b0101010100,
+		0b11: 0b1010101011
+	}
+	def __init__(self):
+		self.imbalance = 0  # running disparity carried between encode() calls
+
+	def encode(self, d, c, de):  # d: 8-bit data, c: control symbol key, de: data enable; returns a 10-bit symbol
+		if not de:
+			self.imbalance = 0  # a control symbol resets the running disparity
+			return self.ctrl_syms[c]
+		# Minimise transitions
+		q_m = d & 0x1
+		if popcount(d) > 4 or (popcount(d) == 4 and not d & 0x1):
+			for i in range(7):  # XNOR chain; parses as (q_m >> i) ^ (d >> (i + 1)), shifted left by (i + 1)
+				q_m = q_m | (~(q_m >> i ^ d >> i + 1) & 0x1) << i + 1
+		else:
+			for i in range(7):  # XOR chain, same precedence as above
+				q_m = q_m | ( (q_m >> i ^ d >> i + 1) & 0x1) << i + 1
+			q_m = q_m | 0x100  # bit 8 records that XOR was used
+		# Correct DC balance
+		inversion_mask = 0x2ff  # XOR with this flips data bits 7:0 and bit 9
+		q_out = 0
+		if self.imbalance == 0 or byteimbalance(q_m & 0xff) == 0:
+			q_out = q_m ^ (0 if q_m & 0x100 else inversion_mask)  # invert only when bit 8 is clear
+			if q_m & 0x100:
+				self.imbalance += byteimbalance(q_m & 0xff)
+			else:
+				self.imbalance -= byteimbalance(q_m & 0xff)
+		elif (self.imbalance > 0) == (byteimbalance(q_m & 0xff) > 0):
+			q_out = q_m ^ inversion_mask  # data would push the imbalance further the same way: invert
+			self.imbalance += ((q_m & 0x100) >> 7) - byteimbalance(q_m & 0xff)  # first term is 2 if bit 8 set, else 0
+		else:
+			q_out = q_m
+			self.imbalance += byteimbalance(q_m & 0xff) - ((~q_m & 0x100) >> 7)  # last term is 2 if bit 8 clear, else 0
+		return q_out
+
+# Turn a bitmap of width n into n pairs of pseudo-differential bits.
+# Bits are taken MSB first: a set bit becomes 0b01, a clear bit 0b10.
+def differentialise(x, n):
+	pairs = 0
+	for bit in reversed(range(n)):
+		if (x >> bit) & 1:
+			pair = 0b01
+		else:
+			pair = 0b10
+		pairs = (pairs << 2) | pair
+	return pairs
+
+enc = TMDSEncode()
+
+
+###
+# Pixel-doubled table:
+
+# for i in range(0, 256, 4):
+# 	sym0 = enc.encode(i, 0, 1)
+# 	sym1 = enc.encode(i ^ 1, 0, 1)
+# 	assert(enc.imbalance == 0)
+# 	print(f"0x{sym0 | (sym1 << 10):05x}u,")
+
+###
+# Fullres 1bpp table: (each entry is 2 words, 4 pixels)
+
+# (note trick here is that encoding 0x00 or 0xff sets imbalance to -8, and
+# (encoding 0x01 or 0xfe returns imbalance to 0, so we alternate between these
+# (two pairs of dark/light colours. Creates some fairly subtle vertical
+# (banding, but it's cheap.
+
+# for i in range(1 << 4):
+# 	syms = list(enc.encode((0xff if i & 1 << j else 0) ^ j & 0x01, 0, 1) for j in range(4))
+# 	print(f"0x{syms[0] | syms[1] << 10:05x}, 0x{syms[2] | syms[3] << 10:05x}")
+# 	assert(enc.imbalance == 0)
+
+###
+# Fullres table stuff:
+
+# def disptable_format(sym):
+# 	return sym | ((popcount(sym) * 2 - 10 & 0x3f) << 26)
+
+# print("// Non-negative running disparity:")
+# for i in range(0, 256, 4):
+# 	enc.imbalance = 1
+# 	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
+
+# print("// Negative running disparity:")
+# for i in range(0, 256, 4):
+# 	enc.imbalance = -1
+# 	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
+
+###
+# Control symbols:
+
+# for i in range(4):
+# 	sym = enc.encode(0, i, 0)
+# 	print(f"0x{sym << 10 | sym:05x},")
+
+
+###
+# Find zero-balance symbols:
+
+# for i in range(256):
+# 	enc.imbalance = 0
+# 	sym = enc.encode(i, 0, 1)
+# 	if enc.imbalance == 0:
+# 		print(f"{i:02x}: {sym:03x}")
+
+###
+# Generate 2bpp table based on above experiment:
+
+levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]  # data value per 2bpp level, encoded first in each pair
+levels_2bpp_odd  = [0x04, 0x51, 0xae, 0xfb]  # data value per 2bpp level, encoded second in each pair
+
+for i1, p1 in enumerate(levels_2bpp_odd):
+	for i0, p0 in enumerate(levels_2bpp_even):
+		sym0 = enc.encode(p0, 0, 1)
+		sym1 = enc.encode(p1, 0, 1)
+		assert(enc.imbalance == 0)  # every pair must leave the encoder balanced again
+		print(f".word 0x{sym1 << 10 | sym0:05x} // {i0:02b}, {i1:02b}")  # sym0 in bits 9:0, sym1 in bits 19:10
diff --git a/src/libdvi/util_queue_u32_inline.h b/src/libdvi/util_queue_u32_inline.h
new file mode 100644
index 0000000..32a1413
--- /dev/null
+++ b/src/libdvi/util_queue_u32_inline.h
@@ -0,0 +1,83 @@
+#ifndef _UTIL_QUEUE_U32_INLINE_H
+#define _UTIL_QUEUE_U32_INLINE_H
+
+// Faster versions of the functions found in pico/util/queue.h, for the common
+// case of 32-bit-sized elements. Can be used on the same queue data
+// structure, and mixed freely with the generic access methods, as long as
+// element_size == 4.
+
+#include "pico/util/queue.h"
+#include "hardware/sync.h"
+
+static inline uint16_t _queue_inc_index_u32(queue_t *q, uint16_t index) {
+    // The ring has element_count + 1 slots, so index == element_count is a
+    // valid position and the wrap to 0 happens one step after it.
+    const uint16_t next = (uint16_t)(index + 1);
+    return (next > q->element_count) ? 0 : next;
+}
+
+static inline bool queue_try_add_u32(queue_t *q, void *data) {
+    // Non-blocking add of one 32-bit element; returns false if the queue is full.
+    uint32_t lock_state = spin_lock_blocking(q->core.spin_lock);
+    const bool has_room = queue_get_level_unsafe(q) != q->element_count;
+    if (has_room) {
+        ((uint32_t *)q->data)[q->wptr] = *(uint32_t *)data;
+        q->wptr = _queue_inc_index_u32(q, q->wptr);
+    }
+    spin_unlock(q->core.spin_lock, lock_state);
+    if (has_room) __sev(); // signal an event for anything waiting in __wfe()
+    return has_room;
+}
+
+static inline bool queue_try_remove_u32(queue_t *q, void *data) {
+    // Non-blocking remove of the element at the read pointer; false if the queue is empty.
+    uint32_t lock_state = spin_lock_blocking(q->core.spin_lock);
+    const bool has_data = queue_get_level_unsafe(q) != 0;
+    if (has_data) {
+        *(uint32_t *)data = ((uint32_t *)q->data)[q->rptr];
+        q->rptr = _queue_inc_index_u32(q, q->rptr);
+    }
+    spin_unlock(q->core.spin_lock, lock_state);
+    if (has_data) __sev(); // signal an event for anything waiting in __wfe()
+    return has_data;
+}
+
+static inline bool queue_try_peek_u32(queue_t *q, void *data) {
+    // Copy out the element at the read pointer without removing it; false if empty.
+    uint32_t lock_state = spin_lock_blocking(q->core.spin_lock);
+    const bool has_data = queue_get_level_unsafe(q) != 0;
+    if (has_data) {
+        *(uint32_t *)data = ((uint32_t *)q->data)[q->rptr];
+    }
+    spin_unlock(q->core.spin_lock, lock_state);
+    return has_data;
+}
+
+// Add one 32-bit element, waiting for as long as the queue is full.
+static inline void queue_add_blocking_u32(queue_t *q, void *data) {
+    while (!queue_try_add_u32(q, data)) {
+        // Full: wait for an event (queue_try_remove_u32 issues SEV on
+        // success), then try again.
+        __wfe();
+    }
+}
+
+// Remove one 32-bit element, waiting for as long as the queue is empty.
+static inline void queue_remove_blocking_u32(queue_t *q, void *data) {
+    while (!queue_try_remove_u32(q, data)) {
+        // Empty: wait for an event (queue_try_add_u32 issues SEV on
+        // success), then try again.
+        __wfe();
+    }
+}
+
+// Read, without removing, the next 32-bit element; waits while the queue is empty.
+static inline void queue_peek_blocking_u32(queue_t *q, void *data) {
+    while (!queue_try_peek_u32(q, data)) {
+        // Empty: wait for an event (queue_try_add_u32 issues SEV on
+        // success), then try again.
+        __wfe();
+    }
+}
+
+#endif