Remove soft link to libdvi (copy full directory instead)

For Arduino Library Manager compliance
2023-03-09 15:00:54 -08:00 · 2023-03-09 15:00:54 -08:00 · bb7dc7c20d
commit bb7dc7c20d
parent 506fca674a
20 changed files with 2661 additions and 3 deletions
--- a/Readme.md
+++ b/Readme.md
@ -19,8 +19,7 @@ RP2040 core).
 Changes vs main PicoDVI repo:
 - Add library.properties file, src and examples directories per Arduino
 requirements.
- software/libdvi is soft-linked into src so Arduino IDE can compile these
+- A full copy of software/libdvi is made in src so Arduino IDE can compile these parts (originally this was a soft link, but Arduino Library Manager does not approve).
 If any updates are made in the original PicoDVI libdvi directory, copy them here!
 - The file dvi_serialiser.pio.h, normally not part of the distribution and
 generated during the Pico SDK build process, is provided here for Arduino
 build to work. If any changes are made in dvi_serialiser.pio (either here
--- a/src/libdvi
+++ b/src/libdvi
@ -1 +0,0 @@
 ../software/libdvi
--- a/src/libdvi/CMakeLists.txt
+++ b/src/libdvi/CMakeLists.txt
@ -0,0 +1,33 @@
 # Note we are using INTERFACE so that the library can be configured per-app
 # with compile-time defines
 add_library(libdvi INTERFACE)
 # Sources, headers and assembly attached to the INTERFACE target; they are
 # compiled as part of whichever target links against libdvi.
 target_sources(libdvi INTERFACE
 	${CMAKE_CURRENT_LIST_DIR}/dvi.c
 	${CMAKE_CURRENT_LIST_DIR}/dvi.h
 	${CMAKE_CURRENT_LIST_DIR}/dvi_config_defs.h
 	${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.c
 	${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.h
 	${CMAKE_CURRENT_LIST_DIR}/dvi_timing.c
 	${CMAKE_CURRENT_LIST_DIR}/dvi_timing.h
 	${CMAKE_CURRENT_LIST_DIR}/tmds_encode.S
 	${CMAKE_CURRENT_LIST_DIR}/tmds_encode.c
 	${CMAKE_CURRENT_LIST_DIR}/tmds_encode.h
 	${CMAKE_CURRENT_LIST_DIR}/tmds_table.h
 	${CMAKE_CURRENT_LIST_DIR}/tmds_table_fullres.h
 	${CMAKE_CURRENT_LIST_DIR}/util_queue_u32_inline.h
 	)
 # Let consumers #include the libdvi headers by bare file name.
 target_include_directories(libdvi INTERFACE ${CMAKE_CURRENT_LIST_DIR})
 # Libraries that every consumer of libdvi is linked against.
 target_link_libraries(libdvi INTERFACE
 	pico_base_headers
 	pico_util
 	hardware_dma
 	hardware_interp
 	hardware_pio
 	hardware_pwm
 	)
 # Generate the C headers for the two PIO programs as part of the build.
 pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.pio)
 pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/tmds_encode_1bpp.pio)
--- a/src/libdvi/dvi.c
+++ b/src/libdvi/dvi.c
@ -0,0 +1,255 @@
 #include <stdlib.h>
 #include "hardware/dma.h"
 #include "hardware/irq.h"
 #include "dvi.h"
 #include "dvi_timing.h"
 #include "dvi_serialiser.h"
 #include "tmds_encode.h"
 // Adafruit PicoDVI fork requires a couple global items run-time configurable:
 // Both start out at their compile-time defaults from dvi_config_defs.h.
 // dvi_vertical_repeat is used as a modulus in the DMA IRQ handler and
 // dvi_monochrome_tmds selects the TMDS buffer size in dvi_init().
 uint8_t dvi_vertical_repeat = DVI_VERTICAL_REPEAT;
 bool    dvi_monochrome_tmds = DVI_MONOCHROME_TMDS;
 // Time-critical functions pulled into RAM but each in a unique section to
 // allow garbage collection
 #define __dvi_func(f) __not_in_flash_func(f)
 #define __dvi_func_x(f) __scratch_x(__STRING(f)) f
 // We require exclusive use of a DMA IRQ line. (you wouldn't want to share
 // anyway). It's possible in theory to hook both IRQs and have two DVI outs.
 // Slot 0 holds the instance registered on DMA_IRQ_0, slot 1 the one
 // registered on the other DMA IRQ (see dvi_register_irqs_this_core()).
 static struct dvi_inst *dma_irq_privdata[2];
 // Forward declarations of the two IRQ entry points defined at the end of
 // this file.
 static void dvi_dma0_irq();
 static void dvi_dma1_irq();
 // Initialise a DVI instance: timing state, serialiser, per-lane DMA channels,
 // the four queues, the DMA control-block lists and the TMDS buffer pool.
 // inst->timing and inst->ser_cfg are read here but never assigned, so the
 // caller must fill them in first. inst->scanline_callback is not assigned
 // here either.
 // spinlock_tmds_queue is shared by q_tmds_valid and q_tmds_free;
 // spinlock_colour_queue is shared by q_colour_valid and q_colour_free.
 // Calls panic() if a TMDS buffer cannot be allocated.
 void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue) {
 	dvi_timing_state_init(&inst->timing_state);
 	dvi_serialiser_init(&inst->ser_cfg);
 	// Each lane gets one control channel and one data channel, and records the
 	// TX FIFO address and DREQ of the state machine serialising that lane.
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		inst->dma_cfg[i].chan_ctrl = dma_claim_unused_channel(true);
 		inst->dma_cfg[i].chan_data = dma_claim_unused_channel(true);
 		inst->dma_cfg[i].tx_fifo = (void*)&inst->ser_cfg.pio->txf[inst->ser_cfg.sm_tmds[i]];
 		inst->dma_cfg[i].dreq = pio_get_dreq(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i], true);
 	}
 	inst->late_scanline_ctr = 0;
 	inst->tmds_buf_release_next = NULL;
 	inst->tmds_buf_release = NULL;
 	// All queues carry pointer-sized entries and are 8 entries deep.
 	queue_init_with_spinlock(&inst->q_tmds_valid,   sizeof(void*),  8, spinlock_tmds_queue);
 	queue_init_with_spinlock(&inst->q_tmds_free,    sizeof(void*),  8, spinlock_tmds_queue);
 	queue_init_with_spinlock(&inst->q_colour_valid, sizeof(void*),  8, spinlock_colour_queue);
 	queue_init_with_spinlock(&inst->q_colour_free,  sizeof(void*),  8, spinlock_colour_queue);
 	// Pre-build the four DMA lists the IRQ handler switches between.
 	dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, true, &inst->dma_list_vblank_sync);
 	dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, false, &inst->dma_list_vblank_nosync);
 	// SRAM_BASE is only a placeholder data address: the IRQ handler passes the
 	// real TMDS buffer to dvi_update_scanline_data_dma() before this list is
 	// loaded. The two branches differ only in the pointer cast.
 #if defined(ARDUINO)
 	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (uint32_t*)SRAM_BASE, &inst->dma_list_active);
 #else
 	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (void*)SRAM_BASE, &inst->dma_list_active);
 #endif
 	// Built with a NULL buffer; loaded when no encoded scanline is available.
 	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, NULL, &inst->dma_list_error);
 	// Allocate the TMDS buffer pool and hand every buffer to the free queue.
 	// A monochrome buffer holds one lane's worth of words, a colour buffer
 	// three lanes' worth.
 	for (int i = 0; i < DVI_N_TMDS_BUFFERS; ++i) {
 		void *tmdsbuf;
 		if (dvi_monochrome_tmds)
 			tmdsbuf = malloc(inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t));
 		else
 			tmdsbuf = malloc(3 * inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t));
 		if (!tmdsbuf)
 			panic("TMDS buffer allocation failed");
 		queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
 	}
 }
 // The IRQs will run on whichever core calls this function (this is why it's
 // called separately from dvi_init)
 // Route the DVI DMA interrupt to the calling core.
 // inst:    instance previously set up with dvi_init().
 // irq_num: DMA_IRQ_0 selects the first handler; any other value takes the
 //          DMA_IRQ_1 path below.
 // NOTE(review): irq_set_enabled() is called with irq_num as given, so a value
 // other than DMA_IRQ_0 / DMA_IRQ_1 would install the DMA_IRQ_1 handler but
 // enable a different interrupt line -- confirm callers only pass those two.
 void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num) {
 	// Only the sync lane's data channel raises the interrupt; the mask of all
 	// six DVI channels bounds which enable bits get rewritten.
 	uint32_t mask_sync_channel = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data;
 	uint32_t mask_all_channels = 0;
 	for (int i = 0; i < N_TMDS_LANES; ++i)
 		mask_all_channels |= 1u << inst->dma_cfg[i].chan_ctrl | 1u << inst->dma_cfg[i].chan_data;
 	// NOTE(review): the pending flag is acknowledged through ints0 even when
 	// irq_num selects the second IRQ line -- confirm this is intended.
 	dma_hw->ints0 = mask_sync_channel;
 	if (irq_num == DMA_IRQ_0) {
 		hw_write_masked(&dma_hw->inte0, mask_sync_channel, mask_all_channels);
 		dma_irq_privdata[0] = inst;
 		irq_set_exclusive_handler(DMA_IRQ_0, dvi_dma0_irq);
 	}
 	else {
 		hw_write_masked(&dma_hw->inte1, mask_sync_channel, mask_all_channels);
 		dma_irq_privdata[1] = inst;
 		irq_set_exclusive_handler(DMA_IRQ_1, dvi_dma1_irq);
 	}
 	irq_set_enabled(irq_num, true);
 }
 // Set up control channels to make transfers to data channels' control
 // registers (but don't trigger the control channels -- this is done either by
 // data channel CHAIN_TO or an initial write to MULTI_CHAN_TRIGGER)
 // Point each lane's control channel at that lane's entries in list `l`.
 // dma_cfg: per-lane channel assignments (chan_ctrl / chan_data).
 // l:       DMA list whose per-lane part is obtained with dvi_lane_from_list().
 // The channels are configured but not started (last argument is false).
 static inline void __attribute__((always_inline)) _dvi_load_dma_op(const struct dvi_lane_dma_cfg dma_cfg[], struct dvi_scanline_dma_list *l) {
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		dma_channel_config cfg = dma_channel_get_default_config(dma_cfg[i].chan_ctrl);
 		channel_config_set_ring(&cfg, true, 4); // 16-byte write wrap
 		channel_config_set_read_increment(&cfg, true);
 		channel_config_set_write_increment(&cfg, true);
 		// Write target is the data channel's own register block; the read
 		// source is this lane's slice of the list.
 		dma_channel_configure(
 			dma_cfg[i].chan_ctrl,
 			&cfg,
 			&dma_hw->ch[dma_cfg[i].chan_data],
 			dvi_lane_from_list(l, i),
 			4, // Configure all 4 registers then halt until next CHAIN_TO
 			false
 		);
 	}
 }
 // Setup first set of control block lists, configure the control channels, and
 // trigger them. Control channels will subsequently be triggered only by DMA
 // CHAIN_TO on data channel completion. IRQ handler *must* be prepared before
 // calling this. (Hooked to DMA IRQ0)
 // Begin output: load the "vblank, no sync" list into the control channels,
 // trigger them, wait for the serialiser FIFOs to fill, then enable the
 // serialiser. The IRQ handler must already be registered.
 void dvi_start(struct dvi_inst *inst) {
 	_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);
 	// Trigger the control channel of every lane together.
 	uint32_t ctrl_chan_mask = 0;
 	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
 		ctrl_chan_mask |= (1u << inst->dma_cfg[lane].chan_ctrl);
 	}
 	dma_start_channel_mask(ctrl_chan_mask);
 	// Shifting out must not begin on a FIFO that could run dry, so spin until
 	// each lane's TX FIFO reports full.
 	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
 		while (!pio_sm_is_tx_fifo_full(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[lane])) {
 			tight_loop_contents();
 		}
 	}
 	dvi_serialiser_enable(&inst->ser_cfg, true);
 }
 static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *inst, uint32_t *scanbuf) {
 	uint32_t *tmdsbuf;
 	queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
 	uint pixwidth = inst->timing->h_active_pixels;
 	uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
 	// Scanline buffers are half-resolution; the functions take the number of *input* pixels as parameter.
 	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_8BPP_BLUE_MSB,  DVI_8BPP_BLUE_LSB );
 	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_8BPP_GREEN_MSB, DVI_8BPP_GREEN_LSB);
 	tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_8BPP_RED_MSB,   DVI_8BPP_RED_LSB  );
 	queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
 }
 static inline void __dvi_func_x(_dvi_prepare_scanline_16bpp)(struct dvi_inst *inst, uint32_t *scanbuf) {
 	uint32_t *tmdsbuf;
 	queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
 	uint pixwidth = inst->timing->h_active_pixels;
 	uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
 	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_16BPP_BLUE_MSB,  DVI_16BPP_BLUE_LSB );
 	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_16BPP_GREEN_MSB, DVI_16BPP_GREEN_LSB);
 	tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_16BPP_RED_MSB,   DVI_16BPP_RED_LSB  );
 	queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
 }
 // "Worker threads" for TMDS encoding (core enters and never returns, but still handles IRQs)
 // Version where each record in q_colour_valid is one scanline:
 // 8bpp encode loop; never returns. Each iteration pops one scanline buffer
 // from q_colour_valid, TMDS-encodes it and gives the buffer back through
 // q_colour_free.
 void __dvi_func(dvi_scanbuf_main_8bpp)(struct dvi_inst *inst) {
 	// Line counter wraps at the active height; nothing else reads it.
 	uint line = 0;
 	for (;;) {
 		uint32_t *colour_buf;
 		queue_remove_blocking_u32(&inst->q_colour_valid, &colour_buf);
 		_dvi_prepare_scanline_8bpp(inst, colour_buf);
 		queue_add_blocking_u32(&inst->q_colour_free, &colour_buf);
 		line += 1;
 		if (line == inst->timing->v_active_lines)
 			line = 0;
 	}
 	__builtin_unreachable();
 }
 // Ugh copy/paste but it lets us garbage collect the TMDS stuff that is not being used from .scratch_x
 // 16bpp encode loop; never returns. Each iteration pops one scanline buffer
 // from q_colour_valid, TMDS-encodes it and gives the buffer back through
 // q_colour_free.
 void __dvi_func(dvi_scanbuf_main_16bpp)(struct dvi_inst *inst) {
 	// Line counter wraps at the active height; nothing else reads it.
 	uint line = 0;
 	for (;;) {
 		uint32_t *colour_buf;
 		queue_remove_blocking_u32(&inst->q_colour_valid, &colour_buf);
 		_dvi_prepare_scanline_16bpp(inst, colour_buf);
 		queue_add_blocking_u32(&inst->q_colour_free, &colour_buf);
 		line += 1;
 		if (line == inst->timing->v_active_lines)
 			line = 0;
 	}
 	__builtin_unreachable();
 }
 // Shared body of the two DMA IRQ entry points. Steps the timing state,
 // returns finished TMDS buffers to the free queue, picks the buffer (if any)
 // for the upcoming line and loads the matching DMA list into the control
 // channels.
 // NOTE(review): dvi_vertical_repeat is a run-time variable used as a modulus
 // below; a value of 0 would divide by zero -- confirm callers never set it
 // to 0.
 static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) {
 	// Every fourth interrupt marks the start of the horizontal active region. We
 	// now have until the end of this region to generate DMA blocklist for next
 	// scanline.
 	dvi_timing_state_advance(inst->timing, &inst->timing_state);
 	// Two-stage release: a buffer sits in tmds_buf_release_next for one IRQ,
 	// moves to tmds_buf_release on the next, and is freed on the one after.
 	if (inst->tmds_buf_release && !queue_try_add_u32(&inst->q_tmds_free, &inst->tmds_buf_release))
 		panic("TMDS free queue full in IRQ!");
 	inst->tmds_buf_release = inst->tmds_buf_release_next;
 	inst->tmds_buf_release_next = NULL;
 	// Make sure all three channels have definitely loaded their last block
 	// (should be within a few cycles of one another)
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
 			tight_loop_contents();
 	}
 	uint32_t *tmdsbuf;
 	// Discard one queued buffer for every line that was previously missed.
 	while (inst->late_scanline_ctr > 0 && queue_try_remove_u32(&inst->q_tmds_valid, &tmdsbuf)) {
 		// If we displayed this buffer then it would be in the wrong vertical
 		// position on-screen. Just pass it back.
 		queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
 		--inst->late_scanline_ctr;
 	}
 	if (inst->timing_state.v_state != DVI_STATE_ACTIVE) {
 		// Don't care
 		tmdsbuf = NULL;
 	}
 	else if (queue_try_peek_u32(&inst->q_tmds_valid, &tmdsbuf)) {
 		// The buffer is only peeked while it is still being repeated; on the
 		// last repeat it is removed and scheduled for release.
 		if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) {
 			queue_remove_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
 			inst->tmds_buf_release_next = tmdsbuf;
 		}
 	}
 	else {
 		// No valid scanline was ready (generates solid red scanline)
 		tmdsbuf = NULL;
 		// Count the miss once per source line, on its last repeat.
 		if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1)
 			++inst->late_scanline_ctr;
 	}
 	switch (inst->timing_state.v_state) {
 		case DVI_STATE_ACTIVE:
 			if (tmdsbuf) {
 				// Patch the active list with this buffer, then load it.
 				dvi_update_scanline_data_dma(inst->timing, tmdsbuf, &inst->dma_list_active);
 				_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_active);
 			}
 			else {
 				_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_error);
 			}
 			// Optional user hook, invoked on the last repeat of each active line.
 			if (inst->scanline_callback && inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) {
 				inst->scanline_callback();
 			}
 			break;
 		case DVI_STATE_SYNC:
 			_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_sync);
 			break;
 		default:
 			_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);
 			break;
 	}
 }
 // DMA_IRQ_0 entry point: acknowledge the sync lane's data channel in ints0,
 // then run the shared handler for the instance stored in slot 0.
 static void __dvi_func(dvi_dma0_irq)() {
 	struct dvi_inst *const dvi = dma_irq_privdata[0];
 	const uint32_t ack_mask = 1u << dvi->dma_cfg[TMDS_SYNC_LANE].chan_data;
 	dma_hw->ints0 = ack_mask;
 	dvi_dma_irq_handler(dvi);
 }
 // DMA_IRQ_1 entry point: acknowledge the sync lane's data channel in ints1,
 // then run the shared handler for the instance stored in slot 1.
 static void __dvi_func(dvi_dma1_irq)() {
 	struct dvi_inst *const dvi = dma_irq_privdata[1];
 	const uint32_t ack_mask = 1u << dvi->dma_cfg[TMDS_SYNC_LANE].chan_data;
 	dma_hw->ints1 = ack_mask;
 	dvi_dma_irq_handler(dvi);
 }
--- a/src/libdvi/dvi.h
+++ b/src/libdvi/dvi.h
@ -0,0 +1,81 @@
 #ifndef _DVI_H
 #define _DVI_H
 #define N_TMDS_LANES 3
 #define TMDS_SYNC_LANE 0 // blue!
 #include "pico/util/queue.h"
 #include "dvi_config_defs.h"
 #include "dvi_timing.h"
 #include "dvi_serialiser.h"
 #include "util_queue_u32_inline.h"
 // Signature of the optional per-scanline hook stored in dvi_inst.
 typedef void (*dvi_callback_t)(void);
 // All state for one DVI output. dvi_init() reads `timing` and `ser_cfg` but
 // does not assign them, and dvi.c never assigns `scanline_callback`, so the
 // owner of the struct must set those three before initialising.
 struct dvi_inst {
 	// Config ---
 	// Video mode timings; read throughout dvi.c, never written there.
 	const struct dvi_timing *timing;
 	// Per-lane DMA channel numbers, TX FIFO address and DREQ (filled in by
 	// dvi_init()).
 	struct dvi_lane_dma_cfg dma_cfg[N_TMDS_LANES];
 	struct dvi_timing_state timing_state;
 	// PIO, state machine and pin assignment for the serialiser.
 	struct dvi_serialiser_cfg ser_cfg;
 	// Called in the DMA IRQ once per scanline -- careful with the run time!
 	// Skipped when NULL.
 	dvi_callback_t scanline_callback;
 	// State ---
 	// DMA lists built once in dvi_init() and selected by the IRQ handler:
 	// vertical blanking with / without sync, active video, and the fallback
 	// used when no encoded scanline is ready.
 	struct dvi_scanline_dma_list dma_list_vblank_sync;
 	struct dvi_scanline_dma_list dma_list_vblank_nosync;
 	struct dvi_scanline_dma_list dma_list_active;
 	struct dvi_scanline_dma_list dma_list_error;
 	// After a TMDS buffer has been enqueued via a control block for the last
 	// time, two IRQs must go by before freeing. The first indicates the control
 	// block for this buf has been loaded, and the second occurs some time after
 	// the actual data DMA transfer has completed.
 	uint32_t *tmds_buf_release_next;
 	uint32_t *tmds_buf_release;
 	// Remember how far behind the source is on TMDS scanlines, so we can output
 	// solid colour until they catch up (rather than dying spectacularly)
 	uint late_scanline_ctr;
 	// Encoded scanlines:
 	queue_t q_tmds_valid;
 	queue_t q_tmds_free;
 	// Either scanline buffers or frame buffers:
 	queue_t q_colour_valid;
 	queue_t q_colour_free;
 };
 #if defined(__cplusplus)
 extern "C"
 {
 #endif
 // Set up data structures and hardware for DVI.
 // inst->timing and inst->ser_cfg must be filled in before the call.
 // spinlock_tmds_queue is used for both TMDS queues and spinlock_colour_queue
 // for both colour queues. Allocates DVI_N_TMDS_BUFFERS TMDS buffers with
 // malloc() and panics if an allocation fails.
 void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue);
 // Call this after calling dvi_init(). DVI DMA interrupts will be routed to
 // whichever core called this function. Registers an exclusive IRQ handler.
 // irq_num is expected to be DMA_IRQ_0 or DMA_IRQ_1.
 void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num);
 // Start actually wiggling TMDS pairs. Call this once you have initialised the
 // DVI, have registered the IRQs, and are producing rendered scanlines.
 void dvi_start(struct dvi_inst *inst);
 // TMDS encode worker function: core enters and doesn't leave, but still
 // responds to IRQs. Repeatedly pop a scanline buffer from q_colour_valid,
 // TMDS encode it, and pass it to the tmds valid queue.
 void dvi_scanbuf_main_8bpp(struct dvi_inst *inst);
 void dvi_scanbuf_main_16bpp(struct dvi_inst *inst);
 // Same as above, but each q_colour_valid entry is a framebuffer
 // NOTE(review): no definition of these two functions is visible in dvi.c --
 // confirm they are implemented somewhere before calling them.
 void dvi_framebuf_main_8bpp(struct dvi_inst *inst);
 void dvi_framebuf_main_16bpp(struct dvi_inst *inst);
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/src/libdvi/dvi_config_defs.h
+++ b/src/libdvi/dvi_config_defs.h
@ -0,0 +1,151 @@
 #ifndef _DVI_CONFIG_DEFS_H
 #define _DVI_CONFIG_DEFS_H
 // Compile-time configuration definitions for libdvi. This file provides
 // defaults -- you can override using a board header, or setting compile
 // definitions directly from the commandline (e.g. using CMake
 // target_compile_definitions())
 // Pull in base headers to make sure board definitions override the
 // definitions provided here. Note this file is included in asm and C.
 #include "hardware/platform_defs.h"
 #include "pico/config.h"
 // ----------------------------------------------------------------------------
 // General DVI defines
 // How many times to output the same TMDS buffer before recycling it onto the
 // free queue. Pixels are repeated vertically if this is >1.
 // In this fork the macro only provides the initial value of the run-time
 // variable dvi_vertical_repeat defined in dvi.c.
 #ifndef DVI_VERTICAL_REPEAT
 #define DVI_VERTICAL_REPEAT 2
 #endif
 // Number of TMDS buffers to allocate (malloc()) in DVI init. You can set this
 // to 0 if you want to allocate your own (e.g. if you want static buffers)
 #ifndef DVI_N_TMDS_BUFFERS
 #define DVI_N_TMDS_BUFFERS 3
 #endif
 // If 1, replace the DVI serialiser with a 10n1 UART (1 start bit, 10 data
 // bits, 1 stop bit) so the stream can be dumped and analysed easily.
 #ifndef DVI_SERIAL_DEBUG
 #define DVI_SERIAL_DEBUG 0
 #endif
 // If 1, the same TMDS symbols are sent to all 3 lanes during the horizontal
 // active period. This means only monochrome colour is available, but the TMDS
 // buffers are 3 times smaller as a result, and the performance requirements
 // for encode are also cut by 3.
 // In this fork the macro only provides the initial value of the run-time
 // variable dvi_monochrome_tmds defined in dvi.c.
 #ifndef DVI_MONOCHROME_TMDS
 #define DVI_MONOCHROME_TMDS 0
 #endif
 // By default, we assume each 32-bit word written to a PIO FIFO contains 2x
 // 10-bit TMDS symbols, concatenated into the lower 20 bits, least-significant
 // first. This is convenient if you are generating two or more pixels at once,
 // e.g. using the pixel-doubling TMDS encode. You can change this value to 1
 // (so each word contains 1 symbol) for e.g. full resolution RGB encode. Note
 // that this value needs to divide the DVI horizontal timings, so is limited
 // to 1 or 2.
 #ifndef DVI_SYMBOLS_PER_WORD
 #define DVI_SYMBOLS_PER_WORD 2
 #endif
 #if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
 #error "Unsupported value for DVI_SYMBOLS_PER_WORD"
 #endif
 // ----------------------------------------------------------------------------
 // Pixel component layout
 // By default we go R, G, B from MSB -> LSB. Override to e.g. swap RGB <-> BGR
 // Default 8bpp layout: RGB332, {r[2:0], g[2:0], b[1:0]}
 // i.e. red in bits 7..5, green in bits 4..2, blue in bits 1..0.
 #ifndef DVI_8BPP_RED_MSB
 #define DVI_8BPP_RED_MSB 7
 #endif
 #ifndef DVI_8BPP_RED_LSB
 #define DVI_8BPP_RED_LSB 5
 #endif
 #ifndef DVI_8BPP_GREEN_MSB
 #define DVI_8BPP_GREEN_MSB 4
 #endif
 #ifndef DVI_8BPP_GREEN_LSB
 #define DVI_8BPP_GREEN_LSB 2
 #endif
 #ifndef DVI_8BPP_BLUE_MSB
 #define DVI_8BPP_BLUE_MSB 1
 #endif
 #ifndef DVI_8BPP_BLUE_LSB
 #define DVI_8BPP_BLUE_LSB 0
 #endif
 // Default 16bpp layout: RGB565, {r[4:0], g[5:0], b[4:0]}
 #ifndef DVI_16BPP_RED_MSB
 #define DVI_16BPP_RED_MSB 15
 #endif
 #ifndef DVI_16BPP_RED_LSB
 #define DVI_16BPP_RED_LSB 11
 #endif
 #ifndef DVI_16BPP_GREEN_MSB
 #define DVI_16BPP_GREEN_MSB 10
 #endif
 #ifndef DVI_16BPP_GREEN_LSB
 #define DVI_16BPP_GREEN_LSB 5
 #endif
 #ifndef DVI_16BPP_BLUE_MSB
 #define DVI_16BPP_BLUE_MSB 4
 #endif
 #ifndef DVI_16BPP_BLUE_LSB
 #define DVI_16BPP_BLUE_LSB 0
 #endif
 // Default 1bpp layout: bitwise little-endian, i.e. least significant bit of
 // each word is the first (leftmost) of a block of 32 pixels.
 // If 1, reverse the order of pixels within each byte. Order of bytes within
 // each word is still little-endian.
 #ifndef DVI_1BPP_BIT_REVERSE
 #define DVI_1BPP_BIT_REVERSE 1 // Adafruit_GFX GFXcanvas1 requires this 1
 #endif
 // ----------------------------------------------------------------------------
 // TMDS encode controls
 // Number of TMDS loop bodies between branches. cmp + branch costs 3 cycles,
 // so you can easily save 10% of encode time by bumping this. Note that body
 // will *already* produce multiple pixels, and total symbols per iteration
 // must cleanly divide symbols per scanline, else the loop won't terminate.
 // Point gun away from foot.
 #ifndef TMDS_ENCODE_UNROLL
 #define TMDS_ENCODE_UNROLL 1
 #endif
 // If 1, don't save/restore the interpolators on full-resolution TMDS encode.
 // Speed hack. The TMDS code uses both interpolators, for each of the 3 data
 // channels, so this define avoids 6 save/restores per scanline.
 #ifndef TMDS_FULLRES_NO_INTERP_SAVE
 #define TMDS_FULLRES_NO_INTERP_SAVE 0
 #endif
 // If 1, don't DC-balance the output of full resolution encode. Hilariously
 // noncompliant, but Dell Ultrasharp -- the honey badger of computer monitors
 // -- does not seem to mind (it helps that we DC-couple). Another speed hack,
 // useful when you are trying to get everything else up to speed.
 #ifndef TMDS_FULLRES_NO_DC_BALANCE
 #define TMDS_FULLRES_NO_DC_BALANCE 0
 #endif
 #endif
--- a/src/libdvi/dvi_serialiser.c
+++ b/src/libdvi/dvi_serialiser.c
@ -0,0 +1,73 @@
 #include "pico.h"
 #include "hardware/pio.h"
 #include "hardware/gpio.h"
 #include "hardware/pwm.h"
 #include "hardware/structs/padsbank0.h"
 #include "dvi.h"
 #include "dvi_serialiser.h"
 #include "dvi_serialiser.pio.h"
 // Configure the pad of one DVI output pin and optionally invert its output.
 static void dvi_configure_pad(uint gpio, bool invert) {
 	// Fields rewritten: drive strength, fast-slew enable, input enable.
 	// Writing zero to all three selects 2 mA drive, slew rate limiting and a
 	// disabled digital receiver (this seems fine even at 720p30, and the 3V3
 	// LDO doesn't get warm like when turning all the GPIOs up to 11).
 	const uint32_t pad_fields =
 		PADS_BANK0_GPIO0_DRIVE_BITS |
 		PADS_BANK0_GPIO0_SLEWFAST_BITS |
 		PADS_BANK0_GPIO0_IE_BITS;
 	const uint32_t pad_value = (0 << PADS_BANK0_GPIO0_DRIVE_LSB);
 	hw_write_masked(&padsbank0_hw->io[gpio], pad_value, pad_fields);
 	if (invert) {
 		gpio_set_outover(gpio, GPIO_OVERRIDE_INVERT);
 	}
 	else {
 		gpio_set_outover(gpio, GPIO_OVERRIDE_NORMAL);
 	}
 }
 // Load the serialiser PIO program, set up one state machine and one pin pair
 // per TMDS lane, and prepare a PWM slice to output the pixel clock.
 // cfg: pio, sm_tmds, pins_tmds, pins_clk and invert_diffpairs are inputs;
 //      prog_offs is written with the offset the program was loaded at.
 // Nothing is enabled here: the state machines are left disabled and the PWM
 // slice is initialised without starting it (see dvi_serialiser_enable()).
 void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg) {
 	// DVI_SERIAL_DEBUG selects the UART-style debug program at compile time.
 #if DVI_SERIAL_DEBUG
 	uint offset = pio_add_program(cfg->pio, &dvi_serialiser_debug_program);
 #else
 	uint offset = pio_add_program(cfg->pio, &dvi_serialiser_program);
 #endif
 	cfg->prog_offs = offset;
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		pio_sm_claim(cfg->pio, cfg->sm_tmds[i]);
 		dvi_serialiser_program_init(
 			cfg->pio,
 			cfg->sm_tmds[i],
 			offset,
 			cfg->pins_tmds[i],
 			DVI_SERIAL_DEBUG
 		);
 		// Each lane uses two consecutive GPIOs starting at pins_tmds[i].
 		dvi_configure_pad(cfg->pins_tmds[i], cfg->invert_diffpairs);
 		dvi_configure_pad(cfg->pins_tmds[i] + 1, cfg->invert_diffpairs);
 	}
 	// Use a PWM slice to drive the pixel clock. Both GPIOs must be on the same
 	// slice (lower-numbered GPIO must be even).
 	assert(cfg->pins_clk % 2 == 0);
 	uint slice = pwm_gpio_to_slice_num(cfg->pins_clk);
 	// 5 cycles high, 5 low. Invert one channel so that we get complementary outputs.
 	pwm_config pwm_cfg = pwm_get_default_config();
 	pwm_config_set_output_polarity(&pwm_cfg, true, false);
 	pwm_config_set_wrap(&pwm_cfg, 9);
 	pwm_init(slice, &pwm_cfg, false);
 	pwm_set_both_levels(slice, 5, 5);
 	// Hand both clock pins to the PWM and apply the same pad settings as the
 	// data pins.
 	for (uint i = cfg->pins_clk; i <= cfg->pins_clk + 1; ++i) {
 		gpio_set_function(i, GPIO_FUNC_PWM);
 		dvi_configure_pad(i, cfg->invert_diffpairs);
 	}
 }
 // Start or stop all lane state machines together, then switch the pixel-clock
 // PWM slice to the same state.
 void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable) {
 	// One SM_ENABLE bit per lane's state machine in the PIO CTRL register.
 	uint sm_enable_bits = 0;
 	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
 		sm_enable_bits |= 1u << (cfg->sm_tmds[lane] + PIO_CTRL_SM_ENABLE_LSB);
 	}
 	if (enable)
 		hw_set_bits(&cfg->pio->ctrl, sm_enable_bits);
 	else
 		hw_clear_bits(&cfg->pio->ctrl, sm_enable_bits);
 	pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), enable);
 }
--- a/src/libdvi/dvi_serialiser.h
+++ b/src/libdvi/dvi_serialiser.h
@ -0,0 +1,22 @@
 #ifndef _DVI_SERIALISER_H
 #define _DVI_SERIALISER_H
 #include "hardware/pio.h"
 #include "dvi_config_defs.h"
 #define N_TMDS_LANES 3
 // Hardware assignment for the TMDS serialiser. Everything except prog_offs
 // is supplied by the caller before dvi_serialiser_init().
 struct dvi_serialiser_cfg {
 	// PIO block that runs the serialiser program.
 	PIO pio;
 	// State machine index used for each TMDS lane.
 	uint sm_tmds[N_TMDS_LANES];
 	// First GPIO of each lane's pin pair; the second pin is this value + 1.
 	uint pins_tmds[N_TMDS_LANES];
 	// Lower GPIO of the clock pin pair; must be even.
 	uint pins_clk;
 	// When true, the output of every data and clock pin is inverted.
 	bool invert_diffpairs;
 	// Program load offset, written by dvi_serialiser_init().
 	uint prog_offs;
 };
 // C linkage guards: dvi.h includes this header outside its own extern "C"
 // block, so without them a C++ translation unit would see these functions
 // with C++ linkage even though they are implemented in C.
 #if defined(__cplusplus)
 extern "C"
 {
 #endif
 // Load the PIO program and configure state machines, pins and the clock PWM
 // described by cfg. Writes cfg->prog_offs. Does not start output.
 void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg);
 // Enable (true) or disable (false) the lane state machines and the
 // pixel-clock PWM slice.
 void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable);
 // NOTE(review): no definition is visible in dvi_serialiser.c -- confirm this
 // function exists before using it.
 uint32_t dvi_single_to_diff(uint32_t in);
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/src/libdvi/dvi_serialiser.pio
+++ b/src/libdvi/dvi_serialiser.pio
@ -0,0 +1,53 @@
 .program dvi_serialiser
 .side_set 2
 .origin 0
 ; Single-ended -> differential serial
 ; Each instruction shifts one bit into the program counter, so the next
 ; instruction executed is the one at address 0 or 1 depending on that bit;
 ; this is why the program must be loaded at origin 0. The two instructions
 ; drive complementary values onto the two side-set pins.
 	out pc, 1    side 0b10
 	out pc, 1    side 0b01
 .program dvi_serialiser_debug
 .side_set 1 opt
 ; The debug variant behaves as a UART with 1 start bit, 10 data bits, 1 stop
 ; bit, and 5/6ths the data throughput of the TMDS version.
 ; Side-set 1 forms the stop/idle level and side-set 0 the start bit; the ten
 ; data bits follow on the OUT pin mapping.
 	pull ifempty  side 1 ; Extend stop bit with FIFO stall
 	nop           side 0
 	out pins, 1          ; Unrolled because we require 1 bit / clk
 	out pins, 1
 	out pins, 1
 	out pins, 1
 	out pins, 1
 	out pins, 1
 	out pins, 1
 	out pins, 1
 	out pins, 1
 	out pins, 1
 % c-sdk {
 #include "dvi_config_defs.h"
 // Configure state machine `sm` of `pio` to run the serialiser program loaded
 // at `offset`, driving the pin pair data_pins / data_pins + 1. `debug`
 // selects the UART-style debug program's configuration. The state machine is
 // left disabled.
 static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
    // Preset the pair to 0b10, make both pins outputs and hand them to the PIO.
    pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
    pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
    pio_gpio_init(pio, data_pins);
    pio_gpio_init(pio, data_pins + 1);
    pio_sm_config c;
    if (debug) {
        c = dvi_serialiser_debug_program_get_default_config(offset);
    }
    else {
        c = dvi_serialiser_program_get_default_config(offset);
    }
    sm_config_set_sideset_pins(&c, data_pins);
    // Only the debug program uses OUT PINS, on a single pin.
    if (debug)
 	    sm_config_set_out_pins(&c, data_pins, 1);
    // Shift right; autopull is used except in debug mode (which pulls
    // explicitly); one FIFO word carries 10 bits per TMDS symbol.
    sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
    sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
    pio_sm_init(pio, sm, offset, &c);
    pio_sm_set_enabled(pio, sm, false);
 }
 %}
--- a/src/libdvi/dvi_serialiser.pio.h
+++ b/src/libdvi/dvi_serialiser.pio.h
@ -0,0 +1,101 @@
 // -------------------------------------------------- //
 // This file is autogenerated by pioasm; do not edit! //
 // -------------------------------------------------- //
 #pragma once
 #if !PICO_NO_HARDWARE
 #include "hardware/pio.h"
 #endif
 // -------------- //
 // dvi_serialiser //
 // -------------- //
 #define dvi_serialiser_wrap_target 0
 #define dvi_serialiser_wrap 1
 static const uint16_t dvi_serialiser_program_instructions[] = {
            //     .wrap_target
    0x70a1, //  0: out    pc, 1           side 2     
    0x68a1, //  1: out    pc, 1           side 1     
            //     .wrap
 };
 #if !PICO_NO_HARDWARE
 static const struct pio_program dvi_serialiser_program = {
    .instructions = dvi_serialiser_program_instructions,
    .length = 2,
    .origin = 0,
 };
 static inline pio_sm_config dvi_serialiser_program_get_default_config(uint offset) {
    pio_sm_config c = pio_get_default_sm_config();
    sm_config_set_wrap(&c, offset + dvi_serialiser_wrap_target, offset + dvi_serialiser_wrap);
    sm_config_set_sideset(&c, 2, false, false);
    return c;
 }
 #endif
 // -------------------- //
 // dvi_serialiser_debug //
 // -------------------- //
 #define dvi_serialiser_debug_wrap_target 0
 #define dvi_serialiser_debug_wrap 11
 static const uint16_t dvi_serialiser_debug_program_instructions[] = {
            //     .wrap_target
    0x98e0, //  0: pull   ifempty block   side 1     
    0xb042, //  1: nop                    side 0     
    0x6001, //  2: out    pins, 1                    
    0x6001, //  3: out    pins, 1                    
    0x6001, //  4: out    pins, 1                    
    0x6001, //  5: out    pins, 1                    
    0x6001, //  6: out    pins, 1                    
    0x6001, //  7: out    pins, 1                    
    0x6001, //  8: out    pins, 1                    
    0x6001, //  9: out    pins, 1                    
    0x6001, // 10: out    pins, 1                    
    0x6001, // 11: out    pins, 1                    
            //     .wrap
 };
 #if !PICO_NO_HARDWARE
 static const struct pio_program dvi_serialiser_debug_program = {
    .instructions = dvi_serialiser_debug_program_instructions,
    .length = 12,
    .origin = -1,
 };
 static inline pio_sm_config dvi_serialiser_debug_program_get_default_config(uint offset) {
    pio_sm_config c = pio_get_default_sm_config();
    sm_config_set_wrap(&c, offset + dvi_serialiser_debug_wrap_target, offset + dvi_serialiser_debug_wrap);
    sm_config_set_sideset(&c, 2, true, false);
    return c;
 }
 #include "dvi_config_defs.h"
 // Configure state machine `sm` of `pio` to run the serialiser program loaded
 // at `offset`, driving the pin pair data_pins / data_pins + 1. `debug`
 // selects the UART-style debug program's configuration. The state machine is
 // left disabled.
 static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
    // Preset the pair to 0b10, make both pins outputs and hand them to the PIO.
    pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
    pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
    pio_gpio_init(pio, data_pins);
    pio_gpio_init(pio, data_pins + 1);
    pio_sm_config c;
    if (debug) {
        c = dvi_serialiser_debug_program_get_default_config(offset);
    }
    else {
        c = dvi_serialiser_program_get_default_config(offset);
    }
    sm_config_set_sideset_pins(&c, data_pins);
    // Only the debug program uses OUT PINS, on a single pin.
    if (debug)
 	    sm_config_set_out_pins(&c, data_pins, 1);
    // Shift right; autopull is used except in debug mode (which pulls
    // explicitly); one FIFO word carries 10 bits per TMDS symbol.
    sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
    sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
    pio_sm_init(pio, sm, offset, &c);
    pio_sm_set_enabled(pio, sm, false);
 }
 #endif
--- a/src/libdvi/dvi_timing.c
+++ b/src/libdvi/dvi_timing.c
@ -0,0 +1,324 @@
 #include "dvi.h"
 #include "dvi_timing.h"
 #include "hardware/dma.h"
 // This file contains:
 // - Timing parameters for DVI modes (horizontal + vertical counts, best
 //   achievable bit clock from 12 MHz crystal)
 // - Helper functions for generating DMA lists based on these timings
 extern bool dvi_monochrome_tmds; // In dvi.c
 // Pull into RAM but apply unique section suffix to allow linker GC.
 // Both wrappers expand to __not_in_flash_func(); in this file __dvi_func is
 // applied to functions and __dvi_const to data tables.
 #define __dvi_func(x) __not_in_flash_func(x)
 #define __dvi_const(x) __not_in_flash_func(x)
 // VGA -- we do this mode properly, with a pretty comfortable clk_sys (252 MHz)
 // Totals from the fields below: 800 pixels per line, 525 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_640x480p_60hz) = {
 	.h_sync_polarity   = false,
 	.h_front_porch     = 16,
 	.h_sync_width      = 96,
 	.h_back_porch      = 48,
 	.h_active_pixels   = 640,
 	.v_sync_polarity   = false,
 	.v_front_porch     = 10,
 	.v_sync_width      = 2,
 	.v_back_porch      = 33,
 	.v_active_lines    = 480,
 	.bit_clk_khz       = 252000
 };
 // SVGA -- completely by-the-book but requires 400 MHz clk_sys
 // Totals from the fields below: 1060 pixels per line, 628 lines per frame.
 // NOTE(review): the VESA DMT 800x600@60 timing is usually quoted with a
 // 40-pixel front porch (1056-pixel line) and positive sync polarities --
 // confirm that h_front_porch = 44 and the false polarities are intentional.
 const struct dvi_timing __dvi_const(dvi_timing_800x600p_60hz) = {
 	.h_sync_polarity   = false,
 	.h_front_porch     = 44,
 	.h_sync_width      = 128,
 	.h_back_porch      = 88,
 	.h_active_pixels   = 800,
 	.v_sync_polarity   = false,
 	.v_front_porch     = 1,
 	.v_sync_width      = 4,
 	.v_back_porch      = 23,
 	.v_active_lines    = 600,
 	.bit_clk_khz       = 400000
 };
 // 800x480p 60 Hz (note this doesn't seem to be a CEA mode, I just used the
 // output of `cvt 800 480 60`), 295 MHz bit clock
 // Totals from the fields below: 992 pixels per line, 500 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_800x480p_60hz) = {
 	.h_sync_polarity = false,
 	.h_front_porch   = 24,
 	.h_sync_width    = 72,
 	.h_back_porch    = 96,
 	.h_active_pixels = 800,
 	.v_sync_polarity = true,
 	.v_front_porch   = 3,
 	.v_sync_width    = 10,
 	.v_back_porch    = 7,
 	.v_active_lines  = 480,
 	.bit_clk_khz     = 295200
 };
 // SVGA reduced blanking (355 MHz bit clock) -- valid CVT mode, less common
 // than fully-blanked SVGA, but doesn't require such a high system clock
 // Totals from the fields below: 960 pixels per line, 618 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_800x600p_reduced_60hz) = {
 	.h_sync_polarity   = true,
 	.h_front_porch     = 48,
 	.h_sync_width      = 32,
 	.h_back_porch      = 80,
 	.h_active_pixels   = 800,
 	.v_sync_polarity   = false,
 	.v_front_porch     = 3,
 	.v_sync_width      = 4,
 	.v_back_porch      = 11,
 	.v_active_lines    = 600,
 	.bit_clk_khz       = 354000
 };
 // Also known as qHD, a bit uncommon, but it's a nice modest-resolution 16:9
 // aspect mode. Pixel clock 37.3 MHz
 // Totals from the fields below: 1104 pixels per line, 563 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_960x540p_60hz) = {
 	.h_sync_polarity   = true,
 	.h_front_porch     = 16,
 	.h_sync_width      = 32,
 	.h_back_porch      = 96,
 	.h_active_pixels   = 960,
 	.v_sync_polarity   = true,
 	.v_front_porch     = 2,
 	.v_sync_width      = 6,
 	.v_back_porch      = 15,
 	.v_active_lines    = 540,
 	.bit_clk_khz       = 372000
 };
 // Note this is NOT the correct 720p30 CEA mode, but rather 720p60 run at half
 // pixel clock. Seems to be commonly accepted (and is a valid CVT mode). The
 // actual CEA mode is the same pixel clock as 720p60 but with >50% blanking,
 // which would require a clk_sys of 742 MHz!
 // Totals from the fields below: 1650 pixels per line, 750 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_1280x720p_30hz) = {
 	.h_sync_polarity   = true,
 	.h_front_porch     = 110,
 	.h_sync_width      = 40,
 	.h_back_porch      = 220,
 	.h_active_pixels   = 1280,
 	.v_sync_polarity   = true,
 	.v_front_porch     = 5,
 	.v_sync_width      = 5,
 	.v_back_porch      = 20,
 	.v_active_lines    = 720,
 	.bit_clk_khz       = 372000
 };
 // Reduced-blanking (CVT) 720p. You aren't supposed to use reduced blanking
 // modes below 60 Hz, but I won't tell anyone (and it works on the monitors
 // I've tried). This nets a lower system clock than regular 720p30 (319 MHz)
 // Totals from the fields below: 1440 pixels per line, 741 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_1280x720p_reduced_30hz) = {
 	.h_sync_polarity   = true,
 	.h_front_porch     = 48,
 	.h_sync_width      = 32,
 	.h_back_porch      = 80,
 	.h_active_pixels   = 1280,
 	.v_sync_polarity   = false,
 	.v_front_porch     = 3,
 	.v_sync_width      = 5,
 	.v_back_porch      = 13,
 	.v_active_lines    = 720,
 	.bit_clk_khz       = 319200
 };
 // This requires a spicy 488 MHz system clock and is illegal in most countries
 // (you need to have a very lucky piece of silicon to run this at 1.3 V, or
 // connect an external supply and give it a bit more juice)
 // Totals from the fields below: 1760 pixels per line, 926 lines per frame.
 const struct dvi_timing __dvi_const(dvi_timing_1600x900p_reduced_30hz) = {
 	.h_sync_polarity   = true,
 	.h_front_porch     = 48,
 	.h_sync_width      = 32,
 	.h_back_porch      = 80,
 	.h_active_pixels   = 1600,
 	.v_sync_polarity   = false,
 	.v_front_porch     = 3,
 	.v_sync_width      = 5,
 	.v_back_porch      = 18,
 	.v_active_lines    = 900,
 	.bit_clk_khz       = 488000
 };
 // ----------------------------------------------------------------------------
 // The DMA scheme is:
 //
 // - One channel transferring data to each of the three PIO state machines
 //   performing TMDS serialisation
 //
 // - One channel programming the registers of each of these data channels,
 //   triggered (CHAIN_TO) each time the corresponding data channel completes
 //
 // - Lanes 1 and 2 have one block for blanking and one for video data
 //
 // - Lane 0 has one block for each horizontal region (front porch, hsync, back
 //   porch, active)
 //
 // - The IRQ_QUIET flag is used to select which data block on the sync lane is
 //   allowed to generate an IRQ upon completion. This is the block immediately
 //   before the horizontal active region. The IRQ is entered at ~the same time
 //   as the last data transfer starts
 //
 // - The IRQ points the control channels at new blocklists for next scanline.
 //   The DMA starts the new list automatically at end-of-scanline, via
 //   CHAIN_TO.
 //
 // The horizontal active region is the longest continuous transfer, so this
 // gives the most time to handle the IRQ and load new blocklists.
 //
 // Note a null trigger IRQ is not suitable because we get that *after* the
 // last data transfer finishes, and the FIFOs bottom out very shortly
 // afterward. For pure DVI (four blocks per scanline), it works ok to take
 // four regular IRQs per scanline and return early from 3 of them, but this
 // breaks down when you have very short scanline sections like guard bands.
 // Each symbol appears twice, concatenated in one word. Note these must be in
 // RAM because they see a lot of DMA traffic
 // Indexed by (vsync << 1) | hsync, see get_ctrl_sym(). Each word holds the
 // same 10-bit symbol in bits 9:0 and bits 19:10.
 const uint32_t __dvi_const(dvi_ctrl_syms)[4] = {
 	0xd5354, // 0x354 twice: vsync = 0, hsync = 0
 	0x2acab, // 0x0ab twice: vsync = 0, hsync = 1
 	0x55154, // 0x154 twice: vsync = 1, hsync = 0
 	0xaaeab  // 0x2ab twice: vsync = 1, hsync = 1
 };
 // Output solid red scanline if we are given NULL for tmdsbuff
 // One entry per TMDS lane (lanes 0 and 1 carry 0x00, lane 2 carries 0xfc).
 // dvi_setup_scanline_for_active() points each lane's active-region transfer
 // at its entry and repeats it with a DMA read ring.
 #if DVI_SYMBOLS_PER_WORD == 2
 // Two 10-bit symbols packed per word: one word per lane, 4-byte read ring
 static uint32_t __dvi_const(empty_scanline_tmds)[3] = {
 	0x7fd00u, // 0x00, 0x00
 	0x7fd00u, // 0x00, 0x00
 	0xbfa01u  // 0xfc, 0xfc
 };
 #else
 // One symbol per word: two words per lane, 8-byte read ring. The array is
 // 8-byte aligned, presumably so each pair sits on a ring boundary.
 static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[6] = {
 	0x100u, 0x1ffu, // 0x00, 0x00
 	0x100u, 0x1ffu, // 0x00, 0x00
 	0x201u, 0x2feu  // 0xfc, 0xfc
 };
 #endif
 // Reset the vertical timing state: line counter cleared, positioned at the
 // first line of the vertical front porch.
 void dvi_timing_state_init(struct dvi_timing_state *t) {
 	t->v_ctr = 0;
 	t->v_state = DVI_STATE_FRONT_PORCH;
 }
 // Step the vertical state by one scanline. The line counter is incremented;
 // when it reaches the length of the current region (front porch, sync, back
 // porch or active) the state moves to the next region, wrapping from active
 // back to front porch, and the counter restarts at zero.
 void __dvi_func(dvi_timing_state_advance)(const struct dvi_timing *t, struct dvi_timing_state *s) {
 	unsigned int region_lines;
 	s->v_ctr++;
 	switch (s->v_state) {
 	case DVI_STATE_FRONT_PORCH:
 		region_lines = t->v_front_porch;
 		break;
 	case DVI_STATE_SYNC:
 		region_lines = t->v_sync_width;
 		break;
 	case DVI_STATE_BACK_PORCH:
 		region_lines = t->v_back_porch;
 		break;
 	case DVI_STATE_ACTIVE:
 		region_lines = t->v_active_lines;
 		break;
 	default:
 		// Not one of the four regions: only the counter advances
 		return;
 	}
 	if (s->v_ctr != region_lines)
 		return;
 	s->v_state = (s->v_state + 1) % DVI_STATE_COUNT;
 	s->v_ctr = 0;
 }
 // Zero every control block of every lane in the list.
 void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list) {
 	// {0} rather than {}: the empty initializer is only standard from C23 on
 	*dma_list = (struct dvi_scanline_dma_list){0};
 }
 // Return the address of the control-symbol word for the given sync levels.
 // Table index: bit 1 = vsync, bit 0 = hsync.
 static const uint32_t *get_ctrl_sym(bool vsync, bool hsync) {
 	unsigned int idx = 0;
 	if (vsync)
 		idx |= 2u;
 	if (hsync)
 		idx |= 1u;
 	return dvi_ctrl_syms + idx;
 }
 // Make a sequence of paced transfers to the relevant FIFO
 //
 // cb:             control block to fill in
 // dma_cfg:        lane configuration (data/control channels, FIFO address, DREQ)
 // read_addr:      source of the transfer
 // transfer_count: number of transfers
 // read_ring:      read address wrap size passed to channel_config_set_ring();
 //                 callers in this file use 0 (no repeat), 2 (4-byte period)
 //                 and 3 (8-byte period)
 // irq_on_finish:  true if completion of this block should raise the IRQ
 static void _set_data_cb(dma_cb_t *cb, const struct dvi_lane_dma_cfg *dma_cfg,
 		const void *read_addr, uint transfer_count, uint read_ring, bool irq_on_finish) {
 	cb->read_addr = read_addr;
 	cb->write_addr = dma_cfg->tx_fifo;
 	cb->transfer_count = transfer_count;
 	cb->c = dma_channel_get_default_config(dma_cfg->chan_data);
 	channel_config_set_ring(&cb->c, false, read_ring);
 	channel_config_set_dreq(&cb->c, dma_cfg->dreq);
 	// Call back to control channel for reconfiguration:
 	channel_config_set_chain_to(&cb->c, dma_cfg->chan_ctrl);
 	// Note we never send a null trigger, so IRQ_QUIET is an IRQ suppression flag
 	channel_config_set_irq_quiet(&cb->c, !irq_on_finish);
 }
 // Fill in the DMA control blocks of `l` for one scanline of vertical blanking.
 // The sync lane gets four blocks (front porch, hsync, back porch, active
 // width), all control symbols carrying the requested vsync level; the other
 // lanes get two blocks of the no-sync control symbol. Only the sync lane's
 // back porch block is allowed to raise the IRQ.
 void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
 		bool vsync_asserted, struct dvi_scanline_dma_list *l) {
 	// Transfer counts (in FIFO words) of each horizontal region
 	const unsigned int fp_words     = t->h_front_porch   / DVI_SYMBOLS_PER_WORD;
 	const unsigned int sync_words   = t->h_sync_width    / DVI_SYMBOLS_PER_WORD;
 	const unsigned int bp_words     = t->h_back_porch    / DVI_SYMBOLS_PER_WORD;
 	const unsigned int active_words = t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
 	const unsigned int blank_words  = (t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD;
 	const bool vsync_level = t->v_sync_polarity == vsync_asserted;
 	const uint32_t *hsync_idle   = get_ctrl_sym(vsync_level, !t->h_sync_polarity);
 	const uint32_t *hsync_active = get_ctrl_sym(vsync_level,  t->h_sync_polarity);
 	const uint32_t *idle_sym     = get_ctrl_sym(false,        false);
 	const struct dvi_lane_dma_cfg *sync_cfg = &dma_cfg[TMDS_SYNC_LANE];
 	dma_cb_t *sync_cbs = dvi_lane_from_list(l, TMDS_SYNC_LANE);
 	// The symbol table contains each control symbol *twice*, concatenated into
 	// 20 LSBs of table word, so we can always do word-repeat.
 	_set_data_cb(&sync_cbs[0], sync_cfg, hsync_idle,   fp_words,     2, false);
 	_set_data_cb(&sync_cbs[1], sync_cfg, hsync_active, sync_words,   2, false);
 	_set_data_cb(&sync_cbs[2], sync_cfg, hsync_idle,   bp_words,     2, true);
 	_set_data_cb(&sync_cbs[3], sync_cfg, hsync_idle,   active_words, 2, false);
 	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
 		if (lane != TMDS_SYNC_LANE) {
 			dma_cb_t *cbs = dvi_lane_from_list(l, lane);
 			_set_data_cb(&cbs[0], &dma_cfg[lane], idle_sym, blank_words,  2, false);
 			_set_data_cb(&cbs[1], &dma_cfg[lane], idle_sym, active_words, 2, false);
 		}
 	}
 }
 // Fill in the DMA control blocks of `l` for one active (picture) scanline.
 // Blanking is set up as for vblank but with vsync deasserted. The last block
 // of each lane covers the active pixels: it reads that lane's slice of
 // tmdsbuf, or, when tmdsbuf is NULL, repeats the built-in blank-line symbols.
 void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
 		uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
 	// Transfer counts (in FIFO words) of each horizontal region
 	const unsigned int fp_words     = t->h_front_porch   / DVI_SYMBOLS_PER_WORD;
 	const unsigned int sync_words   = t->h_sync_width    / DVI_SYMBOLS_PER_WORD;
 	const unsigned int bp_words     = t->h_back_porch    / DVI_SYMBOLS_PER_WORD;
 	const unsigned int active_words = t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
 	const unsigned int blank_words  = (t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD;
 	const uint32_t *hsync_idle   = get_ctrl_sym(!t->v_sync_polarity, !t->h_sync_polarity);
 	const uint32_t *hsync_active = get_ctrl_sym(!t->v_sync_polarity,  t->h_sync_polarity);
 	const uint32_t *idle_sym     = get_ctrl_sym(false,               false);
 	const struct dvi_lane_dma_cfg *sync_cfg = &dma_cfg[TMDS_SYNC_LANE];
 	dma_cb_t *sync_cbs = dvi_lane_from_list(l, TMDS_SYNC_LANE);
 	_set_data_cb(&sync_cbs[0], sync_cfg, hsync_idle,   fp_words,   2, false);
 	_set_data_cb(&sync_cbs[1], sync_cfg, hsync_active, sync_words, 2, false);
 	_set_data_cb(&sync_cbs[2], sync_cfg, hsync_idle,   bp_words,   2, true);
 	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
 		const struct dvi_lane_dma_cfg *cfg = &dma_cfg[lane];
 		dma_cb_t *cbs = dvi_lane_from_list(l, lane);
 		dma_cb_t *active_cb;
 		if (lane == TMDS_SYNC_LANE) {
 			active_cb = &cbs[DVI_SYNC_LANE_CHUNKS - 1];
 		}
 		else {
 			_set_data_cb(&cbs[0], cfg, idle_sym, blank_words, 2, false);
 			active_cb = &cbs[DVI_NOSYNC_LANE_CHUNKS - 1];
 		}
 		if (!tmdsbuf) {
 			// Use read ring to repeat the correct DC-balanced symbol pair on blank scanlines (4 or 8 byte period)
 			_set_data_cb(active_cb, cfg, &empty_scanline_tmds[2 * lane / DVI_SYMBOLS_PER_WORD],
 				active_words, DVI_SYMBOLS_PER_WORD == 2 ? 2 : 3, false);
 		}
 		else {
 			// Non-repeating DMA for the freshly-encoded TMDS buffer
 			_set_data_cb(active_cb, cfg, tmdsbuf + lane * active_words,
 				active_words, 0, false);
 		}
 	}
 }
 void __dvi_func(dvi_update_scanline_data_dma)(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
 	for (int i = 0; i < N_TMDS_LANES; ++i) {
 		const uint32_t *lane_tmdsbuf = dvi_monochrome_tmds ? tmdsbuf : tmdsbuf + i * t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
 		if (i == TMDS_SYNC_LANE)
 			dvi_lane_from_list(l, i)[3].read_addr = lane_tmdsbuf;
 		else
 			dvi_lane_from_list(l, i)[1].read_addr = lane_tmdsbuf;
 	}
 }
--- a/src/libdvi/dvi_timing.h
+++ b/src/libdvi/dvi_timing.h
@ -0,0 +1,99 @@
 #ifndef _DVI_TIMING_H
 #define _DVI_TIMING_H
 #include "hardware/dma.h"
 #include "pico/util/queue.h"
 #include "dvi.h"
 // Timing description of one video mode. Horizontal fields are counted in
 // pixels, vertical fields in scanlines.
 struct dvi_timing {
 	bool h_sync_polarity;  // hsync level sent while hsync is asserted
 	uint h_front_porch;
 	uint h_sync_width;
 	uint h_back_porch;
 	uint h_active_pixels;
 	bool v_sync_polarity;  // vsync level sent while vsync is asserted
 	uint v_front_porch;
 	uint v_sync_width;
 	uint v_back_porch;
 	uint v_active_lines;
 	uint bit_clk_khz;      // TMDS bit clock, in kHz going by the name
 };
 // Vertical region a scanline belongs to, in the order the regions are
 // stepped through. DVI_STATE_COUNT is the number of regions, not a state.
 enum dvi_line_state {
 	DVI_STATE_FRONT_PORCH = 0,
 	DVI_STATE_SYNC,
 	DVI_STATE_BACK_PORCH,
 	DVI_STATE_ACTIVE,
 	DVI_STATE_COUNT
 };
 // Vertical position within the frame, advanced once per scanline by
 // dvi_timing_state_advance().
 struct dvi_timing_state {
 	uint v_ctr;                   // lines elapsed in the current region
 	enum dvi_line_state v_state;  // current vertical region
 };
 // This should map directly to DMA register layout, but more convenient types
 // (also this really shouldn't be here... we don't have a dma_cb in the SDK
 // because there are many valid formats due to aliases)
 // One control block describes a single DMA transfer: source, destination,
 // length and channel configuration.
 typedef struct dma_cb {
 	const void *read_addr;
 	void *write_addr;
 	uint32_t transfer_count;
 	dma_channel_config c;
 } dma_cb_t;
 // The block must be exactly four words, with the config word at the same
 // offset as the channel's ctrl_trig register.
 static_assert(sizeof(dma_cb_t) == 4 * sizeof(uint32_t), "bad dma layout");
 static_assert(__builtin_offsetof(dma_cb_t, c.ctrl) == __builtin_offsetof(dma_channel_hw_t, ctrl_trig), "bad dma layout");
 // Control blocks per scanline: the sync lane has one per horizontal region
 // (front porch, sync, back porch, active); the other lanes have one for
 // blanking and one for the active region.
 #define DVI_SYNC_LANE_CHUNKS DVI_STATE_COUNT
 #define DVI_NOSYNC_LANE_CHUNKS 2
 // All control blocks needed to output one scanline on the three lanes.
 // l0 is sized for the sync lane.
 struct dvi_scanline_dma_list {
 	dma_cb_t l0[DVI_SYNC_LANE_CHUNKS];
 	dma_cb_t l1[DVI_NOSYNC_LANE_CHUNKS];
 	dma_cb_t l2[DVI_NOSYNC_LANE_CHUNKS];
 };
 // Return the control block array of lane `i` within `l`. Lane 0 and lane 1
 // map to l0 and l1; any other index maps to l2.
 static inline dma_cb_t* dvi_lane_from_list(struct dvi_scanline_dma_list *l, int i) {
 	switch (i) {
 	case 0:
 		return l->l0;
 	case 1:
 		return l->l1;
 	default:
 		return l->l2;
 	}
 }
 // Each TMDS lane uses one DMA channel to transfer data to a PIO state
 // machine, and another channel to load control blocks into this channel.
 struct dvi_lane_dma_cfg {
 	uint chan_ctrl;  // channel that reloads the data channel (chain-to target)
 	uint chan_data;  // channel that feeds the FIFO
 	void *tx_fifo;   // write address for the data channel
 	uint dreq;       // DREQ used to pace the data channel
 };
 // Note these are already converted to pseudo-differential representation
 extern const uint32_t dvi_ctrl_syms[4];
 // Predefined mode timings, defined in dvi_timing.c
 extern const struct dvi_timing dvi_timing_640x480p_60hz;
 extern const struct dvi_timing dvi_timing_800x480p_60hz;
 extern const struct dvi_timing dvi_timing_800x600p_60hz;
 extern const struct dvi_timing dvi_timing_960x540p_60hz;
 extern const struct dvi_timing dvi_timing_1280x720p_30hz;
 extern const struct dvi_timing dvi_timing_800x600p_reduced_60hz;
 extern const struct dvi_timing dvi_timing_1280x720p_reduced_30hz;
 // Defined alongside the others in dvi_timing.c but was missing from this list
 extern const struct dvi_timing dvi_timing_1600x900p_reduced_30hz;
 // Reset the vertical state to the start of the front porch.
 void dvi_timing_state_init(struct dvi_timing_state *t);
 // Advance the vertical state by one scanline of mode `t`.
 void dvi_timing_state_advance(const struct dvi_timing *t, struct dvi_timing_state *s);
 // Zero-fill a scanline DMA list.
 void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list);
 // Build the control blocks for a vertical-blanking scanline; vsync_asserted
 // selects whether the vsync control bit is in its asserted state.
 void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
 		bool vsync_asserted, struct dvi_scanline_dma_list *l);
 // Build the control blocks for an active scanline reading from tmdsbuf; a
 // NULL tmdsbuf selects a built-in blank-line pattern.
 void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
 		uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
 // Re-point the active-region control block of each lane at new TMDS data.
 void dvi_update_scanline_data_dma(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
 #endif
 #endif
--- a/src/libdvi/tmds_encode.S
+++ b/src/libdvi/tmds_encode.S
@ -0,0 +1,623 @@
 #include "hardware/regs/addressmap.h"
 #include "hardware/regs/sio.h"
 #include "dvi_config_defs.h"
 // Offsets suitable for ldr/str (must be <= 0x7c):
 // All are byte offsets relative to INTERP0 ACCUM0, which the encode loops
 // below keep in a base register.
 #define ACCUM0_OFFS     (SIO_INTERP0_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
 #define ACCUM1_OFFS     (SIO_INTERP0_ACCUM1_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
 #define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 #define PEEK0_OFFS      (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 #define PEEK1_OFFS      (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 #define PEEK2_OFFS      (SIO_INTERP0_PEEK_FULL_OFFSET  - SIO_INTERP0_ACCUM0_OFFSET)
 // Distance from the INTERP0 registers to the INTERP1 registers; added to an
 // offset above to address the same register of INTERP1 from the same base.
 #define INTERP1         (SIO_INTERP1_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
 // Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
 // word-addressed space... almost as though it were intentional! :)
 .syntax unified
 .cpu cortex-m0plus
 .thumb
 // Open a global Thumb function \name in its own section .scratch_x.<name>
 .macro decl_func_x name
 .section .scratch_x.\name, "ax"
 .global \name
 .type \name,%function
 .thumb_func
 \name:
 .endm
 // Same as decl_func_x, but the section is .scratch_y.<name>
 .macro decl_func_y name
 .section .scratch_y.\name, "ax"
 .global \name
 .type \name,%function
 .thumb_func
 \name:
 .endm
 // Functions declared with plain decl_func go to the scratch_x sections
 #define decl_func decl_func_x
 // ----------------------------------------------------------------------------
 // Pixel-doubling encoders for RGB
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Input size (pixels)
 // Encode one input word through the interpolator at \r_ibase.
 // \r_inout0: input word on entry; on exit, the word loaded from the address
 //            read back from PEEK0
 // \r_out1:   on exit, the word loaded from the address read back from PEEK1
 .macro do_channel_16bpp r_ibase r_inout0 r_out1
 	str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
 	ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
 	ldr \r_inout0, [\r_inout0]
 	ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
 	ldr \r_out1, [\r_out1]
 .endm
 decl_func tmds_encode_loop_16bpp
 	push {r4, r5, r6, r7, lr}
 	// ip = output end pointer: r1 + 4 bytes per unit of r2
 	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	b 2f
 .align 2
 1:
 .rept TMDS_ENCODE_UNROLL
 	// Two input words in, four output words out
 	ldmia r0!, {r4, r6}
 	do_channel_16bpp r2, r4, r5
 	do_channel_16bpp r2, r6, r7
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
 	// Exit test is equality only, so the output size must be a whole number
 	// of unrolled iterations
 	cmp r1, ip
 	bne 1b
 	pop {r4, r5, r6, r7, pc}
 // Same as above, but scale data to make up for lack of left shift
 // in interpolator (costs 1 cycle per 2 pixels)
 //
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Input size (pixels)
 // r3: Left shift amount
 decl_func tmds_encode_loop_16bpp_leftshift
 	push {r4, r5, r6, r7, lr}
 	// ip = output end pointer: r1 + 4 bytes per unit of r2
 	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	b 2f
 .align 2
 1:
 .rept TMDS_ENCODE_UNROLL
 	ldmia r0!, {r4, r6}
 	// Each input word is shifted left by r3 before it goes to the interpolator
 	lsls r4, r3
 	do_channel_16bpp r2, r4, r5
 	lsls r6, r3
 	do_channel_16bpp r2, r6, r7
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
 	// Exit test is equality only, so the output size must be a whole number
 	// of unrolled iterations
 	cmp r1, ip
 	bne 1b
 	pop {r4, r5, r6, r7, pc}
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Input size (pixels)
 decl_func tmds_encode_loop_8bpp
 	push {r4, r5, r6, r7, lr}
 	// ip = output end pointer: r1 + 4 bytes per unit of r2
 	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	b 2f
 .align 2
 1:
 .rept TMDS_ENCODE_UNROLL
 	// One input word is written to both interpolators; each yields two
 	// lookup addresses (PEEK0, PEEK1), giving four output words
 	ldmia  r0!, {r4}
 	str r4, [r2, #ACCUM0_OFFS + INTERP1]
 	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
 	ldr r4, [r4]
 	ldr r5, [r2, #PEEK1_OFFS]
 	ldr r5, [r5]
 	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
 	ldr r6, [r6]
 	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
 	ldr r7, [r7]
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
 	// Exit test is equality only, so the output size must be a whole number
 	// of unrolled iterations
 	cmp r1, ip
 	bne 1b
 	pop {r4, r5, r6, r7, pc}
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Input size (pixels)
 // r3: Left shift amount
 //
 // Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
 // the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
 // the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
 // since its channel MSBs are no greater than 7.
 decl_func tmds_encode_loop_8bpp_leftshift
 	push {r4, r5, r6, r7, lr}
 	// ip = output end pointer: r1 + 8 bytes per unit of r2.
 	// NOTE(review): tmds_encode_loop_8bpp scales r2 by 4 (lsls #2) for the
 	// same per-iteration traffic (one word in, four words out), and both
 	// header comments describe r2 as input pixels. Confirm which unit the
 	// callers actually pass to each variant.
 	lsls r2, #3
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	b 2f
 .align 2
 1:
 .rept TMDS_ENCODE_UNROLL
 	ldmia  r0!, {r4}
 	// INTERP1 gets the unshifted word, INTERP0 the word shifted left by r3
 	str r4, [r2, #ACCUM0_OFFS + INTERP1]
 	lsls r4, r3
 	str r4, [r2, #ACCUM0_OFFS]
 	ldr r4, [r2, #PEEK0_OFFS]
 	ldr r4, [r4]
 	ldr r5, [r2, #PEEK1_OFFS]
 	ldr r5, [r5]
 	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
 	ldr r6, [r6]
 	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
 	ldr r7, [r7]
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
 	// Exit test is equality only, so the output size must be a whole number
 	// of unrolled iterations
 	cmp r1, ip
 	bne 1b
 	pop {r4, r5, r6, r7, pc}
 // ----------------------------------------------------------------------------
 // Fast 1bpp black/white encoder (full res)
 // Taking the encoder from DVI spec, with initial balance 0:
 // 
 // - Encoding either 0x00 or 0xff will produce a running balance of -8, with
 //   output symbol of 0x100 or 0x200
 // 
 // - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
 //  output symbol of 0x1ff or 0x2ff
 // 
 // So we can do 1bpp encode with a lookup of x coordinate LSB, and input
 // colour bit. If we process pixels in even-sized blocks, only the colour
 // lookup is needed.
 // Encode 8 pixels @ 1bpp (using two table lookups)
 // r3 contains lookup mask (preshifted)
 // r8 contains pointer to encode table
 // 2.125 cyc/pix
 // Arguments select, for each of the two lookups, the shift that moves the
 // wanted 4 input bits of r2 into bits 6:3 (the position of mask r3).
 // Clobbers r4-r7; advances r1 by four words.
 .macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
 	\shift_instr0 r4, r2, #\shamt0
 	ands r4, r3
 	add r4, r8
 	// Each table entry is two words (four pixels)
 	ldmia r4, {r4, r5}
 	\shift_instr1 r6, r2, #\shamt1
 	ands r6, r3
 	add r6, r8
 	ldmia r6, {r6, r7}
 	stmia r1!, {r4, r5, r6, r7}
 .endm
 // r0: input buffer (word-aligned)
 // r1: output buffer (word-aligned)
 // r2: output pixel count
 decl_func tmds_encode_1bpp
 	push {r4-r7, lr}
 	// Preserve the caller's r8 (copied through r7 for the push)
 	mov r7, r8
 	push {r7}
 	// ip = output end pointer: r1 + 2 bytes per output pixel
 	lsls r2, #1
 	add r2, r1
 	mov ip, r2
 	adr r4, tmds_1bpp_table
 	mov r8, r4
 	// Mask: 4 bit index, 8 bytes per entry
 	movs r3, #0x78
 	b 2f
 1:
 	// One input word = 32 pixels, consumed 8 pixels per macro invocation
 	ldmia r0!, {r2}
 #if !DVI_1BPP_BIT_REVERSE
 	tmds_encode_1bpp_body lsls 3  lsrs 1
 	tmds_encode_1bpp_body lsrs 5  lsrs 9
 	tmds_encode_1bpp_body lsrs 13 lsrs 17
 	tmds_encode_1bpp_body lsrs 21 lsrs 25
 #else
 	// Bit-reversed input: the two nibbles of each byte are taken in the
 	// opposite order (the table is reordered to match)
 	tmds_encode_1bpp_body lsrs 1   lsls 3
 	tmds_encode_1bpp_body lsrs 9   lsrs 5
 	tmds_encode_1bpp_body lsrs 17  lsrs 13
 	tmds_encode_1bpp_body lsrs 25  lsrs 21
 #endif
 2:
 	cmp r1, ip
 	blo 1b
 	// Restore r8 and return
 	pop {r7}
 	mov r8, r7
 	pop {r4-r7, pc}
 // Lookup table for tmds_encode_1bpp: 16 entries of two words, indexed by 4
 // input bits. Each word holds two symbols: the first pixel of the pair in
 // bits 9:0 (0x100 for a 0 bit, 0x200 for a 1 bit) and the second in bits
 // 19:10 (0x1ff for a 0 bit, 0x2ff for a 1 bit). The trailing comment on each
 // line is the bit pattern that entry encodes.
 .align 2
 tmds_1bpp_table:
 #if !DVI_1BPP_BIT_REVERSE
 	.word 0x7fd00, 0x7fd00  // 0000
 	.word 0x7fe00, 0x7fd00  // 0001
 	.word 0xbfd00, 0x7fd00  // 0010
 	.word 0xbfe00, 0x7fd00  // 0011
 	.word 0x7fd00, 0x7fe00  // 0100
 	.word 0x7fe00, 0x7fe00  // 0101
 	.word 0xbfd00, 0x7fe00  // 0110
 	.word 0xbfe00, 0x7fe00  // 0111
 	.word 0x7fd00, 0xbfd00  // 1000
 	.word 0x7fe00, 0xbfd00  // 1001
 	.word 0xbfd00, 0xbfd00  // 1010
 	.word 0xbfe00, 0xbfd00  // 1011
 	.word 0x7fd00, 0xbfe00  // 1100
 	.word 0x7fe00, 0xbfe00  // 1101
 	.word 0xbfd00, 0xbfe00  // 1110
 	.word 0xbfe00, 0xbfe00  // 1111
 #else
 	.word 0x7fd00, 0x7fd00  // 0000
 	.word 0x7fd00, 0xbfd00  // 1000
 	.word 0x7fd00, 0x7fe00  // 0100
 	.word 0x7fd00, 0xbfe00  // 1100
 	.word 0xbfd00, 0x7fd00  // 0010
 	.word 0xbfd00, 0xbfd00  // 1010
 	.word 0xbfd00, 0x7fe00  // 0110
 	.word 0xbfd00, 0xbfe00  // 1110
 	.word 0x7fe00, 0x7fd00  // 0001
 	.word 0x7fe00, 0xbfd00  // 1001
 	.word 0x7fe00, 0x7fe00  // 0101
 	.word 0x7fe00, 0xbfe00  // 1101
 	.word 0xbfe00, 0x7fd00  // 0011
 	.word 0xbfe00, 0xbfd00  // 1011
 	.word 0xbfe00, 0x7fe00  // 0111
 	.word 0xbfe00, 0xbfe00  // 1111
 #endif
 // ----------------------------------------------------------------------------
 // Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
 // Even-x-position pixels are encoded as symbols with imbalance -4, and odd
 // pixels with +4, so that we can mix-and-match our even/odd codewords and
 // always get a properly balanced sequence:
 //
 // level 0: (05 -> 103), then (04 -> 1fc)  (decimal 5, 4)
 // level 1: (50 -> 130), then (51 -> 1cf)  (decimal 80, 81)
 // level 2: (af -> 230), then (ae -> 2cf)  (decimal 175, 174)
 // level 3: (fa -> 203), then (fb -> 2fc)  (decimal 250, 251)
 //
 // These correspond to roughly 255 times (0, 1/3, 2/3, 1).
 //
 // Alternatively we could use symbols with 0 balance, which results in lower
 // contrast but avoids the LSB bobble:
 //
 // level 0: (10 -> 1f0) always
 // level 1: (5a -> 263) always
 // level 2: (a5 -> 163) always
 // level 3: (ef -> 2f0) always
 // Table base pointer in r0. Input pixels in r2.
 // Look up one output word (two pixels). The shift moves the wanted 4 input
 // bits of r2 into bits 5:2 (the position of mask r3), giving a byte offset
 // into the table at r0. Result in \rd.
 .macro encode_2bpp_body shift_instr shamt rd
 	\shift_instr \rd, r2, #\shamt
 	ands \rd, r3
 	ldr \rd, [r0, \rd]
 .endm
 // r0: input buffer (word-aligned)
 // r1: output buffer (word-aligned)
 // r2: output pixel count
 decl_func tmds_encode_2bpp
 	push {r4-r7, lr}
 	// Preserve the caller's r8 (copied through r7 for the push)
 	mov r7, r8
 	push {r7}
 	// From here on r8 holds the input pointer and r0 the table base
 	mov r8, r0
 	adr r0, tmds_2bpp_table
 	// Mask: 4-bit index into 4-byte entries.
 	movs r3, #0x3c
 	// Limit pointer: 1 word per 2 pixels
 	lsls r2, #1
 	add r2, r1
 	mov ip, r2
 	b 2f
 1:
 	// Fetch the next input word (16 pixels) into r2, post-incrementing r8
 	mov r4, r8
 	ldmia r4!, {r2}
 	mov r8, r4
 	// Eight lookups of two pixels each, stored four words at a time
 	encode_2bpp_body lsls 2  r4
 	encode_2bpp_body lsrs 2  r5
 	encode_2bpp_body lsrs 6  r6
 	encode_2bpp_body lsrs 10 r7
 	stmia r1!, {r4-r7}
 	encode_2bpp_body lsrs 14 r4
 	encode_2bpp_body lsrs 18 r5
 	encode_2bpp_body lsrs 22 r6
 	encode_2bpp_body lsrs 26 r7
 	stmia r1!, {r4-r7}
 2:
 	cmp r1, ip
 	blo 1b
 	// Restore r8 and return
 	pop {r7}
 	mov r8, r7
 	pop {r4-r7, pc}
 // Lookup table for tmds_encode_2bpp: one word per pair of 2bpp pixels. Bits
 // 9:0 hold the even-position symbol of the first pixel, bits 19:10 the
 // odd-position symbol of the second pixel. The trailing comment gives the
 // level of the first pixel, then of the second.
 .align 2
 tmds_2bpp_table:
 	.word 0x7f103 // 00, 00
 	.word 0x7f130 // 01, 00
 	.word 0x7f230 // 10, 00
 	.word 0x7f203 // 11, 00
 	.word 0x73d03 // 00, 01
 	.word 0x73d30 // 01, 01
 	.word 0x73e30 // 10, 01
 	.word 0x73e03 // 11, 01
 	.word 0xb3d03 // 00, 10
 	.word 0xb3d30 // 01, 10
 	.word 0xb3e30 // 10, 10
 	.word 0xb3e03 // 11, 10
 	.word 0xbf103 // 00, 11
 	.word 0xbf130 // 01, 11
 	.word 0xbf230 // 10, 11
 	.word 0xbf203 // 11, 11
 // ----------------------------------------------------------------------------
 // Full-resolution RGB encode (not very practical)
 // Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
 // taking horizontal blanking (at VGA) and dual core into account, and
 // assuming the 3 channels are encoded individually.)
 //
 // Here is an idea
 // Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
 // ACCUM0), concatenated with the sign bit of our running disparity (from
 // ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
 // with the symbol's disparity stored left-justified in the upper 12 bits, as
 // e.g. a 6 bit signed integer.
 //
 // - Load pixel data.                        cyc: 0.75 (ldmia 2 words, every 4 pixels)
 // - Write pixel to ACCUM0.                  cyc: 1
 // - Read address from PEEK2.                cyc: 1
 // - Load encoded pixel from address.        cyc: 2
 // - Write disparity data to ACCUM1_ADD      cyc: 1
 // - Write encoded data to output buffer.    cyc: 1.25 (stmia 4 words, every 4 pixels)
 //
 // With decent register allocation we may be able to load 4 pixels at
 // once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
 //
 // One issue is that the TMDS data in the bottom of ACCUM1 will eventually
 // overflow and affect the running disparity, but with 16 zeroes in between,
 // this would take much longer than one scanline, so everything is fine if
 // we clear the accumulator at the start of the scanline.
 //
 // Note that we need to use two interpolators to get the bits from both pixels
 // -- we are not outputting a single DC-balanced stream, but rather two
 // interleaved streams which are each DC-balanced. This is fine electrically,
 // but our output here will *NOT* match the TMDS encoder given in the DVI
 // spec.
 // You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
 // feedback. With the feedback enabled (default), the output is DC balanced,
 // but there are just barely enough CPU cycles to do all the encode, so it's
 // essentially a party trick. If you disable DC balancing, the performance is
 // much better, and many monitors will still accept the signals as long as you
 // DC couple your DVI signals.
 // Encode the input word in \ra into two output words (\ra and \rb).
 // r2 holds the interpolator base. The same word is written to ACCUM0 of both
 // interpolators; each PEEK2 read gives a table address, and the word loaded
 // from it is the output. With DC balancing enabled the loaded word is also
 // written to that interpolator's ACCUM1_ADD.
 .macro tmds_fullres_encode_loop_body ra rb
 	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
 	str \ra, [r2, #ACCUM0_OFFS]
 	ldr \ra, [r2, #PEEK2_OFFS]
 	ldr \ra, [\ra]
 #if !TMDS_FULLRES_NO_DC_BALANCE
 	str \ra, [r2, #ACCUM1_ADD_OFFS]
 #endif
 	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
 	ldr \rb, [\rb]
 #if !TMDS_FULLRES_NO_DC_BALANCE
 	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
 .endm
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Pixel count
 .macro tmds_fullres_encode_loop_16bpp
 	push {r4-r7, lr}
 	// Preserve the caller's r8 (copied through r4 for the push)
 	mov r4, r8
 	push {r4}
 	// ip = output end pointer: r1 + 4 bytes per pixel
 	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	// DC balance defined to be 0 at start of scanline:
 	movs r4, #0
 	str r4, [r2, #ACCUM1_OFFS]
 #if TMDS_FULLRES_NO_DC_BALANCE
 	// Alternate parity between odd/even symbols if no feedback
 	mvns r4, r4
 #endif
 	str r4, [r2, #ACCUM1_OFFS + INTERP1]
 	// Keep loop start pointer in r8 so we can get a longer backward branch
 	adr r4, 1f
 	adds r4, #1 // set bit 0 so that bx r8 stays in Thumb state
 	mov r8, r4
 	b 2f
 	.align 2
 1:
 .rept 16
 	// Two input words (four pixels) in, four output words out
 	ldmia r0!, {r4, r6}
 	tmds_fullres_encode_loop_body r4 r5
 	tmds_fullres_encode_loop_body r6 r7
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
 	// Exit only on exact equality, otherwise jump back to the loop start
 	cmp r1, ip
 	beq 1f
 	bx r8
 1:
 	// Restore r8 and return
 	pop {r4}
 	mov r8, r4
 	pop {r4-r7, pc}
 .endm
 // One copy each in X and Y, so the two cores don't step on each other
 // (the _x function lives in a .scratch_x section, the _y one in .scratch_y)
 decl_func_x tmds_fullres_encode_loop_16bpp_x
 	tmds_fullres_encode_loop_16bpp
 decl_func_y tmds_fullres_encode_loop_16bpp_y
 	tmds_fullres_encode_loop_16bpp
 // Same as tmds_fullres_encode_loop_body, except that the word written to
 // INTERP0 is first shifted left by r3. INTERP1 receives the unshifted word.
 .macro tmds_fullres_encode_loop_body_leftshift ra rb
 	// Note we apply the leftshift for INTERP0 only
 	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
 	lsls \ra, r3
 	str \ra, [r2, #ACCUM0_OFFS]
 	ldr \ra, [r2, #PEEK2_OFFS]
 	ldr \ra, [\ra]
 #if !TMDS_FULLRES_NO_DC_BALANCE
 	str \ra, [r2, #ACCUM1_ADD_OFFS]
 #endif
 	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
 	ldr \rb, [\rb]
 #if !TMDS_FULLRES_NO_DC_BALANCE
 	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
 .endm
 // r0: Input buffer (word-aligned)
 // r1: Output buffer (word-aligned)
 // r2: Pixel count
 // r3: Left shift amount
 .macro tmds_fullres_encode_loop_16bpp_leftshift
 	push {r4-r7, lr}
 	// Preserve the caller's r8 and r9. Note r9 is saved and restored here
 	// although nothing in this macro writes to it.
 	mov r4, r8
 	mov r5, r9
 	push {r4-r5}
 	// ip = output end pointer: r1 + 4 bytes per pixel
 	lsls r2, #2
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	// DC balance defined to be 0 at start of scanline:
 	movs r4, #0
 	str r4, [r2, #ACCUM1_OFFS]
 #if TMDS_FULLRES_NO_DC_BALANCE
 	// Alternate parity between odd/even symbols if there's no balance feedback
 	mvns r4, r4
 #endif
 	str r4, [r2, #ACCUM1_OFFS + INTERP1]
 	// Loop start address kept in r8, with bit 0 set so that bx r8 stays in
 	// Thumb state
 	adr r4, 1f
 	adds r4, #1
 	mov r8, r4
 	b 2f
 	.align 2
 1:
 .rept 16 // 64 pixels per iteration
 	ldmia r0!, {r4, r6}
 	tmds_fullres_encode_loop_body_leftshift r4 r5
 	tmds_fullres_encode_loop_body_leftshift r6 r7
 	stmia r1!, {r4, r5, r6, r7}
 .endr
 2:
 	// Exit only on exact equality, otherwise jump back to the loop start
 	cmp r1, ip
 	beq 1f
 	bx r8
 1:
 	// Restore r8/r9 and return
 	pop {r4-r5}
 	mov r8, r4
 	mov r9, r5
 	pop {r4-r7, pc}
 .endm
 // Instantiate the left-shift loop twice: once in a .scratch_x section and
 // once in a .scratch_y section
 decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
 	tmds_fullres_encode_loop_16bpp_leftshift
 decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
 	tmds_fullres_encode_loop_16bpp_leftshift
 // ----------------------------------------------------------------------------
 // Full-resolution 8bpp paletted encode
 // Variant of tmds_fullres_encode_loop_16bpp that reads
 // 8-bit wide pixels packed 4 per word.  The interpolator
 // base is set to a reordered list of TMDS symbols based
 // on a user colour palette.
 // Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
 // interp base pointer. r7 used as temporary.
 .macro tmds_palette_encode_loop_body rd
 	// Same input word goes to both interpolators
 	str \rd, [r2, #ACCUM0_OFFS]
 	str \rd, [r2, #ACCUM0_OFFS + INTERP1]
 	// First symbol: load from the address INTERP0 produces
 	ldr \rd, [r2, #PEEK2_OFFS]
 	ldr \rd, [\rd]
 #if !TMDS_FULLRES_NO_DC_BALANCE
 	str \rd, [r2, #ACCUM1_ADD_OFFS]
 #endif
 	// Second symbol: load from the address INTERP1 produces
 	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
 	ldr r7, [r7]
 #if !TMDS_FULLRES_NO_DC_BALANCE
 	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
 	// Pack: second symbol moved up by 10 bits and merged above the first
 	lsls r7, #10
 	orrs \rd, r7
 .endm
 // Body of the paletted 8bpp encode functions.
 // r0: input pointer, r1: output pointer (both advanced with ldmia/stmia)
 // r2: count; the output end pointer is r1 + 2 bytes per unit of r2
 .macro tmds_palette_encode_loop
 	push {r4-r7, lr}
 	// Preserve the caller's r8 (copied through r4 for the push)
 	mov r4, r8
 	push {r4}
 	lsls r2, #1
 	add r2, r1
 	mov ip, r2
 	// r2 is reused as the interpolator register base
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 	// DC balance defined to be 0 at start of scanline:
 	movs r4, #0
 	str r4, [r2, #ACCUM1_OFFS]
 #if TMDS_FULLRES_NO_DC_BALANCE
 	// Alternate parity between odd/even symbols if there's no balance feedback
 	mvns r4, r4
 #endif
 	str r4, [r2, #ACCUM1_OFFS + INTERP1]
 	// Keep loop start pointer in r8 so we can get a longer backward branch
 	adr r4, 1f
 	adds r4, #1 // set bit 0 so that bx r8 stays in Thumb state
 	mov r8, r4
 	b 2f
 	.align 2
 1:
 .rept 10
 	// Two input words = eight 8-bit pixels. Each word is split into a low
 	// pixel pair (shifted left by 2) and a high pixel pair (shifted right by
 	// 14), so every pair sits in bits 17:2 as the body macro expects.
 	ldmia r0!, {r3, r5}
 	lsrs r4, r3, #14
 	lsls r3, #2
 	lsrs r6, r5, #14
 	lsls r5, #2
 	tmds_palette_encode_loop_body r3
 	tmds_palette_encode_loop_body r4
 	tmds_palette_encode_loop_body r5
 	tmds_palette_encode_loop_body r6
 	stmia r1!, {r3, r4, r5, r6}
 .endr
 2:
 	// Exit only on exact equality, otherwise jump back to the loop start
 	cmp r1, ip
 	beq 1f
 	bx r8
 1:
 	// Restore r8 and return
 	pop {r4}
 	mov r8, r4
 	pop {r4-r7, pc}
 .endm
 // Instantiate the palette loop twice: once in a .scratch_x section and once
 // in a .scratch_y section
 decl_func_x tmds_palette_encode_loop_x
 	tmds_palette_encode_loop
 decl_func_y tmds_palette_encode_loop_y
 	tmds_palette_encode_loop
--- a/src/libdvi/tmds_encode.c
+++ b/src/libdvi/tmds_encode.c
@ -0,0 +1,305 @@
 #include "hardware/interp.h"
 #include "tmds_encode.h"
 #include "hardware/gpio.h"
 #include "hardware/sync.h"
 // Lookup table whose body comes from tmds_table.h, declared with
 // __scratch_x("tmds_table"). It is the LUT base handed to the interpolator by
 // the 16bpp and 8bpp channel encoders in this file.
 static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
 #include "tmds_table.h"
 };
 // Fullres table is bandwidth-critical, so gets one copy for each scratch
 // memory. There is a third copy which can go in flash, because it's just used
 // to generate palette LUTs. The ones we don't use will get garbage collected
 // during linking.
 // NOTE(review): only the scratch X and scratch Y copies are defined here; no
 // flash copy is visible in this file -- confirm whether the sentence above is
 // still accurate.
 const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
 #include "tmds_table_fullres.h"
 };
 const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
 #include "tmds_table_fullres.h"
 };
 // Set up one interpolator so that it turns a pair of pixels into a pair of LUT
 // addresses for a single colour channel. The first pixel's lsb sits at
 // pixel_lsb and each pixel is pixel_width bits wide. LANE0 yields the address
 // for the first pixel's channel data, LANE1 (cross input enabled, shifted by a
 // further pixel_width) the address for the second pixel's.
 //
 // The interpolator shift is configured here as a right shift only. When the
 // channel would have to move left to line up with the LUT index, the shift is
 // clamped to zero and the missing left shift is returned so the caller can
 // select the (very slightly slower) *_leftshift encoder loop and apply it in
 // software. A return value of zero means the plain loop can be used.
 static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint pixel_lsb, uint pixel_width, uint lut_index_width, const uint32_t *lutbase) {
 	// LUT entries are 4 bytes each, so the index is scaled by 4
 	const uint index_shift = 2;
 	int right_shift = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
 	int left_shift_needed = 0;
 	if (right_shift < 0) {
 		// Cannot be done in hardware: hand the remainder back to the caller
 		left_shift_needed = -right_shift;
 		right_shift = 0;
 	}
 	const uint index_msb = index_shift + lut_index_width - 1;
 	const uint mask_lsb = index_msb - (channel_msb - channel_lsb);
 	// Lane 0: first pixel of the pair
 	interp_config lane0 = interp_default_config();
 	interp_config_set_shift(&lane0, right_shift);
 	interp_config_set_mask(&lane0, mask_lsb, index_msb);
 	interp_set_config(interp, 0, &lane0);
 	// Lane 1: second pixel of the pair, one pixel width further up
 	interp_config lane1 = interp_default_config();
 	interp_config_set_shift(&lane1, pixel_width + right_shift);
 	interp_config_set_mask(&lane1, mask_lsb, index_msb);
 	interp_config_set_cross_input(&lane1, true);
 	interp_set_config(interp, 1, &lane1);
 	// Both lanes index into the same table
 	interp->base[0] = (uint32_t)lutbase;
 	interp->base[1] = (uint32_t)lutbase;
 	return left_shift_needed;
 }
 // Encode one colour channel of a 16bpp pixel buffer into TMDS symbols. Up to
 // 6 bits (channel_msb down to channel_lsb) are extracted from every pixel.
 // n_pix must be even and pixbuf must be word-aligned. interp0 is saved on
 // entry and restored before returning.
 void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
 	interp_hw_save_t saved_interp0;
 	interp_save(interp0_hw, &saved_interp0);
 	const int left_shift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
 	if (left_shift == 0) {
 		tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
 	} else {
 		// Channel sits too low in the pixel: the loop shifts it up in software
 		tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, left_shift);
 	}
 	interp_restore(interp0_hw, &saved_interp0);
 }
 // Encode one colour channel of an 8bpp pixel buffer into TMDS symbols. n_pix
 // must be a multiple of 4 and pixbuf must be word-aligned. Both interpolators
 // are used; their state is saved on entry and restored before returning.
 void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
 	interp_hw_save_t saved_interp0;
 	interp_hw_save_t saved_interp1;
 	interp_save(interp0_hw, &saved_interp0);
 	interp_save(interp1_hw, &saved_interp1);
 	// With 8bpp, pixel 0 always needs some left shift (whatever the channel),
 	// and that shift destroys some MSBs of pixel 3. For that reason the pixel
 	// data given to interp1 is *not left-shifted*: interp1 is configured for
 	// the upper half-word (pixel_lsb = 16) and must need no left shift.
 	const int left_shift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
 	const int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
 	assert(!lshift_upper); (void)lshift_upper;
 	if (left_shift == 0) {
 		tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
 	} else {
 		tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, left_shift);
 	}
 	interp_restore(interp0_hw, &saved_interp0);
 	interp_restore(interp1_hw, &saved_interp1);
 }
 // ----------------------------------------------------------------------------
 // Full-resolution TMDS encode (barely possible, utterly impractical).
 // The pixel-doubling DC balance trick is not available at full resolution, so
 // running disparity has to be tracked for real: ACCUM0 carries pixel data and
 // ACCUM1 carries the running disparity. INTERP0 handles even pixels and
 // INTERP1 odd pixels, which means even and odd symbols are DC balanced
 // independently of each other -- this is not to spec.
 //
 // Configures one interpolator for fullres address generation. Lane 0 moves
 // the colour channel into the LUT index; lane 1 places the sign bit of the
 // disparity accumulator immediately above that index. The LUT address is
 // formed against base[2]. Returns the left shift the caller must apply in
 // software (0 if none), as the hardware shift is clamped at zero.
 static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
 	// LUT entries are 4 bytes each, so the index is scaled by 4
 	const uint index_shift = 2;
 	int right_shift = channel_msb - (lut_index_width - 1) - index_shift;
 	int left_shift_needed = 0;
 	if (right_shift < 0) {
 		// Cannot be done in hardware: hand the remainder back to the caller
 		left_shift_needed = -right_shift;
 		right_shift = 0;
 	}
 	const uint index_msb = index_shift + lut_index_width - 1;
 	// Lane 0: colour channel -> low bits of the LUT index
 	// (lut_index_width does not count the disparity sign bit)
 	interp_config lane0 = interp_default_config();
 	interp_config_set_shift(&lane0, right_shift);
 	interp_config_set_mask(&lane0, index_msb - (channel_msb - channel_lsb), index_msb);
 	interp_set_config(interp, 0, &lane0);
 	// Lane 1: sign of the disparity (ACCUM1) -> the bit just above the index
 	interp_config lane1 = interp_default_config();
 	interp_config_set_shift(&lane1, 30 - index_msb);
 	interp_config_set_mask(&lane1, index_msb + 1, index_msb + 1);
 	interp_set_config(interp, 1, &lane1);
 	interp->base[2] = (uint32_t)lutbase;
 	return left_shift_needed;
 }
 // Full-resolution encode of one colour channel from a 16bpp pixel buffer.
 // interp0 is configured for the lower pixel of each word and interp1 for the
 // upper one (channel position + 16). Unless TMDS_FULLRES_NO_INTERP_SAVE is
 // set, both interpolators are saved on entry and restored on exit.
 void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
 	const uint core = get_core_num();
 #if !TMDS_FULLRES_NO_INTERP_SAVE
 	interp_hw_save_t saved_interp0;
 	interp_hw_save_t saved_interp1;
 	interp_save(interp0_hw, &saved_interp0);
 	interp_save(interp1_hw, &saved_interp1);
 #endif
 	// The inner loop and the LUT exist once in scratch X and once in scratch
 	// Y. Core 1 takes the X copies and core 0 the Y copies, so the two cores
 	// do not tread on each other's toes too much.
 	const uint32_t *lut = core ? tmds_table_fullres_x : tmds_table_fullres_y;
 	const int lshift_lower = configure_interp_for_addrgen_fullres(interp0_hw, channel_msb, channel_lsb, 6, lut);
 	const int lshift_upper = configure_interp_for_addrgen_fullres(interp1_hw, channel_msb + 16, channel_lsb + 16, 6, lut);
 	// The upper pixel sits 16 bits higher and must never need a left shift
 	assert(!lshift_upper); (void)lshift_upper;
 	if (!lshift_lower) {
 		(core ?
 			tmds_fullres_encode_loop_16bpp_x :
 			tmds_fullres_encode_loop_16bpp_y
 		)(pixbuf, symbuf, n_pix);
 	}
 	else {
 		(core ?
 			tmds_fullres_encode_loop_16bpp_leftshift_x :
 			tmds_fullres_encode_loop_16bpp_leftshift_y
 		)(pixbuf, symbuf, n_pix, lshift_lower);
 	}
 #if !TMDS_FULLRES_NO_INTERP_SAVE
 	interp_restore(interp0_hw, &saved_interp0);
 	interp_restore(interp1_hw, &saved_interp1);
 #endif
 }
 // Disparity (number of 1 bits minus number of 0 bits) of every 4-bit value.
 static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };
 // Disparity of an 8-bit value, summed from its two nibbles. x must be < 256.
 static inline int byte_imbalance(uint32_t x)
 {
 	return imbalance_lookup[x >> 4] + imbalance_lookup[x & 0xF];
 }
 // Combine a 10-bit symbol with its disparity, stored as a 6-bit two's
 // complement field in bits 31:26. The masking and shifting are done on an
 // unsigned value: shifting a signed int holding 32..63 left by 26 would
 // overflow int, which is undefined behaviour.
 static inline uint32_t tmds_pack_symbol(uint32_t sym, int disparity)
 {
 	return sym | (((uint32_t)disparity & 0x3Fu) << 26);
 }
 // Compute the two candidate TMDS symbols for one 8-bit channel value.
 //
 // pixel:                 8-bit colour channel value.
 // negative_balance_sym:  receives the candidate whose disparity is <= 0.
 // positive_balance_sym:  receives the candidate whose disparity is >= 0.
 //
 // Each output word holds the 10-bit symbol in bits 9:0 and that symbol's
 // disparity in bits 31:26. When the data bits are already balanced there is
 // only one candidate; it is written to both outputs with a disparity of 0.
 static void tmds_encode_symbols(uint8_t pixel, uint32_t *negative_balance_sym, uint32_t *positive_balance_sym)
 {
 	const int pixel_imbalance = byte_imbalance(pixel);
 	uint32_t sym = pixel & 1;
 	// Stage 1: transition minimisation. Each bit is derived from the previous
 	// output bit and the next input bit, by XNOR (bit 8 left clear) or by XOR
 	// (bit 8 set).
 	if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {
 		for (int i = 0; i < 7; ++i) {
 			sym |= (~((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
 		}
 	}
 	else {
 		for (int i = 0; i < 7; ++i) {
 			sym |= ( ((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
 		}
 		sym |= 0x100;
 	}
 	// Stage 2: DC balance, driven by the disparity of the 8 data bits
 	const int imbalance = byte_imbalance(sym & 0xFF);
 	if (imbalance == 0) {
 		// Balanced data: bit 8 alone decides whether the symbol is inverted
 		if ((sym & 0x100) == 0) sym ^= 0x2ff;
 		*positive_balance_sym = sym;
 		*negative_balance_sym = sym;
 		return;
 	}
 	// Inverting flips the 8 data bits and sets bit 9
 	const uint32_t inverted = sym ^ 0x2ff;
 	// imbalance_lookup[b] + 2 is the disparity of the 2-bit value b, i.e. of
 	// bits 9:8 of the plain (0b0x) or inverted (0b1x) symbol
 	const int plain_disparity = imbalance + imbalance_lookup[sym >> 8] + 2;
 	const int inverted_disparity = -imbalance + imbalance_lookup[2 ^ (sym >> 8)] + 2;
 	if (imbalance > 0) {
 		*negative_balance_sym = tmds_pack_symbol(inverted, inverted_disparity);
 		*positive_balance_sym = tmds_pack_symbol(sym, plain_disparity);
 	}
 	else {
 		*negative_balance_sym = tmds_pack_symbol(sym, plain_disparity);
 		*positive_balance_sym = tmds_pack_symbol(inverted, inverted_disparity);
 	}
 }
 // Convert a 16-bit (RGB 565) colour palette into palettes of TMDS symbols
 // suitable for performing fullres encode.
 //
 // palette:      n_palette RGB 565 entries.
 // tmds_palette: output buffer, 6 * n_palette words long. It holds three
 //               blocks of 2 * n_palette words in the order blue, green, red.
 //               Within a block, entry i is the negative-balance symbol and
 //               entry i + n_palette the positive-balance symbol of colour i.
 // n_palette:    number of colours; must be a power of 2 <= 256.
 void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette) {
 	uint32_t *tmds_palette_blue = tmds_palette;
 	uint32_t *tmds_palette_green = tmds_palette + 2 * n_palette;
 	uint32_t *tmds_palette_red = tmds_palette + 4 * n_palette;
 	// size_t index: n_palette is unsigned, so a signed index would be compared
 	// across signedness
 	for (size_t i = 0; i < n_palette; ++i) {
 		// Widen each field to 8 bits, left-justified with the low bits zero
 		const uint8_t blue = (palette[i] << 3) & 0xf8;
 		const uint8_t green = (palette[i] >> 3) & 0xfc;
 		const uint8_t red = (palette[i] >> 8) & 0xf8;
 		tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
 		tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
 		tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
 	}
 }
 // Convert a 24-bit (RGB 888) colour palette into palettes of TMDS symbols
 // suitable for performing fullres encode.
 //
 // palette:      n_palette entries with blue in bits 7:0, green in bits 15:8
 //               and red in bits 23:16.
 // tmds_palette: output buffer, 6 * n_palette words long. It holds three
 //               blocks of 2 * n_palette words in the order blue, green, red.
 //               Within a block, entry i is the negative-balance symbol and
 //               entry i + n_palette the positive-balance symbol of colour i.
 // n_palette:    number of colours; must be a power of 2 <= 256.
 void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *tmds_palette, size_t n_palette) {
 	uint32_t *tmds_palette_blue = tmds_palette;
 	uint32_t *tmds_palette_green = tmds_palette + 2 * n_palette;
 	uint32_t *tmds_palette_red = tmds_palette + 4 * n_palette;
 	// size_t index: n_palette is unsigned, so a signed index would be compared
 	// across signedness
 	for (size_t i = 0; i < n_palette; ++i) {
 		const uint8_t blue = palette[i] & 0xff;
 		const uint8_t green = (palette[i] >> 8) & 0xff;
 		const uint8_t red = (palette[i] >> 16) & 0xff;
 		tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
 		tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
 		tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
 	}
 }
 // Encode palette data for all 3 channels.
 // pixbuf is an array of n_pix 8-bit wide pixels containing palette values (32-bit word aligned)
 // tmds_palette is a palette of TMDS symbols produced by tmds_setup_palette_symbols
 // symbuf is 3*n_pix 32-bit words, this function writes the symbol values for each of the channels to it.
 void __not_in_flash_func(tmds_encode_palette_data)(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits) {
 	uint core = get_core_num();
 #if !TMDS_FULLRES_NO_INTERP_SAVE
 	interp_hw_save_t interp0_save, interp1_save;
 	interp_save(interp0_hw, &interp0_save);
 	interp_save(interp1_hw, &interp1_save);
 #endif
 	interp0_hw->base[2] = (uint32_t)tmds_palette;
 	interp1_hw->base[2] = (uint32_t)tmds_palette;
 	// Lane 0 on both interpolators masks the palette bits, starting at bit 2,
 	// The second interpolator also shifts to read the 2nd or 4th byte of the word.
 	interp0_hw->ctrl[0] =
 		(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
 		((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
 	interp1_hw->ctrl[0] =
 		(8 << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
 		(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
 		((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
 	// Lane 1 shifts and masks the sign bit into the right position to add to the symbol
 	// table index to choose the negative disparity symbols if the sign is negative.
 	const uint32_t ctrl_lane_1 =
 		((31 - (palette_bits + 2)) << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
 		(palette_bits + 2) * ((1 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | (1 << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB));
 	interp0_hw->ctrl[1] = ctrl_lane_1;
 	interp1_hw->ctrl[1] = ctrl_lane_1;
 	if (core) {
 		tmds_palette_encode_loop_x(pixbuf, symbuf, n_pix);
 		interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
 		interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
 		tmds_palette_encode_loop_x(pixbuf, symbuf + (n_pix >> 1), n_pix);
 		interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
 		interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
 		tmds_palette_encode_loop_x(pixbuf, symbuf + n_pix, n_pix);
 	} else {
 		tmds_palette_encode_loop_y(pixbuf, symbuf, n_pix);
 		interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
 		interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
 		tmds_palette_encode_loop_y(pixbuf, symbuf + (n_pix >> 1), n_pix);
 		interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
 		interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
 		tmds_palette_encode_loop_y(pixbuf, symbuf + n_pix, n_pix);
 	}
 #if !TMDS_FULLRES_NO_INTERP_SAVE
 	interp_restore(interp0_hw, &interp0_save);
 	interp_restore(interp1_hw, &interp1_save);
 #endif
 }
--- a/src/libdvi/tmds_encode.h
+++ b/src/libdvi/tmds_encode.h
@ -0,0 +1,46 @@
 // Public interface of the TMDS encoders: C entry points from tmds_encode.c and
 // the assembly loops from tmds_encode.S.
 #ifndef _TMDS_ENCODE_H_
 #define _TMDS_ENCODE_H_
 #include "hardware/interp.h"
 #include "dvi_config_defs.h"
 #if defined(__cplusplus)
 extern "C"
 {
 #endif
 // Functions from tmds_encode.c
 // Encode one colour channel (bits channel_msb..channel_lsb of each pixel) of
 // a word-aligned pixel buffer into symbuf. The 16bpp variant needs an even
 // n_pix, the 8bpp variant a multiple of 4.
 void tmds_encode_data_channel_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
 void tmds_encode_data_channel_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
 // Full-resolution variant of the 16bpp channel encode
 void tmds_encode_data_channel_fullres_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
 // Build TMDS symbol palettes from an RGB 565 / RGB 888 colour palette. The
 // output buffer (named symbuf here) must be 6 * n_palette words long and
 // n_palette must be a power of 2 <= 256.
 void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *symbuf, size_t n_palette);
 void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *symbuf, size_t n_palette);
 // Encode 8-bit palette indices for all 3 channels using a palette produced by
 // one of the setup functions above
 void tmds_encode_palette_data(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits);
 // Functions from tmds_encode.S
 void tmds_encode_1bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_encode_2bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 // Uses interp0:
 void tmds_encode_loop_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_encode_loop_16bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
 // Uses interp0 and interp1:
 void tmds_encode_loop_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_encode_loop_8bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
 // Uses interp0 and interp1:
 // (Note a copy is provided in scratch memories X and Y)
 void tmds_fullres_encode_loop_16bpp_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_fullres_encode_loop_16bpp_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_fullres_encode_loop_16bpp_leftshift_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
 void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
 // Paletted 8bpp loops, one per scratch memory. The caller is expected to have
 // configured both interpolators first, as tmds_encode_palette_data does.
 void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/src/libdvi/tmds_encode_1bpp.pio
+++ b/src/libdvi/tmds_encode_1bpp.pio
@ -0,0 +1,46 @@
 .program tmds_encode_1bpp
 ; 1bpp black/white pixels go in, TMDS symbols come out.
 ; Each output word contains two output symbols, each 10 bits in size,
 ; right-justified. The least-significant symbol is displayed first.
 ;
 ; We can encode using the following LUT: (yes this is compliant)
 ;
 ; x % 2 | colour | symbol
 ; ------+--------+-------
 ; 0     | 0      | 0x100
 ; 0     | 1      | 0x200
 ; 1     | 0      | 0x1ff
 ; 1     | 1      | 0x2ff
 ;
 ; OSR: shift to right, autopull, threshold 32
 ; ISR: shift to right, autopush, threshold 24
 ;
 ; Note the ISR needs to be shifted to *right* so that we can get the first
 ; pixel in the less-significant position. Threshold 24 so we can get 8x 0-bits
 ; at the LSBs for free :)
 ;
 ; Bits shifted into the ISR per pixel pair: 1 + 1 (even pixel), then
 ; 8 + 1 + 13 (odd pixel) = 24 in total.
 even_pixel:
    ; Even symbol bits 8 and 9: inverted pixel, then pixel. The eight data
    ; bits below them are the zero LSBs described above.
    out x, 1
    mov y, ~x
    in y, 1
    in x, 1
 odd_pixel:
    ; Odd symbol: eight 1 bits of data first ...
    mov x, ~null
    in x, 8
    ; ... then inverted pixel and pixel as bits 8 and 9. The pixel is shifted
    ; in as part of a 13-bit field whose remaining bits pad the word.
    out x, 1
    mov y, ~x
    in y, 1
    in x, 13     ; Bring total shift to 24, triggering push.
 % c-sdk {
 // Load the tmds_encode_1bpp program into `pio`, configure state machine `sm`
 // with the shift settings the program expects, and start it running.
 static inline void tmds_encode_1bpp_init(PIO pio, uint sm) {
    const uint program_offset = pio_add_program(pio, &tmds_encode_1bpp_program);
    pio_sm_config cfg = tmds_encode_1bpp_program_get_default_config(program_offset);
    // OSR: shift right, autopull, threshold 32
    sm_config_set_out_shift(&cfg, true, true, 32);
    // ISR: shift right, autopush, threshold 24
    sm_config_set_in_shift(&cfg, true, true, 24);
    pio_sm_init(pio, sm, program_offset, &cfg);
    pio_sm_set_enabled(pio, sm, true);
 }
 %}
--- a/src/libdvi/tmds_table.h
+++ b/src/libdvi/tmds_table.h
@ -0,0 +1,76 @@
 // Generated from tmds_table_gen.py
 //
 // This table converts a 6 bit data input into a pair of TMDS data symbols
 // with data content *almost* equal (1 LSB off) to input value left shifted by
 // two. The pairs of symbols have a net DC balance of 0.
 //
 // The two symbols are concatenated in the 20 LSBs of a data word, with the
 // first symbol in least-significant position.
 //
 // Note the declaration isn't included here, just the table body. This is in
 // case you want multiple copies of the table in different SRAMs (particularly
 // scratch X/Y).
 0x7fd00u,
 0x40dfcu,
 0x41df8u,
 0x7ed04u,
 0x43df0u,
 0x7cd0cu,
 0x7dd08u,
 0x42df4u,
 0x47de0u,
 0x78d1cu,
 0x79d18u,
 0x46de4u,
 0x7bd10u,
 0x44decu,
 0x45de8u,
 0xafa41u,
 0x4fdc0u,
 0x70d3cu,
 0x71d38u,
 0x4edc4u,
 0x73d30u,
 0x4cdccu,
 0x4ddc8u,
 0xa7a61u,
 0x77d20u,
 0x48ddcu,
 0x49dd8u,
 0xa3a71u,
 0x4bdd0u,
 0xa1a79u,
 0xa0a7du,
 0x9fa81u,
 0x5fd80u,
 0x60d7cu,
 0x61d78u,
 0x5ed84u,
 0x63d70u,
 0x5cd8cu,
 0x5dd88u,
 0xb7a21u,
 0x67d60u,
 0x58d9cu,
 0x59d98u,
 0xb3a31u,
 0x5bd90u,
 0xb1a39u,
 0xb0a3du,
 0x8fac1u,
 0x6fd40u,
 0x50dbcu,
 0x51db8u,
 0xbba11u,
 0x53db0u,
 0xb9a19u,
 0xb8a1du,
 0x87ae1u,
 0x57da0u,
 0xbda09u,
 0xbca0du,
 0x83af1u,
 0xbea05u,
 0x81af9u,
 0x80afdu,
 0xbfa01u,
--- a/src/libdvi/tmds_table_fullres.h
+++ b/src/libdvi/tmds_table_fullres.h
@ -0,0 +1,139 @@
 // Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
 // (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6
 // MSBs). There is a 16 bit gap in between them, which is actually vital for
 // the way the TMDS encode works!
 //
 // There are 128 1-word entries. The lookup index should be the concatenation
 // of the sign bit of current running disparity, with 6 bits of colour channel
 // data.
 // Non-negative running disparity:
 0xe0000100,
 0xf8000303,
 0x00000307,
 0xe8000104,
 0x000001f0,
 0xf000010c,
 0xe8000108,
 0x0000030b,
 0xf80001e0,
 0xf800011c,
 0xf0000118,
 0x000001e4,
 0xe8000110,
 0x00000313,
 0x000001e8,
 0xf0000241,
 0xf00001c0,
 0x0000013c,
 0xf8000138,
 0xf80001c4,
 0xf0000130,
 0x000001cc,
 0xf80001c8,
 0xf8000261,
 0xe8000120,
 0x00000323,
 0x000001d8,
 0x00000271,
 0xf80001d0,
 0xf0000086,
 0xe8000082,
 0xf0000281,
 0xe8000180,
 0x00000383,
 0x00000178,
 0xf0000184,
 0xf8000170,
 0xf800018c,
 0xf0000188,
 0xf0000221,
 0xf0000160,
 0x0000019c,
 0xf8000198,
 0xf8000231,
 0xf0000190,
 0x00000239,
 0xf00000c2,
 0xf80002c1,
 0xe8000140,
 0x00000343,
 0x000001b8,
 0xf0000211,
 0xf80001b0,
 0xf8000219,
 0x0000021d,
 0x000002e1,
 0xf00001a0,
 0xf0000209,
 0xf800020d,
 0xf000000e,
 0xf0000205,
 0xe8000006,
 0xe0000002,
 0xe8000201,
 // Negative running disparity:
 0x280003ff,
 0x100001fc,
 0x080001f8,
 0x200003fb,
 0x000001f0,
 0x180003f3,
 0x200003f7,
 0x080001f4,
 0x1000031f,
 0x100003e3,
 0x180003e7,
 0x000001e4,
 0x200003ef,
 0x080001ec,
 0x000001e8,
 0x080000be,
 0x1800033f,
 0x0000013c,
 0x100003c7,
 0x1000033b,
 0x180003cf,
 0x000001cc,
 0x10000337,
 0x0000009e,
 0x200003df,
 0x080001dc,
 0x000001d8,
 0x00000271,
 0x1000032f,
 0x08000279,
 0x1000027d,
 0x0800007e,
 0x2000037f,
 0x0800017c,
 0x00000178,
 0x1800037b,
 0x1000038f,
 0x10000373,
 0x18000377,
 0x080000de,
 0x1800039f,
 0x0000019c,
 0x10000367,
 0x000000ce,
 0x1800036f,
 0x00000239,
 0x0800023d,
 0x0000003e,
 0x200003bf,
 0x080001bc,
 0x000001b8,
 0x080000ee,
 0x1000034f,
 0x000000e6,
 0x0000021d,
 0x000002e1,
 0x1800035f,
 0x080000f6,
 0x000000f2,
 0x080002f1,
 0x080000fa,
 0x100002f9,
 0x180002fd,
 0x100000fe,
--- a/src/libdvi/tmds_table_gen.py
+++ b/src/libdvi/tmds_table_gen.py
@ -0,0 +1,150 @@
 #!/usr/bin/env python3
 # The key fact is that, if x is even, and the encoder currently has a running
 # imbalance of 0, encoding x followed by x + 1 produces a symbol pair with a
 # net balance of 0.
 #
 # This is a reasonable constraint, because we only want RGB565 (so 6 valid
 # channel data bits -> data is multiple of 4), and can probably tolerate
 # 0.25LSB of noise :)
 #
 # This means that encoding a half-horizontal-resolution scanline buffer is a
 # simple LUT operation for each colour channel, because we have made the
 # encoding process stateless by guaranteeing 0 balance.
 def popcount(x):
 	"""Return the number of 1 bits in the integer x.
 
 	For a negative x the bits of its magnitude are counted, so the call
 	always terminates.
 	"""
 	return bin(x).count("1")
 # Equivalent to N1(q) - N0(q) in the DVI spec
 def byteimbalance(x):
 	"""Return (number of 1 bits) - (number of 0 bits) of the 8-bit value x."""
 	return 2 * popcount(x) - 8
 # Stateful TMDS encoder: a direct translation of "Figure 3-5. T.M.D.S. Encode
 # Algorithm" on page 29 of the DVI 1.0 spec. The running DC imbalance is kept
 # in self.imbalance between calls.
 class TMDSEncode:
 	# Control symbols, indexed by the 2-bit control value
 	ctrl_syms = {
 		0b00: 0b1101010100,
 		0b01: 0b0010101011,
 		0b10: 0b0101010100,
 		0b11: 0b1010101011
 	}
 	def __init__(self):
 		self.imbalance = 0
 	def encode(self, d, c, de):
 		"""Encode one character and return the 10-bit symbol.
 
 		d is the 8-bit data value, c the 2-bit control value and de the data
 		enable flag. With de false the control symbol for c is returned and
 		the running imbalance is reset to 0.
 		"""
 		if not de:
 			self.imbalance = 0
 			return self.ctrl_syms[c]
 		# Stage 1: minimise transitions. XNOR when d has more than four 1 bits
 		# (or exactly four and bit 0 clear), XOR otherwise.
 		ones = popcount(d)
 		use_xnor = ones > 4 or (ones == 4 and not (d & 0x1))
 		q_m = d & 0x1
 		for bit in range(1, 8):
 			nxt = ((q_m >> (bit - 1)) ^ (d >> bit)) & 0x1
 			if use_xnor:
 				nxt ^= 0x1
 			q_m |= nxt << bit
 		if not use_xnor:
 			# Bit 8 records that XOR was used
 			q_m |= 0x100
 		# Stage 2: correct DC balance
 		data_imbalance = byteimbalance(q_m & 0xff)
 		xor_flag = q_m & 0x100
 		if self.imbalance == 0 or data_imbalance == 0:
 			# Nothing to correct: bit 8 alone decides on inversion
 			if xor_flag:
 				q_out = q_m
 				self.imbalance += data_imbalance
 			else:
 				q_out = q_m ^ 0x2ff
 				self.imbalance -= data_imbalance
 		elif (self.imbalance > 0) == (data_imbalance > 0):
 			# Symbol would push the imbalance further the same way: invert it
 			q_out = q_m ^ 0x2ff
 			self.imbalance += (xor_flag >> 7) - data_imbalance
 		else:
 			q_out = q_m
 			self.imbalance += data_imbalance - ((~q_m & 0x100) >> 7)
 		return q_out
 # Expand the n-bit bitmap x into n pseudo-differential bit pairs, most
 # significant bit first: a 1 bit becomes 0b01 and a 0 bit becomes 0b10.
 def differentialise(x, n):
 	pairs = 0
 	for pos in reversed(range(n)):
 		pair = 0b01 if (x >> pos) & 1 else 0b10
 		pairs = (pairs << 2) | pair
 	return pairs
 # One encoder instance is shared by every generator below; its running
 # imbalance carries over from one encode() call to the next.
 enc = TMDSEncode()
 # The commented-out sections below are alternative table generators. Only the
 # 2bpp generator at the end of the file is active.
 ###
 # Pixel-doubled table:
 # for i in range(0, 256, 4):
 # 	sym0 = enc.encode(i, 0, 1)
 # 	sym1 = enc.encode(i ^ 1, 0, 1)
 # 	assert(enc.imbalance == 0)
 # 	print(f"0x{sym0 | (sym1 << 10):05x}u,")
 ###
 # Fullres 1bpp table: (each entry is 2 words, 4 pixels)
 # Note the trick here is that encoding 0x00 or 0xff sets imbalance to -8, and
 # encoding 0x01 or 0xfe returns imbalance to 0, so we alternate between these
 # two pairs of dark/light colours. Creates some fairly subtle vertical
 # banding, but it's cheap.
 # for i in range(1 << 4):
 # 	syms = list(enc.encode((0xff if i & 1 << j else 0) ^ j & 0x01, 0, 1) for j in range(4))
 # 	print(f"0x{syms[0] | syms[1] << 10:05x}, 0x{syms[2] | syms[3] << 10:05x}")
 # 	assert(enc.imbalance == 0)
 ###
 # Fullres table stuff:
 # def disptable_format(sym):
 # 	return sym | ((popcount(sym) * 2 - 10 & 0x3f) << 26)
 # print("// Non-negative running disparity:")
 # for i in range(0, 256, 4):
 # 	enc.imbalance = 1
 # 	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
 # print("// Negative running disparity:")
 # for i in range(0, 256, 4):
 # 	enc.imbalance = -1
 # 	print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
 ###
 # Control symbols:
 # for i in range(4):
 # 	sym = enc.encode(0, i, 0)
 # 	print(f"0x{sym << 10 | sym:05x},")
 ###
 # Find zero-balance symbols:
 # for i in range(256):
 # 	enc.imbalance = 0
 # 	sym = enc.encode(i, 0, 1)
 # 	if enc.imbalance == 0:
 # 		print(f"{i:02x}: {sym:03x}")
 ###
 # Generate 2bpp table based on above experiment:
 # Each entry encodes an even-position level followed by an odd-position level
 # and asserts that the pair leaves the running imbalance at 0. The output is
 # one ".word" line per (even, odd) combination, with the even symbol in the
 # low 10 bits and the odd symbol shifted up by 10.
 levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]
 levels_2bpp_odd  = [0x04, 0x51, 0xae, 0xfb]
 for i1, p1 in enumerate(levels_2bpp_odd):
 	for i0, p0 in enumerate(levels_2bpp_even):
 		sym0 = enc.encode(p0, 0, 1)
 		sym1 = enc.encode(p1, 0, 1)
 		assert(enc.imbalance == 0)
 		print(f".word 0x{sym1 << 10 | sym0:05x} // {i0:02b}, {i1:02b}")
--- a/src/libdvi/util_queue_u32_inline.h
+++ b/src/libdvi/util_queue_u32_inline.h
@ -0,0 +1,83 @@
 #ifndef _UTIL_QUEUE_U32_INLINE_H
 #define _UTIL_QUEUE_U32_INLINE_H
 // Faster versions of the functions found in pico/util/queue.h, for the common
 // case of 32-bit-sized elements. Can be used on the same queue data
 // structure, and mixed freely with the generic access methods, as long as
 // element_size == 4.
 #include "pico/util/queue.h"
 #include "hardware/sync.h"
 // Advance a ring index by one slot, wrapping back to 0 once it passes
 // element_count (the ring holds element_count + 1 slots, hence the >).
 static inline uint16_t _queue_inc_index_u32(queue_t *q, uint16_t index) {
    const uint16_t next = (uint16_t)(index + 1);
    return (next > q->element_count) ? 0 : next;
 }
 // Non-blocking add of the 32-bit value at *data. Returns true if the value
 // was stored, false if the queue was already full. The queue is modified
 // while holding its spin lock; after a successful add, __sev() is issued
 // once the lock has been released.
 static inline bool queue_try_add_u32(queue_t *q, void *data) {
    uint32_t lock_state = spin_lock_blocking(q->core.spin_lock);
    const bool has_space = queue_get_level_unsafe(q) != q->element_count;
    if (has_space) {
        uint32_t *slots = (uint32_t*)q->data;
        slots[q->wptr] = *(uint32_t*)data;
        q->wptr = _queue_inc_index_u32(q, q->wptr);
    }
    spin_unlock(q->core.spin_lock, lock_state);
    if (has_space) {
        __sev();
    }
    return has_space;
 }
 // Non-blocking removal of the oldest 32-bit value into *data. Returns true
 // if a value was removed, false if the queue was empty. The queue is
 // modified while holding its spin lock; after a successful removal, __sev()
 // is issued once the lock has been released.
 static inline bool queue_try_remove_u32(queue_t *q, void *data) {
    uint32_t lock_state = spin_lock_blocking(q->core.spin_lock);
    const bool has_item = queue_get_level_unsafe(q) != 0;
    if (has_item) {
        uint32_t *slots = (uint32_t*)q->data;
        *(uint32_t*)data = slots[q->rptr];
        q->rptr = _queue_inc_index_u32(q, q->rptr);
    }
    spin_unlock(q->core.spin_lock, lock_state);
    if (has_item) {
        __sev();
    }
    return has_item;
 }
 // Non-blocking read of the oldest 32-bit value into *data without removing
 // it. Returns true if a value was available, false if the queue was empty.
 // The queue is left unchanged, so no event is signalled.
 static inline bool queue_try_peek_u32(queue_t *q, void *data) {
    uint32_t lock_state = spin_lock_blocking(q->core.spin_lock);
    const bool has_item = queue_get_level_unsafe(q) != 0;
    if (has_item) {
        uint32_t *slots = (uint32_t*)q->data;
        *(uint32_t*)data = slots[q->rptr];
    }
    spin_unlock(q->core.spin_lock, lock_state);
    return has_item;
 }
 // Add the 32-bit value at *data, waiting in __wfe() between attempts for as
 // long as the queue is full.
 static inline void queue_add_blocking_u32(queue_t *q, void *data) {
    while (!queue_try_add_u32(q, data)) {
        __wfe();
    }
 }
 // Remove the oldest 32-bit value into *data, waiting in __wfe() between
 // attempts for as long as the queue is empty.
 static inline void queue_remove_blocking_u32(queue_t *q, void *data) {
    while (!queue_try_remove_u32(q, data)) {
        __wfe();
    }
 }
 // Copy the oldest 32-bit value into *data without removing it, waiting in
 // __wfe() between attempts for as long as the queue is empty.
 static inline void queue_peek_blocking_u32(queue_t *q, void *data) {
    while (!queue_try_peek_u32(q, data)) {
        __wfe();
    }
 }
 #endif